PyPI - junshan-kit - Versions diffs - 2.1.6__py2.py3-none-any.whl → 2.1.8__py2.py3-none-any.whl - Mend

junshan-kit 2.1.6__py2.py3-none-any.whl → 2.1.8__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

junshan_kit/DataProcessor.py +62 -2
junshan_kit/datahub.py +89 -13
junshan_kit/test.py +2 -4
junshan_kit-2.1.8.dist-info/METADATA +11 -0
junshan_kit-2.1.8.dist-info/RECORD +7 -0
junshan_kit-2.1.6.dist-info/METADATA +0 -31
junshan_kit-2.1.6.dist-info/RECORD +0 -7
{junshan_kit-2.1.6.dist-info → junshan_kit-2.1.8.dist-info}/WHEEL +0 -0

junshan_kit/DataProcessor.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import pandas as pd
 import os
+from sklearn.preprocessing import StandardScaler
 import junshan_kit.datahub
 class CSVToPandas:
@@ -16,6 +16,7 @@ class CSVToPandas:
     # ----------------- ccfd_kaggle ----------------------------------
     def ccfd_kaggle(self, data_name = 'ccfd-kaggle', show_info = True):
+        # download data if not exist
         self.read_csv(data_name)
         df = pd.read_csv(self.csv_path)
@@ -33,13 +34,72 @@ class CSVToPandas:
             print('='*60)
             print(f"{'Original size:':<25} {m_before} rows x {n_before} cols")
             print(f"{'Size after dropping NaNs:':<25} {m_after} rows x {n_after} cols")
-            print(f"{'Export size:':<25} {m_after} rows x {n_after} cols")
             print(f"{'Positive samples (+1):':<25} {pos_count}")
             print(f"{'Negative samples (-1):':<25} {neg_count}")
+            print(f"{'Export size:':<25} {m_after} rows x {n_after} cols")
             print('-'*60)
             print(f"More details: https://www.jianguoyun.com/p/Dd1clVgQ4ZThCxiwzZQGIAA")
             print('='*60 + '\n')
         return df
+    # ------------------------
+    def ghpdd_kaggle(self, data_name='ghpdd-kaggle', show_info=True):
+        # download data if not exist
+        self.read_csv(data_name)
+        # read csv
+        df = pd.read_csv(self.csv_path)
+        m_before, n_before = df.shape
+        # drop NaNs
+        df = df.dropna(axis=0, how='any')
+        m_after, n_after = df.shape
+        # drop unique identifier
+        if 'property_id' in df.columns:
+            df.drop(columns=['property_id'], inplace=True)
+        # Replace label 0 with -1
+        df['decision'] = df['decision'].replace(0, -1)
+        # Identify categorical and numerical columns
+        cat_cols = ['country', 'city', 'property_type', 'furnishing_status']
+        num_cols = [col for col in df.columns if col not in cat_cols + ['decision']]
+        # One-Hot encode categorical columns
+        df = pd.get_dummies(df, columns=cat_cols)
+        # Convert boolean columns to int
+        bool_cols = df.select_dtypes(include='bool').columns
+        for col in bool_cols:
+            df[col] = df[col].astype(int)
+        # Standardize numerical columns
+        scaler = StandardScaler()
+        df[num_cols] = scaler.fit_transform(df[num_cols])
+        # 导出后的大小
+        m_export, n_export = df.shape
+        if show_info:
+            pos_count = (df['decision'] == 1).sum()
+            neg_count = (df['decision'] == -1).sum()
+            print('\n' + '='*70)
+            print(f"{'GHPDD-Kaggle Dataset Info':^70}")
+            print('='*70)
+            print(f"{'Original size:':<35} {m_before} rows x {n_before} cols")
+            print(f"{'Size after dropping NaNs:':<35} {m_after} rows x {n_after} cols")
+            print(f"{'Positive samples (+1):':<35} {pos_count}")
+            print(f"{'Negative samples (-1):':<35} {neg_count}")
+            print(f"{'Export size (after encoding & scaling):':<35} {m_export} rows x {n_export} cols")
+            print('-'*70)
+            print(f"{'Note: categorical columns one-hot encoded, numerical standardized.'}")
+            print(f"More details: https://www.jianguoyun.com/p/DU6Lr9oQqdHDDRj5sI0GIAA")
+            print('='*70 + '\n')
+        return df

junshan_kit/datahub.py CHANGED Viewed

@@ -1,8 +1,13 @@
 import kagglehub
-import os
+import os, time
 import warnings
 import shutil
 from kaggle.api.kaggle_api_extended import KaggleApi
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 class kaggle_data:
     def list_datasets(self):
@@ -42,20 +47,91 @@ class kaggle_data:
     # example: read_data(copy_path='./exp_data')
+class JianguoDownloader:
+    def __init__(self, url: str, download_path: str = "./downloads"):
+        self.url = url
+        self.download_path = os.path.abspath(download_path)
+        os.makedirs(self.download_path, exist_ok=True)
+        # Configure Chrome options
+        self.chrome_options = Options()
+        prefs = {
+            "download.default_directory": self.download_path,
+            "download.prompt_for_download": False,
+            "download.directory_upgrade": True,
+            "safebrowsing.enabled": True,
+            "profile.default_content_setting_values.automatic_downloads": 1,
+        }
+        self.chrome_options.add_experimental_option("prefs", prefs)
+        # Optional stability flags
+        self.chrome_options.add_argument("--disable-gpu")
+        self.chrome_options.add_argument("--no-sandbox")
+        self.chrome_options.add_argument("--disable-dev-shm-usage")
+        self.chrome_options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
-if __name__ == "__main__":
-    # Your code here
-    data = kaggle_data()
-    # Example usage
-    data.list_user_datasets()
-    data.download_data(data_name='letter-libsvm', copy_path='./exp_data/Letter')
+        # Start Chrome
+        self.driver = webdriver.Chrome(options=self.chrome_options)
-    """
-    import junshan_kit.datahub
-    data = junshan_kit.datahub.kaggle_data()
-    data.list_user_datasets()
-    data.read_data(data_name='letter-libsvm', copy_path='./exp_data/Letter')
-    """
+    def open_page(self):
+        """Open the Jianguoyun share page."""
+        print(f"🌐 Opening link: {self.url}")
+        self.driver.get(self.url)
+    def click_download_button(self):
+        """Find and click the download button."""
+        print("🔍 Looking for the download button...")
+        wait = WebDriverWait(self.driver, 30)
+        span = wait.until(
+            EC.presence_of_element_located((By.XPATH, "//span[contains(text(),'下载')]"))
+        )
+        parent = span.find_element(By.XPATH, "./..")
+        self.driver.execute_script("arguments[0].click();", parent)
+        print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")
+        # If Jianguoyun opens a new tab, switch to it
+        time.sleep(3)
+        if len(self.driver.window_handles) > 1:
+            self.driver.switch_to.window(self.driver.window_handles[-1])
+            print("📂 Switched to download tab.")
+    def wait_for_downloads(self, timeout=300):
+        """Wait until all downloads are finished."""
+        print("⏳ Waiting for downloads to finish...")
+        start_time = time.time()
+        while True:
+            downloading = [f for f in os.listdir(self.download_path) if f.endswith(".crdownload")]
+            if not downloading:
+                print("✅ Download completed!")
+                return True
+            if time.time() - start_time > timeout:
+                print("⏰ Timeout: downloads may not have finished.")
+                return False
+            time.sleep(2)
+    def get_latest_file(self):
+        """Return the most recently downloaded file (if any)."""
+        files = [os.path.join(self.download_path, f) for f in os.listdir(self.download_path)]
+        return max(files, key=os.path.getctime) if files else None
+    def close(self):
+        """Close the browser."""
+        self.driver.quit()
+        print("🚪 Browser closed.")
+    def run(self):
+        """Run the complete download process."""
+        print('*'*50)
+        try:
+            self.open_page()
+            self.click_download_button()
+            self.wait_for_downloads()
+            latest = self.get_latest_file()
+            if latest:
+                print(f"📄 Latest downloaded file: {latest}")
+        except Exception as e:
+            print("❌ Error occurred:", e)
+        finally:
+            self.close()
+        print('*'*50)

junshan_kit/test.py CHANGED Viewed

@@ -1,5 +1,3 @@
-import DataProcessor
+from datahub import JianguoDownloader
-data_loader = DataProcessor.CSVToPandas()
-data_loader.ccfd_kaggle()
+data2 = JianguoDownloader('www.lka.com', './expspe')

junshan_kit-2.1.8.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,11 @@
+Metadata-Version: 2.4
+Name: junshan_kit
+Version: 2.1.8
+Summary: This is an optimization tool.
+Author-email: Junshan Yin <junshanyin@163.com>
+Requires-Dist: kaggle==1.7.4.5
+Requires-Dist: kagglehub==0.3.13
+Requires-Dist: numpy==2.2.6
+Requires-Dist: pandas==2.3.3
+Requires-Dist: scikit-learn==1.7.1
+Requires-Dist: selenium==4.36.0

junshan_kit-2.1.8.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,7 @@
+junshan_kit/DataProcessor.py,sha256=AW_1jROexC3s41-RgzqzYVwPI0sOf3tzjiph4qa_Vcw,3882
+junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+junshan_kit/datahub.py,sha256=mofbkp8ry6_LM_vW1LcZolp5tfkqOp_cUiwjfDFbRqI,5153
+junshan_kit/test.py,sha256=uSckjcr_Wgj__YPTwD6x0GY8Hfn5GBEXIpRf9vIYBbU,91
+junshan_kit-2.1.8.dist-info/METADATA,sha256=eFQmrVEUORZRhZqBCOlctfSU3vwCQ2RB4Jpyj1coAmE,329
+junshan_kit-2.1.8.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
+junshan_kit-2.1.8.dist-info/RECORD,,

junshan_kit-2.1.6.dist-info/METADATA DELETED Viewed

@@ -1,31 +0,0 @@
-Metadata-Version: 2.4
-Name: junshan_kit
-Version: 2.1.6
-Summary: This is an optimization tool.
-Author-email: Junshan Yin <junshanyin@163.com>
-Requires-Dist: kaggle==1.7.4.5
-Requires-Dist: kagglehub==0.3.13
-Requires-Dist: numpy==2.2.6
-Requires-Dist: pandas==2.3.3
-Requires-Dist: scikit-learn==1.7.1
-Description-Content-Type: text/markdown
-- For class kaggle_data in datahub
-  - We need to set API of kaggle.
-```python
-import junshan_kit.datahub
-data = junshan_kit.datahub.kaggle_data()
-data.list_user_datasets()
-data.read_data(data_name='letter-libsvm', copy_path='./exp_data/Letter')
-```

junshan_kit-2.1.6.dist-info/RECORD DELETED Viewed

@@ -1,7 +0,0 @@
-junshan_kit/DataProcessor.py,sha256=9mlLYxdDiMX7baZmfJk5QuxT4vx_V728XIFbkXmCP0s,1594
-junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-junshan_kit/datahub.py,sha256=BWcG_TPW1xf_y_GzxRXanuOAB01WugBiO5r53EDbr8s,1815
-junshan_kit/test.py,sha256=aEaobINtr4Ri0jX6D8u49xgftyA6SE12wx0P6m5x-2w,90
-junshan_kit-2.1.6.dist-info/METADATA,sha256=KZTS690qvlgOduiYwo6oqshsk4dqY8m9HcVtWu-aXTI,599
-junshan_kit-2.1.6.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
-junshan_kit-2.1.6.dist-info/RECORD,,

{junshan_kit-2.1.6.dist-info → junshan_kit-2.1.8.dist-info}/WHEEL RENAMED Viewed

File without changes

junshan-kit 2.1.6__py2.py3-none-any.whl → 2.1.8__py2.py3-none-any.whl

junshan-kit 2.1.6__py2.py3-none-any.whl → 2.1.8__py2.py3-none-any.whl