PyPI - junshan-kit - Versions diffs - 2.2.3__py2.py3-none-any.whl → 2.5.1__py2.py3-none-any.whl - Mend

junshan-kit 2.2.3py2.py3-none-any.whl → 2.5.1py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

junshan_kit/DataHub.py +114 -0
junshan_kit/DataProcessor.py +280 -138
junshan_kit/DataSets.py +377 -0
junshan_kit/Evaluate_Metrics.py +40 -0
junshan_kit/ModelsHub.py +212 -0
junshan_kit/ParametersHub.py +419 -0
junshan_kit/Print_Info.py +63 -0
junshan_kit/TrainingHub.py +174 -0
junshan_kit/kit.py +279 -2
{junshan_kit-2.2.3.dist-info → junshan_kit-2.5.1.dist-info}/METADATA +2 -4
junshan_kit-2.5.1.dist-info/RECORD +13 -0
junshan_kit/datahub.py +0 -146
junshan_kit/meta.py +0 -256
junshan_kit/test.py +0 -8
junshan_kit-2.2.3.dist-info/RECORD +0 -9
{junshan_kit-2.2.3.dist-info → junshan_kit-2.5.1.dist-info}/WHEEL +0 -0

junshan_kit/kit.py CHANGED Viewed

@@ -6,7 +6,12 @@
 """
 import zipfile
-import os
+import os, time, openml
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 def unzip_file(zip_path: str, unzip_folder: str):
     """
@@ -30,6 +35,278 @@ def unzip_file(zip_path: str, unzip_folder: str):
     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
         zip_ref.extractall(unzip_folder)
-    print(f"✅ Extracted '{zip_path}' to '{os.path.abspath(unzip_folder)}'")
+    print(f"- Extracted '{zip_path}' to '{os.path.abspath(unzip_folder)}'")
+# =============================================================
+#                   JIANGUOYUN (NUTSTORE) CHROME VERSION
+# =============================================================
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+class JianguoyunDownloaderChrome:
+    """Download a shared file from Jianguoyun (Nutstore) through Chrome.
+
+    The share page is opened in a Selenium-driven Chrome instance, the
+    page's download button is clicked, and the download directory is
+    polled until Chrome's temporary ``.crdownload`` files are gone.
+
+    Example:
+    >>> url = "https://www.jianguoyun.com/p/DSQqUq8QqdHDDRiy6I0GIAA"
+    >>> downloader = JianguoyunDownloaderChrome(url)
+    >>> downloader.run()
+    """
+    def __init__(self, url, download_path="./exp_data"):
+        """Create the download directory and start a Chrome session.
+
+        Args:
+            url: Address of the share page to open.
+            download_path: Directory Chrome saves files into; it is made
+                absolute and created if it does not exist.
+        """
+        self.url = url
+        self.download_path = os.path.abspath(download_path)
+        os.makedirs(self.download_path, exist_ok=True)
+        self.chrome_options = ChromeOptions()
+        # Save straight into download_path without showing a "Save as" dialog.
+        prefs = {
+            "download.default_directory": self.download_path,
+            "download.prompt_for_download": False,
+            "download.directory_upgrade": True,
+            "safebrowsing.enabled": True,
+            "profile.default_content_setting_values.automatic_downloads": 1,
+        }
+        self.chrome_options.add_experimental_option("prefs", prefs)
+        self.chrome_options.add_argument("--disable-gpu")
+        self.chrome_options.add_argument("--no-sandbox")
+        self.chrome_options.add_argument("--disable-dev-shm-usage")
+        self.chrome_options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
+        # Uncomment for headless mode:
+        # self.chrome_options.add_argument("--headless")
+        self.driver = webdriver.Chrome(options=self.chrome_options)
+    def open_page(self):
+        """Load ``self.url`` in the browser and print the page title."""
+        print(f">>> Opening page: {self.url}")
+        self.driver.get(self.url)
+        print(f">>> Page loaded: {self.driver.title}")
+    def click_download_button(self):
+        """Find and click the 'Download' button (supports English and Chinese).
+
+        Waits up to 30 seconds for a clickable element. Any failure is
+        printed and then re-raised to the caller.
+        """
+        print(">>> Searching for the download button...")
+        wait = WebDriverWait(self.driver, 30)
+        try:
+            # Match both English 'Download' (case-insensitive) and Chinese '下载'
+            xpath = (
+                "//span[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
+                " | //button[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
+                " | //a[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
+                " | //span[contains(text(),'下载')]"
+                " | //button[contains(text(),'下载')]"
+                " | //a[contains(text(),'下载')]"
+            )
+            button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
+            # Remember what was in the directory before the click so that
+            # wait_for_downloads() can tell when a new file has shown up.
+            self._files_before = set(os.listdir(self.download_path))
+            # Click using JavaScript to avoid overlay or interaction issues
+            self.driver.execute_script("arguments[0].click();", button)
+            print(f">>> Download button clicked. Files will be saved to: {self.download_path}")
+            # If the cloud service opens a new tab, switch to it
+            time.sleep(3)
+            if len(self.driver.window_handles) > 1:
+                self.driver.switch_to.window(self.driver.window_handles[-1])
+                print(">>> Switched to the new download tab.")
+        except Exception as e:
+            print(">>> Failed to find or click the download button:", e)
+            raise
+    def wait_for_downloads(self, timeout=3600):
+        """Poll the download directory until no ``.crdownload`` file remains.
+
+        When click_download_button() has recorded a directory snapshot, the
+        download only counts as finished once a new entry has appeared;
+        otherwise an empty poll taken before Chrome created its temporary
+        file would be reported as a completed download.
+
+        Args:
+            timeout: Maximum number of seconds to wait.
+
+        Returns:
+            True when the download finished, False when ``timeout``
+            seconds elapsed first.
+        """
+        print(">>> Waiting for downloads to finish...")
+        # Absent when this method is called without a preceding click.
+        files_before = getattr(self, "_files_before", None)
+        start_time = time.time()
+        while time.time() - start_time < timeout:
+            entries = os.listdir(self.download_path)
+            downloading = [f for f in entries if f.endswith(".crdownload")]
+            started = files_before is None or not files_before.issuperset(entries)
+            if started and not downloading:
+                print(">>> Download completed!")
+                return True
+            time.sleep(2)
+        print(f">>> Timeout: download not completed within {timeout} seconds")
+        return False
+    def close(self):
+        """Quit the browser session."""
+        self.driver.quit()
+        print(">>> Browser closed.")
+    def run(self):
+        """Open the page, click download, wait for the file, then close.
+
+        Errors from any step are printed rather than raised; the browser
+        is closed in every case.
+        """
+        print('*' * 60)
+        try:
+            self.open_page()
+            self.click_download_button()
+            self.wait_for_downloads()
+        except Exception as e:
+            print(">>> Error:", e)
+        finally:
+            self.close()
+        print('*' * 60)
+# =============================================================
+#                   JIANGUOYUN (NUTSTORE) FIREFOX VERSION
+# =============================================================
+from selenium.webdriver.firefox.options import Options as FirefoxOptions
+from selenium.webdriver.firefox.service import Service
+class JianguoyunDownloaderFirefox:
+    """Download a shared file from Jianguoyun (Nutstore) through headless Firefox.
+
+    The share page is opened in a Selenium-driven Firefox instance, the
+    page's download button is clicked, and the download directory is
+    polled until the browser's temporary files are gone.
+
+    Example:
+    >>> url = "https://www.jianguoyun.com/p/DSQqUq8QqdHDDRiy6I0GIAA"
+    >>> downloader = JianguoyunDownloaderFirefox(url)
+    >>> downloader.run()
+    """
+    def __init__(self, url, download_path="./exp_data", geckodriver_path=None):
+        """Create the download directory and start a headless Firefox session.
+
+        Args:
+            url: Address of the share page to open.
+            download_path: Directory Firefox saves files into; it is made
+                absolute and created if it does not exist.
+            geckodriver_path: Path to the geckodriver executable. When
+                omitted, ``/snap/bin/geckodriver`` is used if it exists;
+                otherwise no path is given and Selenium is left to locate
+                the driver itself.
+        """
+        self.url = url
+        self.download_path = os.path.abspath(download_path)
+        os.makedirs(self.download_path, exist_ok=True)
+        options = FirefoxOptions()
+        options.add_argument("--headless")
+        # folderList=2 makes Firefox honour browser.download.dir.
+        options.set_preference("browser.download.folderList", 2)
+        options.set_preference("browser.download.manager.showWhenStarting", False)
+        options.set_preference("browser.download.dir", self.download_path)
+        options.set_preference("browser.helperApps.neverAsk.saveToDisk",
+                            "application/zip,application/octet-stream,application/x-zip-compressed,multipart/x-zip")
+        options.set_preference("pdfjs.disabled", True)
+        # Keep the historical snap location as the default where it exists.
+        if geckodriver_path is None and os.path.exists("/snap/bin/geckodriver"):
+            geckodriver_path = "/snap/bin/geckodriver"
+        service = Service(executable_path=geckodriver_path)
+        self.driver = webdriver.Firefox(service=service, options=options)
+    def open_page(self):
+        """Load ``self.url`` in the browser and print the page title."""
+        print(f">>> Opening page: {self.url}")
+        self.driver.get(self.url)
+        print(f">>> Page loaded: {self.driver.title}")
+    def click_download_button(self):
+        """Find and click the 'Download' button (supports English and Chinese).
+
+        Waits up to 30 seconds for a clickable element. Any failure is
+        printed and then re-raised to the caller.
+        """
+        print(">>> Searching for the download button...")
+        wait = WebDriverWait(self.driver, 30)
+        try:
+            # Match both English 'Download' (case-insensitive) and Chinese '下载'
+            xpath = (
+                "//span[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
+                " | //button[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
+                " | //a[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
+                " | //span[contains(text(),'下载')]"
+                " | //button[contains(text(),'下载')]"
+                " | //a[contains(text(),'下载')]"
+            )
+            button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
+            # Remember what was in the directory before the click so that
+            # wait_for_download() can tell when a new file has shown up.
+            self._files_before = set(os.listdir(self.download_path))
+            # Click using JavaScript to avoid overlay or interaction issues
+            self.driver.execute_script("arguments[0].click();", button)
+            print(f">>> Download button clicked. Files will be saved to: {self.download_path}")
+            # If the cloud service opens a new tab, switch to it
+            time.sleep(3)
+            if len(self.driver.window_handles) > 1:
+                self.driver.switch_to.window(self.driver.window_handles[-1])
+                print(">>> Switched to the new download tab.")
+        except Exception as e:
+            print(">>> Failed to find or click the download button:", e)
+            raise
+    def wait_for_download(self, timeout=3600):
+        """Wait until all downloads are finished (auto-detects browser type).
+
+        When click_download_button() has recorded a directory snapshot, the
+        download only counts as finished once a new entry has appeared;
+        otherwise an empty poll taken before the browser created its
+        temporary file would be reported as a completed download.
+
+        Args:
+            timeout: Maximum number of seconds to wait.
+
+        Returns:
+            True when the download finished, False when ``timeout``
+            seconds elapsed first.
+        """
+        print(">>> Waiting for downloads to finish...")
+        # Absent when this method is called without a preceding click.
+        files_before = getattr(self, "_files_before", None)
+        start_time = time.time()
+        # Determine the temporary file extension based on the browser type
+        temp_ext = ".crdownload" if "chrome" in self.driver.capabilities["browserName"].lower() else ".part"
+        while time.time() - start_time < timeout:
+            entries = os.listdir(self.download_path)
+            downloading = [f for f in entries if f.endswith(temp_ext)]
+            started = files_before is None or not files_before.issuperset(entries)
+            if started and not downloading:
+                print(">>> Download completed!")
+                return True
+            time.sleep(2)
+        # Previously the loop simply ended here, returning None without a message.
+        print(f">>> Timeout: download not completed within {timeout} seconds")
+        return False
+    def close(self):
+        """Quit the browser session."""
+        print(">>> Closing browser...")
+        self.driver.quit()
+    def run(self):
+        """Open the page, click download, wait for the file, then close.
+
+        Errors from any step are printed rather than raised; the browser
+        is closed in every case.
+        """
+        print('*' * 60)
+        try:
+            self.open_page()
+            self.click_download_button()
+            self.wait_for_download(timeout=3600)
+        except Exception as e:
+            print(">>> Error:", e)
+        finally:
+            self.close()
+        print('*' * 60)
+def download_openml_data(data_name):
+    """Fetch a dataset from OpenML and return its contents as dataframes.
+
+    The OpenML cache root is pointed at ``./exp_data/<data_name>`` before
+    the dataset is requested.
+
+    Parameters
+    ----------
+    data_name
+        Dataset identifier; it is interpolated into the cache path and
+        handed to ``openml.datasets.get_dataset`` as a string.
+
+    Returns
+    -------
+    X : ndarray, dataframe, or sparse matrix, shape (n_samples, n_columns)
+        Dataset
+    y : ndarray or pd.Series, shape (n_samples, ) or None
+        Target column.
+        NOTE(review): no ``target`` is passed to ``get_data``, so this is
+        presumably always None - confirm against the openml documentation.
+    categorical_indicator : boolean ndarray
+        Mask that indicate categorical features.
+    attribute_names : List[str]
+        List of attribute names.
+    """
+    cache_root = f"./exp_data/{data_name}"
+    openml.config.set_root_cache_directory(cache_root)
+    dataset = openml.datasets.get_dataset(f'{data_name}', download_data=True)
+    features, target, is_categorical, names = dataset.get_data(dataset_format="dataframe")
+    return features, target, is_categorical, names
+def import_data_path_to_ignore():
+    """Report entry sizes under ``./exp_data`` and git-ignore the large folders.
+
+    Every direct child of ``./exp_data`` is printed with its size in MB.
+    Sub-folders larger than 99 MB are appended to the ``.gitignore`` in the
+    current working directory unless they are already listed there. Plain
+    files are only reported, never ignored.
+    """
+    def get_folder_size(folder_path):
+        """Recursively calculate the total size of a folder (in bytes)."""
+        total_size = 0
+        for root, _dirs, files in os.walk(folder_path):
+            for f in files:
+                try:
+                    total_size += os.path.getsize(os.path.join(root, f))
+                except OSError:
+                    # Unreadable or vanished file: leave it out of the total.
+                    pass
+        return total_size
+    def list_and_ignore_large_folders(folder_path, limit_mb=99):
+        """List folder sizes and append large ones (> limit_mb) to .gitignore."""
+        gitignore_path = os.path.join(os.getcwd(), ".gitignore")
+        ignore_list = []
+        existing_ignores = set()
+        needs_newline = False
+        # Read existing .gitignore entries to avoid duplicates
+        if os.path.exists(gitignore_path):
+            with open(gitignore_path, "r", encoding="utf-8") as f:
+                content = f.read()
+            # A trailing "/" is dropped so "exp_data/x/" and "exp_data/x"
+            # are recognised as the same entry.
+            existing_ignores = {
+                line.strip().rstrip("/") for line in content.split("\n") if line.strip()
+            }
+            # Without a final newline the first appended path would be
+            # glued onto the last existing pattern.
+            needs_newline = bool(content) and not content.endswith("\n")
+        for entry in os.scandir(folder_path):
+            if entry.is_dir():
+                folder_size_mb = get_folder_size(entry.path) / (1024 * 1024)
+                print(f"{entry.path}/ - {folder_size_mb:.2f} MB")
+                if folder_size_mb > limit_mb:
+                    # .gitignore patterns use "/" as the separator on every platform.
+                    rel_path = os.path.relpath(entry.path, start=os.getcwd()).replace(os.sep, "/")
+                    if rel_path not in existing_ignores:
+                        ignore_list.append(rel_path)
+            elif entry.is_file():
+                file_size_mb = os.path.getsize(entry.path) / (1024 * 1024)
+                print(f"{entry.path} - {file_size_mb:.2f} MB")
+        # Append new paths to .gitignore
+        if ignore_list:
+            with open(gitignore_path, "a", encoding="utf-8") as f:
+                if needs_newline:
+                    f.write("\n")
+                f.writelines(p + "\n" for p in ignore_list)
+            print("\n✅ The following paths have been added to .gitignore:\n" + "\n".join(ignore_list))
+        else:
+            print(f"\nNo folders exceed the size limit ({limit_mb} MB).")
+    folder_path = "./exp_data"
+    list_and_ignore_large_folders(folder_path, limit_mb=99)

{junshan_kit-2.2.3.dist-info → junshan_kit-2.5.1.dist-info}/METADATA RENAMED Viewed

@@ -1,11 +1,9 @@
 Metadata-Version: 2.4
 Name: junshan_kit
-Version: 2.2.3
+Version: 2.5.1
 Summary: This is an optimization tool.
 Author-email: Junshan Yin <junshanyin@163.com>
 Requires-Dist: kaggle==1.7.4.5
 Requires-Dist: kagglehub==0.3.13
-Requires-Dist: numpy==2.2.6
-Requires-Dist: pandas==2.3.3
-Requires-Dist: scikit-learn==1.7.1
+Requires-Dist: openml==0.15.1
 Requires-Dist: selenium==4.36.0

junshan_kit-2.5.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,13 @@
+junshan_kit/DataHub.py,sha256=D9G2vjCNvDLer4qoKdowgWJChLMQQn7cVhAPZLvRrbE,3332
+junshan_kit/DataProcessor.py,sha256=-6qjG52NDYq746vBPpc0uW2cfbc4syqSWZIzTxJt6fE,11806
+junshan_kit/DataSets.py,sha256=hwGnJsb-Lj90lk6VBwmsDBb3-IA_WgUWzAKayHyq2AI,13391
+junshan_kit/Evaluate_Metrics.py,sha256=Ic3VejsKtGT23ac7QKjRZ3WAETO1KP6JR-EaeiwblJE,1266
+junshan_kit/ModelsHub.py,sha256=z9NyC4PTxo3wCxa2XxOfcjrw9NcDs0LCjBGCp6Z-90s,7084
+junshan_kit/ParametersHub.py,sha256=usM2vu7fBP0n97rNEeJMxhzxRRGHhJMjELrnyJiVvTk,11520
+junshan_kit/Print_Info.py,sha256=yiGc6Qlprj0ds6w2DP7ScAgTBZwswxXqxuIrQ3_liL8,3111
+junshan_kit/TrainingHub.py,sha256=QOQ5BDctGysMbbSOEy6gR-ng0bSmrZl4iJZmj6n52m0,5960
+junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+junshan_kit/kit.py,sha256=F9f5qqn9ve-UVoYtXlFmNGl4YJ3eEy6T1yRrC0s-Wpw,12367
+junshan_kit-2.5.1.dist-info/METADATA,sha256=_gNNCaPWuspBXCD0Ce0maEYKtbO8eaoDXQIhmK2osOI,267
+junshan_kit-2.5.1.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
+junshan_kit-2.5.1.dist-info/RECORD,,

junshan_kit/datahub.py DELETED Viewed

@@ -1,146 +0,0 @@
-"""
-----------------------------------------------------------------------
->>> Author       : Junshan Yin
->>> Last Updated : 2025-10-12
-----------------------------------------------------------------------
-"""
-import kagglehub
-import os, time
-import warnings
-import shutil
-from kaggle.api.kaggle_api_extended import KaggleApi
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-class kaggle_data:
-    def list_datasets(self):
-        api = KaggleApi()
-        api.authenticate()
-        datasets = api.dataset_list(user='junshan888')
-        print('Available datasets:')
-        print('*' * 60)
-        if datasets is not None:
-            for ds in datasets:
-                if ds is not None:
-                    print(ds.title)
-        print('*' * 60)
-    def list_user_datasets(self):
-        warnings.warn(
-            "list_user_datasets() is deprecated. Use list_datasets() instead.",
-            DeprecationWarning,
-            stacklevel=2
-        )
-        return self.list_datasets()
-        # example:  list_user_datasets()
-    #---------------------------------------------------------------
-    def download_data(self, data_name = 'letter-libsvm', copy_path = None):
-        path = kagglehub.dataset_download(f'junshan888/{data_name}')
-        # print("Downloaded to:", path)
-        if copy_path is not None:
-            # Create target directory if it doesn't exist
-            os.makedirs(copy_path, exist_ok=True)
-            # Copy dataset to target directory
-            shutil.copytree(path, copy_path, dirs_exist_ok=True)
-            print(f"✅ Dataset has been copied to: {copy_path}")
-    # example: read_data(copy_path='./exp_data')
-class JianguoDownloaderChrome:
-    def __init__(self, url: str, download_path: str = "./downloads"):
-        self.url = url
-        self.download_path = os.path.abspath(download_path)
-        os.makedirs(self.download_path, exist_ok=True)
-        # Configure Chrome options
-        self.chrome_options = Options()
-        prefs = {
-            "download.default_directory": self.download_path,
-            "download.prompt_for_download": False,
-            "download.directory_upgrade": True,
-            "safebrowsing.enabled": True,
-            "profile.default_content_setting_values.automatic_downloads": 1,
-        }
-        self.chrome_options.add_experimental_option("prefs", prefs)
-        # Optional stability flags
-        self.chrome_options.add_argument("--disable-gpu")
-        self.chrome_options.add_argument("--no-sandbox")
-        self.chrome_options.add_argument("--disable-dev-shm-usage")
-        self.chrome_options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
-        # Start Chrome
-        self.driver = webdriver.Chrome(options=self.chrome_options)
-    def open_page(self):
-        """Open the Jianguoyun share page."""
-        print(f"🌐 Opening link: {self.url}")
-        self.driver.get(self.url)
-    def click_download_button(self):
-        """Find and click the download button."""
-        print("🔍 Looking for the download button...")
-        wait = WebDriverWait(self.driver, 30)
-        span = wait.until(
-            EC.presence_of_element_located((By.XPATH, "//span[contains(text(),'下载')]"))
-        )
-        parent = span.find_element(By.XPATH, "./..")
-        self.driver.execute_script("arguments[0].click();", parent)
-        print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")
-        # If Jianguoyun opens a new tab, switch to it
-        time.sleep(3)
-        if len(self.driver.window_handles) > 1:
-            self.driver.switch_to.window(self.driver.window_handles[-1])
-            print("📂 Switched to download tab.")
-    def wait_for_downloads(self, timeout=30000):
-        """Wait until all downloads are finished."""
-        print("⏳ Waiting for downloads to finish...")
-        start_time = time.time()
-        while True:
-            downloading = [f for f in os.listdir(self.download_path) if f.endswith(".crdownload")]
-            if not downloading:
-                print("✅ Download completed!")
-                return True
-            if time.time() - start_time > timeout:
-                print("⏰ Timeout: downloads may not have finished.")
-                return False
-            time.sleep(2)
-    def get_latest_file(self):
-        """Return the most recently downloaded file (if any)."""
-        files = [os.path.join(self.download_path, f) for f in os.listdir(self.download_path)]
-        return max(files, key=os.path.getctime) if files else None
-    def close(self):
-        """Close the browser."""
-        self.driver.quit()
-        print("🚪 Browser closed.")
-    def run(self):
-        """Run the complete download process."""
-        print('*'*50)
-        try:
-            self.open_page()
-            self.click_download_button()
-            self.wait_for_downloads()
-            latest = self.get_latest_file()
-            if latest:
-                print(f"📄 Latest downloaded file: {latest}")
-        except Exception as e:
-            print("❌ Error occurred:", e)
-        finally:
-            self.close()
-        print('*'*50)

junshan-kit 2.2.3__py2.py3-none-any.whl → 2.5.1__py2.py3-none-any.whl

junshan-kit 2.2.3py2.py3-none-any.whl → 2.5.1py2.py3-none-any.whl