junshan-kit 2.2.3__py2.py3-none-any.whl → 2.2.5__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,146 +1,68 @@
1
1
  """
2
2
  ----------------------------------------------------------------------
3
- >>> Author : Junshan Yin
3
+ >>> Author : Junshan Yin
4
4
  >>> Last Updated : 2025-10-12
5
5
  ----------------------------------------------------------------------
6
6
  """
7
7
 
8
8
  import pandas as pd
9
- import os
10
- from sklearn.preprocessing import StandardScaler
11
- import junshan_kit.datahub
12
- import zipfile
13
-
14
- class CSVToPandasMeta:
15
- def __init__(self):
16
- self.data_downloader = junshan_kit.datahub.kaggle_data()
17
-
18
-
19
- def read_csv(self, data_name):
20
- self.csv_path = f'exp_data/{data_name}/{data_name}.csv'
21
- if not os.path.exists(self.csv_path):
22
- self.data_downloader.download_data(f'{data_name}', f'exp_data/{data_name}')
23
-
24
- # ----------------- ccfd_kaggle ----------------------------------
25
- def ccfd_kaggle(self, data_name = 'ccfd-kaggle', show_info = True):
26
- # download data if not exist
27
- self.read_csv(data_name)
28
-
29
- df = pd.read_csv(self.csv_path)
30
- m_before, n_before = df.shape
31
- df = df.dropna(axis=0, how='any')
32
- m_after, n_after = df.shape
33
- df['Class'] = df['Class'].replace(0, -1)
34
-
35
- if show_info:
36
- pos_count = (df['Class'] == 1).sum()
37
- neg_count = (df['Class'] == -1).sum()
38
-
39
- print('\n' + '='*60)
40
- print(f"{'CCFD-Kaggle Dataset Info':^60}")
41
- print('='*60)
42
- print(f"{'Original size:':<25} {m_before} rows x {n_before} cols")
43
- print(f"{'Size after dropping NaNs:':<25} {m_after} rows x {n_after} cols")
44
- print(f"{'Positive samples (+1):':<25} {pos_count}")
45
- print(f"{'Negative samples (-1):':<25} {neg_count}")
46
- print(f"{'Export size:':<25} {m_after} rows x {n_after} cols")
47
- print('-'*60)
48
- print(f"More details: https://www.jianguoyun.com/p/Dd1clVgQ4ZThCxiwzZQGIAA")
49
- print('='*60 + '\n')
50
-
51
- return df
52
-
53
- # ------------------------
54
- def ghpdd_kaggle(self, data_name='ghpdd-kaggle', show_info=True):
55
- # download data if not exist
56
- self.read_csv(data_name)
57
-
58
- # read csv
59
- df = pd.read_csv(self.csv_path)
60
- m_before, n_before = df.shape
61
-
62
- # drop NaNs
63
- df = df.dropna(axis=0, how='any')
64
- m_after, n_after = df.shape
65
-
66
- # drop unique identifier
67
- if 'property_id' in df.columns:
68
- df.drop(columns=['property_id'], inplace=True)
69
-
70
- # Replace label 0 with -1
71
- df['decision'] = df['decision'].replace(0, -1)
72
-
73
- # Identify categorical and numerical columns
74
- cat_cols = ['country', 'city', 'property_type', 'furnishing_status']
75
- num_cols = [col for col in df.columns if col not in cat_cols + ['decision']]
76
-
77
- # One-Hot encode categorical columns
78
- df = pd.get_dummies(df, columns=cat_cols)
79
-
80
- # Convert boolean columns to int
81
- bool_cols = df.select_dtypes(include='bool').columns
82
- for col in bool_cols:
83
- df[col] = df[col].astype(int)
84
-
85
- # Standardize numerical columns
86
- scaler = StandardScaler()
87
- df[num_cols] = scaler.fit_transform(df[num_cols])
88
-
89
- # The size after export
90
- m_export, n_export = df.shape
91
-
92
- if show_info:
93
- pos_count = (df['decision'] == 1).sum()
94
- neg_count = (df['decision'] == -1).sum()
95
-
96
- print('\n' + '='*70)
97
- print(f"{'GHPDD-Kaggle Dataset Info':^70}")
98
- print('='*70)
99
- print(f"{'Original size:':<35} {m_before} rows x {n_before} cols")
100
- print(f"{'Size after dropping NaNs:':<35} {m_after} rows x {n_after} cols")
101
- print(f"{'Positive samples (+1):':<35} {pos_count}")
102
- print(f"{'Negative samples (-1):':<35} {neg_count}")
103
- print(f"{'Export size (after encoding & scaling):':<35} {m_export} rows x {n_export} cols")
104
- print('-'*70)
105
- print(f"{'Note: categorical columns one-hot encoded, numerical standardized.'}")
106
- print(f"More details: https://www.jianguoyun.com/p/DU6Lr9oQqdHDDRj5sI0GIAA")
107
- print('='*70 + '\n')
108
-
109
- return df
110
-
111
9
 
112
10
 
113
11
  class CSV_TO_Pandas:
114
12
  def __init__(self):
115
13
  pass
116
14
 
117
- def unzip_file(self, zip_path: str, unzip_folder: str):
118
- """
119
- Args:
120
- zip_path (str): Path to the ZIP file to extract.
121
- dest_folder (str, optional): Folder to extract files into.
122
- If None, the function will create a folder with the same
123
- name as the ZIP file (without extension).
124
-
125
- Examples:
126
- >>> zip_path = "./downloads/data.zip"
127
- >>> unzip_folder = "./exp_data/data"
128
- >>> unzip_file(zip_path, unzip_folder)
15
+ def preprocess_dataset(
16
+ self,
17
+ csv_path,
18
+ drop_cols: list,
19
+ label_col: str,
20
+ label_map: dict,
21
+ print_info=False,
22
+ ):
129
23
  """
24
+ Preprocess a CSV dataset by performing data cleaning, label mapping, and feature encoding.
130
25
 
131
- if unzip_folder is None:
132
- unzip_folder = os.path.splitext(os.path.basename(zip_path))[0]
26
+ This function loads a dataset from a CSV file, removes specified non-feature columns,
27
+ drops rows with missing values, maps the target label to numerical values, and
28
+ one-hot encodes categorical features. Optionally, it can print dataset statistics
29
+ before and after preprocessing.
133
30
 
134
- os.makedirs(unzip_folder, exist_ok=True)
135
-
136
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
137
- zip_ref.extractall(unzip_folder)
138
-
139
- print(f"✅ Extracted '{zip_path}' to '{os.path.abspath(unzip_folder)}'")
140
-
141
- # -----------------------------------------------------
142
-
143
- def clean_data(self, csv_path, drop_cols: list, label_col: str, label_map: dict, print_info = False):
31
+ Args:
32
+ csv_path (str):
33
+ Path to the input CSV dataset.
34
+ drop_cols (list):
35
+ List of column names to drop from the dataset.
36
+ label_col (str):
37
+ Name of the target label column.
38
+ label_map (dict):
39
+ Mapping dictionary for label conversion (e.g., {"yes": 1, "no": -1}).
40
+ print_info (bool, optional):
41
+ Whether to print preprocessing information and dataset statistics.
42
+ Defaults to False.
43
+
44
+ Returns:
45
+ pandas.DataFrame:
46
+ The cleaned and preprocessed dataset ready for model input.
47
+
48
+ Steps:
49
+ 1. Load the dataset from CSV.
50
+ 2. Drop non-informative or irrelevant columns.
51
+ 3. Remove rows with missing values.
52
+ 4. Map label column to numerical values according to `label_map`.
53
+ 5. One-hot encode categorical (non-label) text features.
54
+ 6. Optionally print dataset information and summary statistics.
55
+
56
+ Example:
57
+ >>> label_map = {"positive": 1, "negative": -1}
58
+ >>> df = data_handler.preprocess_dataset(
59
+ ... csv_path="data/raw.csv",
60
+ ... drop_cols=["id", "timestamp"],
61
+ ... label_col="sentiment",
62
+ ... label_map=label_map,
63
+ ... print_info=True
64
+ ... )
65
+ """
144
66
  # Step 0: Load the dataset
145
67
  df = pd.read_csv(csv_path)
146
68
 
@@ -151,15 +73,19 @@ class CSV_TO_Pandas:
151
73
  df = df.drop(columns=drop_cols)
152
74
 
153
75
  # Step 2: Remove rows with missing values
154
- df = df.dropna(axis=0, how='any')
76
+ df = df.dropna(axis=0, how="any")
155
77
  m_encoded, n_encoded = df.shape
156
-
157
- # Step 3: Map target label to -1 and +1
78
+
79
+ # Step 3: Map target label (to -1 and +1)
158
80
  df[label_col] = df[label_col].map(label_map)
159
81
 
160
82
  # Step 4: Encode categorical features (exclude label column)
161
- text_feature_cols = df.select_dtypes(include=['object', 'string', 'category']).columns
162
- text_feature_cols = [col for col in text_feature_cols if col != label_col] # ✅ exclude label
83
+ text_feature_cols = df.select_dtypes(
84
+ include=["object", "string", "category"]
85
+ ).columns
86
+ text_feature_cols = [
87
+ col for col in text_feature_cols if col != label_col
88
+ ] # ✅ exclude label
163
89
 
164
90
  df = pd.get_dummies(df, columns=text_feature_cols, dtype=int)
165
91
  m_cleaned, n_cleaned = df.shape
@@ -170,23 +96,27 @@ class CSV_TO_Pandas:
170
96
  neg_count = (df[label_col] == -1).sum()
171
97
 
172
98
  # Step 6: Print dataset information
173
- print('\n' + '='*80)
99
+ print("\n" + "=" * 80)
174
100
  print(f"{'Dataset Info':^70}")
175
- print('='*80)
101
+ print("=" * 80)
176
102
  print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
177
- print(f"{'Size after dropping NaN & non-feature cols:':<40} {m_cleaned} rows x {n_cleaned} cols")
103
+ print(
104
+ f"{'Size after dropping NaN & non-feature cols:':<40} {m_cleaned} rows x {n_cleaned} cols"
105
+ )
178
106
  print(f"{'Positive samples (+1):':<40} {pos_count}")
179
107
  print(f"{'Negative samples (-1):':<40} {neg_count}")
180
- print(f"{'Size after one-hot encoding:':<40} {m_encoded} rows x {n_encoded} cols")
181
- print('-'*80)
108
+ print(
109
+ f"{'Size after one-hot encoding:':<40} {m_encoded} rows x {n_encoded} cols"
110
+ )
111
+ print("-" * 80)
182
112
  print(f"Note:")
183
113
  print(f"{'Label column:':<40} {label_col}")
184
- print(f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}")
185
- print(f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}")
186
- print('='*80 + '\n')
114
+ print(
115
+ f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
116
+ )
117
+ print(
118
+ f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
119
+ )
120
+ print("=" * 80 + "\n")
187
121
 
188
122
  return df
189
-
190
-
191
-
192
-
@@ -0,0 +1,3 @@
1
+
2
+
3
+
junshan_kit/kit.py CHANGED
@@ -6,7 +6,12 @@
6
6
  """
7
7
 
8
8
  import zipfile
9
- import os
9
+ import os, time
10
+
11
+ from selenium import webdriver
12
+ from selenium.webdriver.common.by import By
13
+ from selenium.webdriver.support.ui import WebDriverWait
14
+ from selenium.webdriver.support import expected_conditions as EC
10
15
 
11
16
  def unzip_file(zip_path: str, unzip_folder: str):
12
17
  """
@@ -33,3 +38,205 @@ def unzip_file(zip_path: str, unzip_folder: str):
33
38
  print(f"✅ Extracted '{zip_path}' to '{os.path.abspath(unzip_folder)}'")
34
39
 
35
40
 
41
+ # =============================================================
42
+ # JIANGUOYUN (NUTSTORE) CHROME VERSION
43
+ # =============================================================
44
+
45
+ from selenium.webdriver.chrome.options import Options as ChromeOptions
46
class JianguoyunDownloaderChrome:
    """Download a Jianguoyun (Nutstore) shared file through a Selenium-driven Chrome.

    Example:
        >>> url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
        >>> downloader = JianguoyunDownloaderChrome(url)
        >>> downloader.run()
    """

    # Suffix Chrome gives to files that are still being downloaded.
    _PARTIAL_SUFFIX = ".crdownload"

    def __init__(self, url, download_path="./downloads"):
        """Create the download directory and start a Chrome session.

        Args:
            url: Share-page URL to open.
            download_path: Directory the browser saves files into; it is
                created if missing and stored as an absolute path.
        """
        self.url = url
        self.download_path = os.path.abspath(download_path)
        os.makedirs(self.download_path, exist_ok=True)

        self.chrome_options = ChromeOptions()
        prefs = {
            "download.default_directory": self.download_path,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True,
            "profile.default_content_setting_values.automatic_downloads": 1,
        }
        self.chrome_options.add_experimental_option("prefs", prefs)
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.chrome_options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
        # Uncomment for headless mode:
        # self.chrome_options.add_argument("--headless")

        self.driver = webdriver.Chrome(options=self.chrome_options)

    def open_page(self):
        """Navigate the browser to ``self.url`` and print the page title."""
        print(f"🌐 Opening page: {self.url}")
        self.driver.get(self.url)
        print(f"✅ Page loaded: {self.driver.title}")

    def click_download_button(self):
        """Find and click the 'Download' button (supports English and Chinese).

        Raises:
            Exception: Whatever Selenium raised while locating or clicking the
                button; it is reported on stdout and then re-raised.
        """
        print("🔍 Searching for the download button...")
        wait = WebDriverWait(self.driver, 30)

        try:
            # Match both English 'Download' (case-insensitive) and Chinese '下载'
            xpath = (
                "//span[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
                " | //button[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
                " | //a[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
                " | //span[contains(text(),'下载')]"
                " | //button[contains(text(),'下载')]"
                " | //a[contains(text(),'下载')]"
            )

            button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))

            # Click using JavaScript to avoid overlay or interaction issues
            self.driver.execute_script("arguments[0].click();", button)
            print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")

            # If the cloud service opens a new tab, switch to it
            time.sleep(3)
            if len(self.driver.window_handles) > 1:
                self.driver.switch_to.window(self.driver.window_handles[-1])
                print("📂 Switched to the new download tab.")

        except Exception as e:
            print("❌ Failed to find or click the download button:", e)
            raise

    def wait_for_downloads(self, timeout=3600, poll_interval=2):
        """Poll the download directory until no partial files remain.

        Args:
            timeout: Maximum number of seconds to wait.
            poll_interval: Seconds to sleep between directory scans.

        Returns:
            bool: True when no ``.crdownload`` file is left in
            ``self.download_path``; False when ``timeout`` elapsed first.
        """
        print("⏳ Waiting for downloads to finish...")
        # Monotonic clock: elapsed time is unaffected by wall-clock adjustments.
        deadline = time.monotonic() + timeout
        # NOTE(review): an empty directory counts as "completed", so a download
        # that has not started writing yet is reported as finished -- confirm
        # the 3 s pause in click_download_button() is enough for callers.
        while True:
            pending = [
                name
                for name in os.listdir(self.download_path)
                if name.endswith(self._PARTIAL_SUFFIX)
            ]
            if not pending:
                print("✅ Download completed!")
                return True
            # Check the deadline after scanning so the directory is inspected
            # at least once, even with timeout <= 0.
            if time.monotonic() >= deadline:
                break
            time.sleep(poll_interval)
        # Report the timeout that was actually used, not a fixed "1 hour".
        print(f"⚠️ Timeout: download not completed within {timeout} seconds")
        return False

    # Alias so both downloader classes answer to the same method name.
    wait_for_download = wait_for_downloads

    def close(self):
        """Quit the browser session."""
        self.driver.quit()
        print("🚪 Browser closed.")

    def run(self):
        """Run the whole flow: open the page, click download, wait, close.

        Errors are reported on stdout rather than raised (best-effort), and
        the browser is always closed.

        Returns:
            bool: True when the download was seen to finish, False otherwise.
        """
        print('*' * 60)
        finished = False
        try:
            self.open_page()
            self.click_download_button()
            finished = bool(self.wait_for_downloads())
        except Exception as e:
            print("❌ Error:", e)
        finally:
            self.close()
            print('*' * 60)
        return finished
139
+
140
+
141
+ # =============================================================
142
+ # JIANGUOYUN (NUTSTORE) FIREFOX VERSION
143
+ # =============================================================
144
+
145
+ from selenium.webdriver.firefox.options import Options as FirefoxOptions
146
+ from selenium.webdriver.firefox.service import Service
147
+
148
class JianguoyunDownloaderFirefox:
    """Download a Jianguoyun (Nutstore) shared file through a headless Selenium Firefox.

    Example:
        >>> url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
        >>> downloader = JianguoyunDownloaderFirefox(url)
        >>> downloader.run()
    """

    # Driver location used by default when it exists (snap-packaged Firefox).
    _SNAP_GECKODRIVER = "/snap/bin/geckodriver"

    def __init__(self, url, download_path="./downloads", geckodriver_path=None):
        """Create the download directory and start a headless Firefox session.

        Args:
            url: Share-page URL to open.
            download_path: Directory the browser saves files into; it is
                created if missing and stored as an absolute path.
            geckodriver_path: Explicit path to the geckodriver executable.
                When omitted, ``/snap/bin/geckodriver`` is used if it exists;
                otherwise Selenium is left to locate the driver itself.
        """
        self.url = url
        self.download_path = os.path.abspath(download_path)
        os.makedirs(self.download_path, exist_ok=True)

        options = FirefoxOptions()
        options.add_argument("--headless")
        options.set_preference("browser.download.folderList", 2)
        options.set_preference("browser.download.manager.showWhenStarting", False)
        options.set_preference("browser.download.dir", self.download_path)
        options.set_preference("browser.helperApps.neverAsk.saveToDisk",
                               "application/zip,application/octet-stream,application/x-zip-compressed,multipart/x-zip")
        options.set_preference("pdfjs.disabled", True)

        # Keep the snap location as the default, but do not hard-fail on
        # systems where geckodriver is installed elsewhere.
        if geckodriver_path is None and os.path.exists(self._SNAP_GECKODRIVER):
            geckodriver_path = self._SNAP_GECKODRIVER
        service = Service(geckodriver_path) if geckodriver_path else Service()
        self.driver = webdriver.Firefox(service=service, options=options)

    def open_page(self):
        """Navigate the browser to ``self.url`` and print the page title."""
        print(f"🌐 Opening page: {self.url}")
        self.driver.get(self.url)
        print(f"✅ Page loaded: {self.driver.title}")

    def click_download_button(self):
        """Find and click the 'Download' button (supports English and Chinese).

        Raises:
            Exception: Whatever Selenium raised while locating or clicking the
                button; it is reported on stdout and then re-raised.
        """
        print("🔍 Searching for the download button...")
        wait = WebDriverWait(self.driver, 30)

        try:
            # Match both English 'Download' (case-insensitive) and Chinese '下载'
            xpath = (
                "//span[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
                " | //button[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
                " | //a[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
                " | //span[contains(text(),'下载')]"
                " | //button[contains(text(),'下载')]"
                " | //a[contains(text(),'下载')]"
            )

            button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))

            # Click using JavaScript to avoid overlay or interaction issues
            self.driver.execute_script("arguments[0].click();", button)
            print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")

            # If the cloud service opens a new tab, switch to it
            time.sleep(3)
            if len(self.driver.window_handles) > 1:
                self.driver.switch_to.window(self.driver.window_handles[-1])
                print("📂 Switched to the new download tab.")

        except Exception as e:
            print("❌ Failed to find or click the download button:", e)
            raise

    def wait_for_download(self, timeout=3600, poll_interval=2):
        """Wait until all downloads are finished (auto-detects browser type).

        Args:
            timeout: Maximum number of seconds to wait.
            poll_interval: Seconds to sleep between directory scans.

        Returns:
            bool: True when no temporary download file is left in
            ``self.download_path``; False when ``timeout`` elapsed first.
        """
        print("⏳ Waiting for downloads to finish...")
        # Monotonic clock: elapsed time is unaffected by wall-clock adjustments.
        deadline = time.monotonic() + timeout

        # Determine the temporary file extension based on the browser type
        browser_name = self.driver.capabilities["browserName"].lower()
        temp_ext = ".crdownload" if "chrome" in browser_name else ".part"

        while True:
            pending = [
                name
                for name in os.listdir(self.download_path)
                if name.endswith(temp_ext)
            ]
            if not pending:
                print("✅ Download completed!")
                return True
            # Check the deadline after scanning so the directory is inspected
            # at least once, even with timeout <= 0.
            if time.monotonic() >= deadline:
                break
            time.sleep(poll_interval)

        # Previously the timeout path fell through silently and returned None.
        print(f"⚠️ Timeout: download not completed within {timeout} seconds")
        return False

    # Alias so both downloader classes answer to the same method name.
    wait_for_downloads = wait_for_download

    def close(self):
        """Quit the browser session."""
        print("🛑 Closing browser...")
        self.driver.quit()

    def run(self):
        """Run the whole flow: open the page, click download, wait, close.

        Errors are reported on stdout rather than raised (best-effort), and
        the browser is always closed.

        Returns:
            bool: True when the download was seen to finish, False otherwise.
        """
        print('*' * 60)
        finished = False
        try:
            self.open_page()
            self.click_download_button()
            finished = bool(self.wait_for_download(timeout=3600))
        except Exception as e:
            print("❌ Error:", e)
        finally:
            self.close()
            print('*' * 60)
        return finished
239
+
240
+
241
+
242
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: junshan_kit
3
- Version: 2.2.3
3
+ Version: 2.2.5
4
4
  Summary: This is an optimization tool.
5
5
  Author-email: Junshan Yin <junshanyin@163.com>
6
6
  Requires-Dist: kaggle==1.7.4.5
@@ -0,0 +1,7 @@
1
+ junshan_kit/DataProcessor.py,sha256=eryVmS5BFZj8wjDN2QWVHqkbFgFuWU0HXV9s6TGf9QM,4442
2
+ junshan_kit/DataSets.py,sha256=ajz1GSNU9xYVrFEDSz6Xwg7amWQ_yvW75tQa1ZvRIWc,3
3
+ junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ junshan_kit/kit.py,sha256=FaQT0H7FLKUuTNgU0zfU7Qn3MsjeW6C_rsrB_UOEVJ4,9571
5
+ junshan_kit-2.2.5.dist-info/METADATA,sha256=6eUE_T57eUMtYiE958tpJp7glbX2qXyMObOhW_n0INo,329
6
+ junshan_kit-2.2.5.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
7
+ junshan_kit-2.2.5.dist-info/RECORD,,
junshan_kit/datahub.py DELETED
@@ -1,146 +0,0 @@
1
- """
2
- ----------------------------------------------------------------------
3
- >>> Author : Junshan Yin
4
- >>> Last Updated : 2025-10-12
5
- ----------------------------------------------------------------------
6
- """
7
-
8
- import kagglehub
9
- import os, time
10
- import warnings
11
- import shutil
12
- from kaggle.api.kaggle_api_extended import KaggleApi
13
- from selenium import webdriver
14
- from selenium.webdriver.common.by import By
15
- from selenium.webdriver.chrome.options import Options
16
- from selenium.webdriver.support.ui import WebDriverWait
17
- from selenium.webdriver.support import expected_conditions as EC
18
-
19
- class kaggle_data:
20
- def list_datasets(self):
21
- api = KaggleApi()
22
- api.authenticate()
23
- datasets = api.dataset_list(user='junshan888')
24
- print('Available datasets:')
25
- print('*' * 60)
26
- if datasets is not None:
27
- for ds in datasets:
28
- if ds is not None:
29
- print(ds.title)
30
- print('*' * 60)
31
-
32
- def list_user_datasets(self):
33
- warnings.warn(
34
- "list_user_datasets() is deprecated. Use list_datasets() instead.",
35
- DeprecationWarning,
36
- stacklevel=2
37
- )
38
- return self.list_datasets()
39
-
40
- # example: list_user_datasets()
41
-
42
- #---------------------------------------------------------------
43
- def download_data(self, data_name = 'letter-libsvm', copy_path = None):
44
- path = kagglehub.dataset_download(f'junshan888/{data_name}')
45
- # print("Downloaded to:", path)
46
- if copy_path is not None:
47
- # Create target directory if it doesn't exist
48
- os.makedirs(copy_path, exist_ok=True)
49
- # Copy dataset to target directory
50
- shutil.copytree(path, copy_path, dirs_exist_ok=True)
51
-
52
- print(f"✅ Dataset has been copied to: {copy_path}")
53
-
54
- # example: read_data(copy_path='./exp_data')
55
-
56
-
57
- class JianguoDownloaderChrome:
58
- def __init__(self, url: str, download_path: str = "./downloads"):
59
- self.url = url
60
- self.download_path = os.path.abspath(download_path)
61
- os.makedirs(self.download_path, exist_ok=True)
62
-
63
- # Configure Chrome options
64
- self.chrome_options = Options()
65
- prefs = {
66
- "download.default_directory": self.download_path,
67
- "download.prompt_for_download": False,
68
- "download.directory_upgrade": True,
69
- "safebrowsing.enabled": True,
70
- "profile.default_content_setting_values.automatic_downloads": 1,
71
- }
72
- self.chrome_options.add_experimental_option("prefs", prefs)
73
-
74
- # Optional stability flags
75
- self.chrome_options.add_argument("--disable-gpu")
76
- self.chrome_options.add_argument("--no-sandbox")
77
- self.chrome_options.add_argument("--disable-dev-shm-usage")
78
- self.chrome_options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
79
-
80
- # Start Chrome
81
- self.driver = webdriver.Chrome(options=self.chrome_options)
82
-
83
- def open_page(self):
84
- """Open the Jianguoyun share page."""
85
- print(f"🌐 Opening link: {self.url}")
86
- self.driver.get(self.url)
87
-
88
- def click_download_button(self):
89
- """Find and click the download button."""
90
- print("🔍 Looking for the download button...")
91
- wait = WebDriverWait(self.driver, 30)
92
- span = wait.until(
93
- EC.presence_of_element_located((By.XPATH, "//span[contains(text(),'下载')]"))
94
- )
95
- parent = span.find_element(By.XPATH, "./..")
96
- self.driver.execute_script("arguments[0].click();", parent)
97
- print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")
98
-
99
- # If Jianguoyun opens a new tab, switch to it
100
- time.sleep(3)
101
- if len(self.driver.window_handles) > 1:
102
- self.driver.switch_to.window(self.driver.window_handles[-1])
103
- print("📂 Switched to download tab.")
104
-
105
- def wait_for_downloads(self, timeout=30000):
106
- """Wait until all downloads are finished."""
107
- print("⏳ Waiting for downloads to finish...")
108
- start_time = time.time()
109
- while True:
110
- downloading = [f for f in os.listdir(self.download_path) if f.endswith(".crdownload")]
111
- if not downloading:
112
- print("✅ Download completed!")
113
- return True
114
- if time.time() - start_time > timeout:
115
- print("⏰ Timeout: downloads may not have finished.")
116
- return False
117
- time.sleep(2)
118
-
119
- def get_latest_file(self):
120
- """Return the most recently downloaded file (if any)."""
121
- files = [os.path.join(self.download_path, f) for f in os.listdir(self.download_path)]
122
- return max(files, key=os.path.getctime) if files else None
123
-
124
- def close(self):
125
- """Close the browser."""
126
- self.driver.quit()
127
- print("🚪 Browser closed.")
128
-
129
- def run(self):
130
- """Run the complete download process."""
131
- print('*'*50)
132
- try:
133
- self.open_page()
134
- self.click_download_button()
135
- self.wait_for_downloads()
136
- latest = self.get_latest_file()
137
- if latest:
138
- print(f"📄 Latest downloaded file: {latest}")
139
- except Exception as e:
140
- print("❌ Error occurred:", e)
141
- finally:
142
- self.close()
143
- print('*'*50)
144
-
145
-
146
-
junshan_kit/meta.py DELETED
@@ -1,256 +0,0 @@
1
- """
2
- ----------------------------------------------------------------------
3
- >>> Author : Junshan Yin
4
- >>> Last Updated : 2025-10-12
5
- ----------------------------------------------------------------------
6
- """
7
-
8
- import os
9
- import time
10
- import shutil
11
- import warnings
12
- import kagglehub
13
- from kaggle.api.kaggle_api_extended import KaggleApi
14
-
15
- from selenium import webdriver
16
- from selenium.webdriver.common.by import By
17
- from selenium.webdriver.support.ui import WebDriverWait
18
- from selenium.webdriver.support import expected_conditions as EC
19
-
20
-
21
- # =============================================================
22
- # KAGGLE DATA MANAGEMENT
23
- # =============================================================
24
-
25
- class KaggleData:
26
- def list_datasets(self):
27
- """
28
- List available datasets from a specific user.
29
- """
30
- api = KaggleApi()
31
- api.authenticate()
32
- datasets = api.dataset_list(user='junshan888')
33
- print('Available datasets:')
34
- print('*' * 60)
35
- if datasets:
36
- for ds in datasets:
37
- print(ds.title) # type: ignore
38
- print('*' * 60)
39
-
40
- def list_user_datasets(self):
41
- warnings.warn(
42
- "list_user_datasets() is deprecated. Use list_datasets() instead.",
43
- DeprecationWarning,
44
- stacklevel=2
45
- )
46
- return self.list_datasets()
47
-
48
- def download_data(self, data_name='letter-libsvm', copy_path=None):
49
- """Download a Kaggle dataset and optionally copy it to a target folder."""
50
- path = kagglehub.dataset_download(f'junshan888/{data_name}')
51
- if copy_path:
52
- os.makedirs(copy_path, exist_ok=True)
53
- shutil.copytree(path, copy_path, dirs_exist_ok=True)
54
- print(f"✅ Dataset copied to: {copy_path}")
55
-
56
-
57
- # =============================================================
58
- # JIANGUOYUN (NUTSTORE) CHROME VERSION
59
- # =============================================================
60
-
61
- from selenium.webdriver.chrome.options import Options as ChromeOptions
62
-
63
- class JianguoyunDownloaderChrome:
64
- """ Example:
65
- >>> url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
66
- >>> downloader = JianguoyunDownloaderChrome(url)
67
- >>> downloader.run()
68
- """
69
- def __init__(self, url, download_path="./downloads"):
70
- self.url = url
71
- self.download_path = os.path.abspath(download_path)
72
- os.makedirs(self.download_path, exist_ok=True)
73
-
74
- self.chrome_options = ChromeOptions()
75
- prefs = {
76
- "download.default_directory": self.download_path,
77
- "download.prompt_for_download": False,
78
- "download.directory_upgrade": True,
79
- "safebrowsing.enabled": True,
80
- "profile.default_content_setting_values.automatic_downloads": 1,
81
- }
82
- self.chrome_options.add_experimental_option("prefs", prefs)
83
- self.chrome_options.add_argument("--disable-gpu")
84
- self.chrome_options.add_argument("--no-sandbox")
85
- self.chrome_options.add_argument("--disable-dev-shm-usage")
86
- self.chrome_options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
87
- # Uncomment for headless mode:
88
- # self.chrome_options.add_argument("--headless")
89
-
90
- self.driver = webdriver.Chrome(options=self.chrome_options)
91
-
92
- def open_page(self):
93
- print(f"🌐 Opening page: {self.url}")
94
- self.driver.get(self.url)
95
- print(f"✅ Page loaded: {self.driver.title}")
96
-
97
- def click_download_button(self):
98
- """Find and click the 'Download' button (supports English and Chinese)."""
99
- print("🔍 Searching for the download button...")
100
- wait = WebDriverWait(self.driver, 30)
101
-
102
- try:
103
- # Match both English 'Download' (case-insensitive) and Chinese '下载'
104
- xpath = (
105
- "//span[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
106
- " | //button[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
107
- " | //a[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
108
- " | //span[contains(text(),'下载')]"
109
- " | //button[contains(text(),'下载')]"
110
- " | //a[contains(text(),'下载')]"
111
- )
112
-
113
- button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
114
-
115
- # Click using JavaScript to avoid overlay or interaction issues
116
- self.driver.execute_script("arguments[0].click();", button)
117
- print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")
118
-
119
- # If the cloud service opens a new tab, switch to it
120
- time.sleep(3)
121
- if len(self.driver.window_handles) > 1:
122
- self.driver.switch_to.window(self.driver.window_handles[-1])
123
- print("📂 Switched to the new download tab.")
124
-
125
- except Exception as e:
126
- print("❌ Failed to find or click the download button:", e)
127
- raise
128
-
129
-
130
- def wait_for_downloads(self, timeout=3600):
131
- print("⏳ Waiting for downloads to finish...")
132
- start_time = time.time()
133
- while time.time() - start_time < timeout:
134
- downloading = [f for f in os.listdir(self.download_path) if f.endswith(".crdownload")]
135
- if not downloading:
136
- print("✅ Download completed!")
137
- return
138
- time.sleep(2)
139
- print("⚠️ Timeout: download not completed within 1 hour")
140
-
141
- def close(self):
142
- self.driver.quit()
143
- print("🚪 Browser closed.")
144
-
145
- def run(self):
146
- print('*' * 60)
147
- try:
148
- self.open_page()
149
- self.click_download_button()
150
- self.wait_for_downloads()
151
- except Exception as e:
152
- print("❌ Error:", e)
153
- finally:
154
- self.close()
155
- print('*' * 60)
156
-
157
-
158
- # =============================================================
159
- # JIANGUOYUN (NUTSTORE) FIREFOX VERSION
160
- # =============================================================
161
-
162
- from selenium.webdriver.firefox.options import Options as FirefoxOptions
163
- from selenium.webdriver.firefox.service import Service
164
-
165
- class JianguoyunDownloaderFirefox:
166
- """ Example:
167
- >>> url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
168
- >>> downloader = JianguoyunDownloaderFirefox(url)
169
- >>> downloader.run()
170
- """
171
- def __init__(self, url, download_path="./downloads"):
172
- self.url = url
173
- self.download_path = os.path.abspath(download_path)
174
- os.makedirs(self.download_path, exist_ok=True)
175
-
176
- options = FirefoxOptions()
177
- options.add_argument("--headless")
178
- options.set_preference("browser.download.folderList", 2)
179
- options.set_preference("browser.download.manager.showWhenStarting", False)
180
- options.set_preference("browser.download.dir", self.download_path)
181
- options.set_preference("browser.helperApps.neverAsk.saveToDisk",
182
- "application/zip,application/octet-stream,application/x-zip-compressed,multipart/x-zip")
183
- options.set_preference("pdfjs.disabled", True)
184
-
185
- service = Service("/snap/bin/geckodriver")
186
- self.driver = webdriver.Firefox(service=service, options=options)
187
-
188
- def open_page(self):
189
- print(f"🌐 Opening page: {self.url}")
190
- self.driver.get(self.url)
191
- print(f"✅ Page loaded: {self.driver.title}")
192
-
193
- def click_download_button(self):
194
- """Find and click the 'Download' button (supports English and Chinese)."""
195
- print("🔍 Searching for the download button...")
196
- wait = WebDriverWait(self.driver, 30)
197
-
198
- try:
199
- # Match both English 'Download' (case-insensitive) and Chinese '下载'
200
- xpath = (
201
- "//span[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
202
- " | //button[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
203
- " | //a[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
204
- " | //span[contains(text(),'下载')]"
205
- " | //button[contains(text(),'下载')]"
206
- " | //a[contains(text(),'下载')]"
207
- )
208
-
209
- button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
210
-
211
- # Click using JavaScript to avoid overlay or interaction issues
212
- self.driver.execute_script("arguments[0].click();", button)
213
- print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")
214
-
215
- # If the cloud service opens a new tab, switch to it
216
- time.sleep(3)
217
- if len(self.driver.window_handles) > 1:
218
- self.driver.switch_to.window(self.driver.window_handles[-1])
219
- print("📂 Switched to the new download tab.")
220
-
221
- except Exception as e:
222
- print("❌ Failed to find or click the download button:", e)
223
- raise
224
-
225
- def wait_for_download(self, timeout=3600):
226
- """Wait until all downloads are finished (auto-detects browser type)."""
227
- print("⏳ Waiting for downloads to finish...")
228
- start_time = time.time()
229
-
230
- # Determine the temporary file extension based on the browser type
231
- temp_ext = ".crdownload" if "chrome" in self.driver.capabilities["browserName"].lower() else ".part"
232
-
233
- while time.time() - start_time < timeout:
234
- downloading = [f for f in os.listdir(self.download_path) if f.endswith(temp_ext)]
235
- if not downloading:
236
- print("✅ Download completed!")
237
- return True
238
- time.sleep(2)
239
-
240
-
241
- def close(self):
242
- print("🛑 Closing browser...")
243
- self.driver.quit()
244
-
245
- def run(self):
246
- print('*' * 60)
247
- try:
248
- self.open_page()
249
- self.click_download_button()
250
- self.wait_for_download(timeout=3600)
251
- except Exception as e:
252
- print("❌ Error:", e)
253
- finally:
254
- self.close()
255
- print('*' * 60)
256
-
junshan_kit/test.py DELETED
@@ -1,8 +0,0 @@
1
- from DataProcessor import CSV_TO_Pandas
2
-
3
-
4
- data_ = CSV_TO_Pandas()
5
-
6
-
7
-
8
- data_.clean_data('data_csv/Electric Vehicle Population Data/Electric_Vehicle_Population_Data.csv', [], [], {})
@@ -1,9 +0,0 @@
1
- junshan_kit/DataProcessor.py,sha256=rDL3NLD-WlT3x6x74XkB_542_sk3BrnIk5p4rYlVn5o,7212
2
- junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- junshan_kit/datahub.py,sha256=_Q_3AlZ8vk1Ma6h9I44SxWBA8w9m1CQNvYztMcsxzUo,5377
4
- junshan_kit/kit.py,sha256=h4Q_87hEJbXH4A9ryaGMu_nle5RlM8OR_PaW_hWCVBY,1040
5
- junshan_kit/meta.py,sha256=SiY9P93aABrksNE6G3ft5gzcuP2cUgc4Vx6LH7ZFmzg,10113
6
- junshan_kit/test.py,sha256=FgzG4oG7kkq6rWasxdBSY1qx_B0navRI5Ei-wJ1Dvo0,180
7
- junshan_kit-2.2.3.dist-info/METADATA,sha256=h4_Z0LMIigJgrkt2hD5TcYJwOCkArMRySh-OopgZ9Xo,329
8
- junshan_kit-2.2.3.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
9
- junshan_kit-2.2.3.dist-info/RECORD,,