junshan-kit 2.2.3__py2.py3-none-any.whl → 2.5.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
junshan_kit/kit.py CHANGED
@@ -6,7 +6,12 @@
6
6
  """
7
7
 
8
8
  import zipfile
9
- import os
9
+ import os, time, openml
10
+
11
+ from selenium import webdriver
12
+ from selenium.webdriver.common.by import By
13
+ from selenium.webdriver.support.ui import WebDriverWait
14
+ from selenium.webdriver.support import expected_conditions as EC
10
15
 
11
16
  def unzip_file(zip_path: str, unzip_folder: str):
12
17
  """
@@ -30,6 +35,278 @@ def unzip_file(zip_path: str, unzip_folder: str):
30
35
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
31
36
  zip_ref.extractall(unzip_folder)
32
37
 
33
- print(f" Extracted '{zip_path}' to '{os.path.abspath(unzip_folder)}'")
38
+ print(f"- Extracted '{zip_path}' to '{os.path.abspath(unzip_folder)}'")
39
+
40
+
41
+ # =============================================================
42
+ # JIANGUOYUN (NUTSTORE) CHROME VERSION
43
+ # =============================================================
44
+
45
+ from selenium.webdriver.chrome.options import Options as ChromeOptions
46
class JianguoyunDownloaderChrome:
    """Download a file from a Jianguoyun (Nutstore) share page using Chrome.

    The workflow is: open the share URL, click the page's download button,
    then poll the download directory until Chrome's temporary files are gone.

    Example:
        >>> url = "https://www.jianguoyun.com/p/DSQqUq8QqdHDDRiy6I0GIAA"
        >>> downloader = JianguoyunDownloaderChrome(url)
        >>> downloader.run()
    """

    # Suffix Chrome appends to files that are still being downloaded.
    _TEMP_SUFFIX = ".crdownload"

    # Seconds between two scans of the download directory.
    _POLL_SECONDS = 2

    # Matches English 'Download' (case-insensitive via translate()) and
    # Chinese '下载' on <span>, <button> and <a> elements.
    _DOWNLOAD_XPATH = (
        "//span[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
        " | //button[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
        " | //a[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
        " | //span[contains(text(),'下载')]"
        " | //button[contains(text(),'下载')]"
        " | //a[contains(text(),'下载')]"
    )

    def __init__(self, url, download_path="./exp_data"):
        """Create the download directory and start a Chrome session.

        Args:
            url: Share link of the file to download.
            download_path: Directory that receives the downloaded files; it is
                created if missing and stored as an absolute path.
        """
        self.url = url
        self.download_path = os.path.abspath(download_path)
        os.makedirs(self.download_path, exist_ok=True)

        self.chrome_options = ChromeOptions()
        prefs = {
            "download.default_directory": self.download_path,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True,
            "profile.default_content_setting_values.automatic_downloads": 1,
        }
        self.chrome_options.add_experimental_option("prefs", prefs)
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.chrome_options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
        # Headless mode is intentionally off; add "--headless" here to enable it.

        self.driver = webdriver.Chrome(options=self.chrome_options)

    def open_page(self):
        """Load the share page in the browser."""
        print(f">>> Opening page: {self.url}")
        self.driver.get(self.url)
        print(f">>> Page loaded: {self.driver.title}")

    def click_download_button(self):
        """Find and click the 'Download' button (supports English and Chinese).

        Raises:
            Exception: Whatever Selenium raised while locating or clicking the
                button (for example a timeout after 30 seconds); the error is
                printed and re-raised.
        """
        print(">>> Searching for the download button...")
        wait = WebDriverWait(self.driver, 30)

        try:
            button = wait.until(
                EC.element_to_be_clickable((By.XPATH, self._DOWNLOAD_XPATH))
            )

            # Click using JavaScript to avoid overlay or interaction issues
            self.driver.execute_script("arguments[0].click();", button)
            print(f">>> Download button clicked. Files will be saved to: {self.download_path}")

            # If the cloud service opens a new tab, switch to it
            time.sleep(3)
            if len(self.driver.window_handles) > 1:
                self.driver.switch_to.window(self.driver.window_handles[-1])
                print(">>> Switched to the new download tab.")

        except Exception as e:
            print(">>> Failed to find or click the download button:", e)
            raise

    def wait_for_downloads(self, timeout=3600):
        """Poll the download directory until no temporary Chrome file remains.

        Args:
            timeout: Maximum number of seconds to wait.

        Returns:
            True when no ``.crdownload`` file is left in the directory, False
            when ``timeout`` elapsed first.
        """
        print(">>> Waiting for downloads to finish...")
        # monotonic() is immune to wall-clock adjustments during long waits.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            # NOTE(review): an absent temp file is treated as "finished", so a
            # download that has not started yet is also reported as complete.
            pending = [
                name for name in os.listdir(self.download_path)
                if name.endswith(self._TEMP_SUFFIX)
            ]
            if not pending:
                print(">>> Download completed!")
                return True
            time.sleep(self._POLL_SECONDS)
        print(f">>> Timeout: download not completed within {timeout} seconds")
        return False

    # Alias matching the method name used by JianguoyunDownloaderFirefox.
    wait_for_download = wait_for_downloads

    def close(self):
        """Quit the browser session."""
        self.driver.quit()
        print(">>> Browser closed.")

    def run(self):
        """Run the whole download workflow; the browser is always closed.

        Errors are printed rather than propagated (deliberate best effort).
        """
        print('*' * 60)
        try:
            self.open_page()
            self.click_download_button()
            self.wait_for_downloads()
        except Exception as e:
            print(">>> Error:", e)
        finally:
            self.close()
            print('*' * 60)
139
+
140
+
141
+ # =============================================================
142
+ # JIANGUOYUN (NUTSTORE) FIREFOX VERSION
143
+ # =============================================================
144
+
145
+ from selenium.webdriver.firefox.options import Options as FirefoxOptions
146
+ from selenium.webdriver.firefox.service import Service
147
+
148
class JianguoyunDownloaderFirefox:
    """Download a file from a Jianguoyun (Nutstore) share page using headless Firefox.

    The workflow is: open the share URL, click the page's download button,
    then poll the download directory until the browser's temporary files are
    gone.

    Example:
        >>> url = "https://www.jianguoyun.com/p/DSQqUq8QqdHDDRiy6I0GIAA"
        >>> downloader = JianguoyunDownloaderFirefox(url)
        >>> downloader.run()
    """

    # Seconds between two scans of the download directory.
    _POLL_SECONDS = 2

    # Matches English 'Download' (case-insensitive via translate()) and
    # Chinese '下载' on <span>, <button> and <a> elements.
    _DOWNLOAD_XPATH = (
        "//span[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
        " | //button[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
        " | //a[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
        " | //span[contains(text(),'下载')]"
        " | //button[contains(text(),'下载')]"
        " | //a[contains(text(),'下载')]"
    )

    def __init__(self, url, download_path="./exp_data",
                 geckodriver_path="/snap/bin/geckodriver"):
        """Create the download directory and start a headless Firefox session.

        Args:
            url: Share link of the file to download.
            download_path: Directory that receives the downloaded files; it is
                created if missing and stored as an absolute path.
            geckodriver_path: Location of the geckodriver binary. When the
                file does not exist (e.g. Firefox was not installed via snap),
                no explicit path is passed and Selenium locates a driver
                itself.
        """
        self.url = url
        self.download_path = os.path.abspath(download_path)
        os.makedirs(self.download_path, exist_ok=True)

        options = FirefoxOptions()
        options.add_argument("--headless")
        # folderList=2 tells Firefox to use the custom browser.download.dir.
        options.set_preference("browser.download.folderList", 2)
        options.set_preference("browser.download.manager.showWhenStarting", False)
        options.set_preference("browser.download.dir", self.download_path)
        options.set_preference(
            "browser.helperApps.neverAsk.saveToDisk",
            "application/zip,application/octet-stream,application/x-zip-compressed,multipart/x-zip",
        )
        options.set_preference("pdfjs.disabled", True)

        # A hard-coded snap path breaks every non-snap installation, so only
        # pin the driver binary when it is actually present.
        if geckodriver_path and os.path.isfile(geckodriver_path):
            service = Service(executable_path=geckodriver_path)
        else:
            service = Service()
        self.driver = webdriver.Firefox(service=service, options=options)

    def open_page(self):
        """Load the share page in the browser."""
        print(f">>> Opening page: {self.url}")
        self.driver.get(self.url)
        print(f">>> Page loaded: {self.driver.title}")

    def click_download_button(self):
        """Find and click the 'Download' button (supports English and Chinese).

        Raises:
            Exception: Whatever Selenium raised while locating or clicking the
                button (for example a timeout after 30 seconds); the error is
                printed and re-raised.
        """
        print(">>> Searching for the download button...")
        wait = WebDriverWait(self.driver, 30)

        try:
            button = wait.until(
                EC.element_to_be_clickable((By.XPATH, self._DOWNLOAD_XPATH))
            )

            # Click using JavaScript to avoid overlay or interaction issues
            self.driver.execute_script("arguments[0].click();", button)
            print(f">>> Download button clicked. Files will be saved to: {self.download_path}")

            # If the cloud service opens a new tab, switch to it
            time.sleep(3)
            if len(self.driver.window_handles) > 1:
                self.driver.switch_to.window(self.driver.window_handles[-1])
                print(">>> Switched to the new download tab.")

        except Exception as e:
            print(">>> Failed to find or click the download button:", e)
            raise

    def wait_for_download(self, timeout=3600):
        """Wait until all downloads are finished (auto-detects browser type).

        Args:
            timeout: Maximum number of seconds to wait.

        Returns:
            True when no temporary download file is left in the directory,
            False when ``timeout`` elapsed first.
        """
        print(">>> Waiting for downloads to finish...")

        # Determine the temporary file extension based on the browser type
        browser_name = self.driver.capabilities["browserName"].lower()
        temp_ext = ".crdownload" if "chrome" in browser_name else ".part"

        # monotonic() is immune to wall-clock adjustments during long waits.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            # NOTE(review): an absent temp file is treated as "finished", so a
            # download that has not started yet is also reported as complete.
            pending = [
                name for name in os.listdir(self.download_path)
                if name.endswith(temp_ext)
            ]
            if not pending:
                print(">>> Download completed!")
                return True
            time.sleep(self._POLL_SECONDS)

        print(f">>> Timeout: download not completed within {timeout} seconds")
        return False

    # Alias matching the method name used by JianguoyunDownloaderChrome.
    wait_for_downloads = wait_for_download

    def close(self):
        """Quit the browser session."""
        print(">>> Closing browser...")
        self.driver.quit()

    def run(self):
        """Run the whole download workflow; the browser is always closed.

        Errors are printed rather than propagated (deliberate best effort).
        """
        print('*' * 60)
        try:
            self.open_page()
            self.click_download_button()
            self.wait_for_download(timeout=3600)
        except Exception as e:
            print(">>> Error:", e)
        finally:
            self.close()
            print('*' * 60)
239
+
240
+
241
def download_openml_data(data_name):
    """Fetch a dataset from OpenML and return its contents.

    The OpenML cache is redirected to ``./exp_data/<data_name>`` before the
    dataset is requested, so downloaded files end up under that folder.

    Parameters
    ----------
    data_name : str or int
        Identifier handed to ``openml.datasets.get_dataset`` (formatted as a
        string first).

    Returns
    -------
    X : ndarray, dataframe, or sparse matrix, shape (n_samples, n_columns)
        Dataset
    y : ndarray or pd.Series, shape (n_samples, ) or None
        Target column
    categorical_indicator : boolean ndarray
        Mask that indicate categorical features.
    attribute_names : List[str]
        List of attribute names.
    """
    cache_root = f"./exp_data/{data_name}"
    openml.config.set_root_cache_directory(cache_root)

    dataset = openml.datasets.get_dataset(f'{data_name}', download_data=True)

    # NOTE(review): no `target` is passed to get_data(); presumably y is then
    # None and the label column stays inside X - confirm against the OpenML
    # documentation and the callers' expectations.
    features, labels, is_categorical, names = dataset.get_data(
        dataset_format="dataframe"
    )
    return features, labels, is_categorical, names
259
+
260
+
261
def _folder_size_bytes(folder_path):
    """Recursively calculate the total size of a folder (in bytes)."""
    total_size = 0
    for root, _dirs, files in os.walk(folder_path):
        for name in files:
            try:
                total_size += os.path.getsize(os.path.join(root, name))
            except OSError:
                # Best effort: broken symlinks or files removed while walking
                # are simply left out of the total.
                continue
    return total_size


def import_data_path_to_ignore(folder_path="./exp_data", limit_mb=99):
    """List entry sizes in a folder and add oversized sub-folders to .gitignore.

    Every direct child of ``folder_path`` is printed with its size. Each
    sub-folder larger than ``limit_mb`` is appended to the ``.gitignore`` in
    the current working directory, unless it is already listed there.

    Args:
        folder_path: Folder whose direct children are inspected.
        limit_mb: Size threshold in MiB; only sub-folders strictly larger
            than this are ignored.

    Returns:
        The list of paths newly appended to ``.gitignore`` (possibly empty).
    """
    bytes_per_mb = 1024 * 1024
    cwd = os.getcwd()
    gitignore_path = os.path.join(cwd, ".gitignore")

    if not os.path.isdir(folder_path):
        print(f"Folder '{folder_path}' does not exist; nothing to scan.")
        return []

    # Read existing .gitignore entries to avoid duplicates. A trailing slash
    # is dropped so that "dir/" and "dir" count as the same entry.
    existing_ignores = set()
    needs_leading_newline = False
    if os.path.exists(gitignore_path):
        with open(gitignore_path, "r", encoding="utf-8") as f:
            content = f.read()
        existing_ignores = {
            line.strip().rstrip("/") for line in content.splitlines() if line.strip()
        }
        # Without this, the first appended path would be glued onto the last
        # existing line of a file that lacks a final newline.
        needs_leading_newline = bool(content) and not content.endswith("\n")

    oversized_found = False
    ignore_list = []
    with os.scandir(folder_path) as entries:
        for entry in sorted(entries, key=lambda e: e.name):
            if entry.is_dir():
                folder_size_mb = _folder_size_bytes(entry.path) / bytes_per_mb
                print(f"{entry.path}/ - {folder_size_mb:.2f} MB")
                if folder_size_mb > limit_mb:
                    oversized_found = True
                    # gitignore patterns always use forward slashes.
                    rel_path = os.path.relpath(entry.path, start=cwd).replace(os.sep, "/")
                    if rel_path not in existing_ignores:
                        ignore_list.append(rel_path)
            elif entry.is_file():
                file_size_mb = os.path.getsize(entry.path) / bytes_per_mb
                print(f"{entry.path} - {file_size_mb:.2f} MB")

    # Append new paths to .gitignore
    if ignore_list:
        with open(gitignore_path, "a", encoding="utf-8") as f:
            if needs_leading_newline:
                f.write("\n")
            f.writelines(path + "\n" for path in ignore_list)
        print("\n✅ The following paths have been added to .gitignore:\n" + "\n".join(ignore_list))
    elif oversized_found:
        print(f"\nAll folders over the size limit ({limit_mb} MB) are already in .gitignore.")
    else:
        print(f"\nNo folders exceed the size limit ({limit_mb} MB).")

    return ignore_list
309
+
310
+
34
311
 
35
312
 
@@ -1,11 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: junshan_kit
3
- Version: 2.2.3
3
+ Version: 2.5.1
4
4
  Summary: This is an optimization tool.
5
5
  Author-email: Junshan Yin <junshanyin@163.com>
6
6
  Requires-Dist: kaggle==1.7.4.5
7
7
  Requires-Dist: kagglehub==0.3.13
8
- Requires-Dist: numpy==2.2.6
9
- Requires-Dist: pandas==2.3.3
10
- Requires-Dist: scikit-learn==1.7.1
8
+ Requires-Dist: openml==0.15.1
11
9
  Requires-Dist: selenium==4.36.0
@@ -0,0 +1,13 @@
1
+ junshan_kit/DataHub.py,sha256=D9G2vjCNvDLer4qoKdowgWJChLMQQn7cVhAPZLvRrbE,3332
2
+ junshan_kit/DataProcessor.py,sha256=-6qjG52NDYq746vBPpc0uW2cfbc4syqSWZIzTxJt6fE,11806
3
+ junshan_kit/DataSets.py,sha256=hwGnJsb-Lj90lk6VBwmsDBb3-IA_WgUWzAKayHyq2AI,13391
4
+ junshan_kit/Evaluate_Metrics.py,sha256=Ic3VejsKtGT23ac7QKjRZ3WAETO1KP6JR-EaeiwblJE,1266
5
+ junshan_kit/ModelsHub.py,sha256=z9NyC4PTxo3wCxa2XxOfcjrw9NcDs0LCjBGCp6Z-90s,7084
6
+ junshan_kit/ParametersHub.py,sha256=usM2vu7fBP0n97rNEeJMxhzxRRGHhJMjELrnyJiVvTk,11520
7
+ junshan_kit/Print_Info.py,sha256=yiGc6Qlprj0ds6w2DP7ScAgTBZwswxXqxuIrQ3_liL8,3111
8
+ junshan_kit/TrainingHub.py,sha256=QOQ5BDctGysMbbSOEy6gR-ng0bSmrZl4iJZmj6n52m0,5960
9
+ junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ junshan_kit/kit.py,sha256=F9f5qqn9ve-UVoYtXlFmNGl4YJ3eEy6T1yRrC0s-Wpw,12367
11
+ junshan_kit-2.5.1.dist-info/METADATA,sha256=_gNNCaPWuspBXCD0Ce0maEYKtbO8eaoDXQIhmK2osOI,267
12
+ junshan_kit-2.5.1.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
13
+ junshan_kit-2.5.1.dist-info/RECORD,,
junshan_kit/datahub.py DELETED
@@ -1,146 +0,0 @@
1
- """
2
- ----------------------------------------------------------------------
3
- >>> Author : Junshan Yin
4
- >>> Last Updated : 2025-10-12
5
- ----------------------------------------------------------------------
6
- """
7
-
8
- import kagglehub
9
- import os, time
10
- import warnings
11
- import shutil
12
- from kaggle.api.kaggle_api_extended import KaggleApi
13
- from selenium import webdriver
14
- from selenium.webdriver.common.by import By
15
- from selenium.webdriver.chrome.options import Options
16
- from selenium.webdriver.support.ui import WebDriverWait
17
- from selenium.webdriver.support import expected_conditions as EC
18
-
19
- class kaggle_data:
20
- def list_datasets(self):
21
- api = KaggleApi()
22
- api.authenticate()
23
- datasets = api.dataset_list(user='junshan888')
24
- print('Available datasets:')
25
- print('*' * 60)
26
- if datasets is not None:
27
- for ds in datasets:
28
- if ds is not None:
29
- print(ds.title)
30
- print('*' * 60)
31
-
32
- def list_user_datasets(self):
33
- warnings.warn(
34
- "list_user_datasets() is deprecated. Use list_datasets() instead.",
35
- DeprecationWarning,
36
- stacklevel=2
37
- )
38
- return self.list_datasets()
39
-
40
- # example: list_user_datasets()
41
-
42
- #---------------------------------------------------------------
43
- def download_data(self, data_name = 'letter-libsvm', copy_path = None):
44
- path = kagglehub.dataset_download(f'junshan888/{data_name}')
45
- # print("Downloaded to:", path)
46
- if copy_path is not None:
47
- # Create target directory if it doesn't exist
48
- os.makedirs(copy_path, exist_ok=True)
49
- # Copy dataset to target directory
50
- shutil.copytree(path, copy_path, dirs_exist_ok=True)
51
-
52
- print(f"✅ Dataset has been copied to: {copy_path}")
53
-
54
- # example: read_data(copy_path='./exp_data')
55
-
56
-
57
- class JianguoDownloaderChrome:
58
- def __init__(self, url: str, download_path: str = "./downloads"):
59
- self.url = url
60
- self.download_path = os.path.abspath(download_path)
61
- os.makedirs(self.download_path, exist_ok=True)
62
-
63
- # Configure Chrome options
64
- self.chrome_options = Options()
65
- prefs = {
66
- "download.default_directory": self.download_path,
67
- "download.prompt_for_download": False,
68
- "download.directory_upgrade": True,
69
- "safebrowsing.enabled": True,
70
- "profile.default_content_setting_values.automatic_downloads": 1,
71
- }
72
- self.chrome_options.add_experimental_option("prefs", prefs)
73
-
74
- # Optional stability flags
75
- self.chrome_options.add_argument("--disable-gpu")
76
- self.chrome_options.add_argument("--no-sandbox")
77
- self.chrome_options.add_argument("--disable-dev-shm-usage")
78
- self.chrome_options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
79
-
80
- # Start Chrome
81
- self.driver = webdriver.Chrome(options=self.chrome_options)
82
-
83
- def open_page(self):
84
- """Open the Jianguoyun share page."""
85
- print(f"🌐 Opening link: {self.url}")
86
- self.driver.get(self.url)
87
-
88
- def click_download_button(self):
89
- """Find and click the download button."""
90
- print("🔍 Looking for the download button...")
91
- wait = WebDriverWait(self.driver, 30)
92
- span = wait.until(
93
- EC.presence_of_element_located((By.XPATH, "//span[contains(text(),'下载')]"))
94
- )
95
- parent = span.find_element(By.XPATH, "./..")
96
- self.driver.execute_script("arguments[0].click();", parent)
97
- print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")
98
-
99
- # If Jianguoyun opens a new tab, switch to it
100
- time.sleep(3)
101
- if len(self.driver.window_handles) > 1:
102
- self.driver.switch_to.window(self.driver.window_handles[-1])
103
- print("📂 Switched to download tab.")
104
-
105
- def wait_for_downloads(self, timeout=30000):
106
- """Wait until all downloads are finished."""
107
- print("⏳ Waiting for downloads to finish...")
108
- start_time = time.time()
109
- while True:
110
- downloading = [f for f in os.listdir(self.download_path) if f.endswith(".crdownload")]
111
- if not downloading:
112
- print("✅ Download completed!")
113
- return True
114
- if time.time() - start_time > timeout:
115
- print("⏰ Timeout: downloads may not have finished.")
116
- return False
117
- time.sleep(2)
118
-
119
- def get_latest_file(self):
120
- """Return the most recently downloaded file (if any)."""
121
- files = [os.path.join(self.download_path, f) for f in os.listdir(self.download_path)]
122
- return max(files, key=os.path.getctime) if files else None
123
-
124
- def close(self):
125
- """Close the browser."""
126
- self.driver.quit()
127
- print("🚪 Browser closed.")
128
-
129
- def run(self):
130
- """Run the complete download process."""
131
- print('*'*50)
132
- try:
133
- self.open_page()
134
- self.click_download_button()
135
- self.wait_for_downloads()
136
- latest = self.get_latest_file()
137
- if latest:
138
- print(f"📄 Latest downloaded file: {latest}")
139
- except Exception as e:
140
- print("❌ Error occurred:", e)
141
- finally:
142
- self.close()
143
- print('*'*50)
144
-
145
-
146
-