junshan-kit 2.1.7__py2.py3-none-any.whl → 2.1.8__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- junshan_kit/DataProcessor.py +2 -2
- junshan_kit/datahub.py +89 -13
- junshan_kit/test.py +2 -4
- junshan_kit-2.1.8.dist-info/METADATA +11 -0
- junshan_kit-2.1.8.dist-info/RECORD +7 -0
- junshan_kit-2.1.7.dist-info/METADATA +0 -31
- junshan_kit-2.1.7.dist-info/RECORD +0 -7
- {junshan_kit-2.1.7.dist-info → junshan_kit-2.1.8.dist-info}/WHEEL +0 -0
junshan_kit/DataProcessor.py
CHANGED
@@ -91,11 +91,11 @@ class CSVToPandas:
|
|
91
91
|
print('='*70)
|
92
92
|
print(f"{'Original size:':<35} {m_before} rows x {n_before} cols")
|
93
93
|
print(f"{'Size after dropping NaNs:':<35} {m_after} rows x {n_after} cols")
|
94
|
-
print(f"{'Export size (after encoding & scaling):':<35} {m_export} rows x {n_export} cols")
|
95
94
|
print(f"{'Positive samples (+1):':<35} {pos_count}")
|
96
95
|
print(f"{'Negative samples (-1):':<35} {neg_count}")
|
96
|
+
print(f"{'Export size (after encoding & scaling):':<35} {m_export} rows x {n_export} cols")
|
97
97
|
print('-'*70)
|
98
|
-
print(f"{'Note: categorical columns one-hot encoded, numerical standardized.'
|
98
|
+
print(f"{'Note: categorical columns one-hot encoded, numerical standardized.'}")
|
99
99
|
print(f"More details: https://www.jianguoyun.com/p/DU6Lr9oQqdHDDRj5sI0GIAA")
|
100
100
|
print('='*70 + '\n')
|
101
101
|
|
junshan_kit/datahub.py
CHANGED
@@ -1,8 +1,13 @@
|
|
1
1
|
import kagglehub
|
2
|
-
import os
|
2
|
+
import os, time
|
3
3
|
import warnings
|
4
4
|
import shutil
|
5
5
|
from kaggle.api.kaggle_api_extended import KaggleApi
|
6
|
+
from selenium import webdriver
|
7
|
+
from selenium.webdriver.common.by import By
|
8
|
+
from selenium.webdriver.chrome.options import Options
|
9
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
10
|
+
from selenium.webdriver.support import expected_conditions as EC
|
6
11
|
|
7
12
|
class kaggle_data:
|
8
13
|
def list_datasets(self):
|
@@ -42,20 +47,91 @@ class kaggle_data:
|
|
42
47
|
# example: read_data(copy_path='./exp_data')
|
43
48
|
|
44
49
|
|
50
|
+
class JianguoDownloader:
|
51
|
+
def __init__(self, url: str, download_path: str = "./downloads"):
|
52
|
+
self.url = url
|
53
|
+
self.download_path = os.path.abspath(download_path)
|
54
|
+
os.makedirs(self.download_path, exist_ok=True)
|
45
55
|
|
56
|
+
# Configure Chrome options
|
57
|
+
self.chrome_options = Options()
|
58
|
+
prefs = {
|
59
|
+
"download.default_directory": self.download_path,
|
60
|
+
"download.prompt_for_download": False,
|
61
|
+
"download.directory_upgrade": True,
|
62
|
+
"safebrowsing.enabled": True,
|
63
|
+
"profile.default_content_setting_values.automatic_downloads": 1,
|
64
|
+
}
|
65
|
+
self.chrome_options.add_experimental_option("prefs", prefs)
|
46
66
|
|
67
|
+
# Optional stability flags
|
68
|
+
self.chrome_options.add_argument("--disable-gpu")
|
69
|
+
self.chrome_options.add_argument("--no-sandbox")
|
70
|
+
self.chrome_options.add_argument("--disable-dev-shm-usage")
|
71
|
+
self.chrome_options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
|
47
72
|
|
48
|
-
|
49
|
-
|
50
|
-
data = kaggle_data()
|
51
|
-
# Example usage
|
52
|
-
data.list_user_datasets()
|
53
|
-
data.download_data(data_name='letter-libsvm', copy_path='./exp_data/Letter')
|
73
|
+
# Start Chrome
|
74
|
+
self.driver = webdriver.Chrome(options=self.chrome_options)
|
54
75
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
76
|
+
def open_page(self):
|
77
|
+
"""Open the Jianguoyun share page."""
|
78
|
+
print(f"π Opening link: {self.url}")
|
79
|
+
self.driver.get(self.url)
|
80
|
+
|
81
|
+
def click_download_button(self):
|
82
|
+
"""Find and click the download button."""
|
83
|
+
print("π Looking for the download button...")
|
84
|
+
wait = WebDriverWait(self.driver, 30)
|
85
|
+
span = wait.until(
|
86
|
+
EC.presence_of_element_located((By.XPATH, "//span[contains(text(),'下载')]"))
|
87
|
+
)
|
88
|
+
parent = span.find_element(By.XPATH, "./..")
|
89
|
+
self.driver.execute_script("arguments[0].click();", parent)
|
90
|
+
print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")
|
91
|
+
|
92
|
+
# If Jianguoyun opens a new tab, switch to it
|
93
|
+
time.sleep(3)
|
94
|
+
if len(self.driver.window_handles) > 1:
|
95
|
+
self.driver.switch_to.window(self.driver.window_handles[-1])
|
96
|
+
print("π Switched to download tab.")
|
97
|
+
|
98
|
+
def wait_for_downloads(self, timeout=300):
|
99
|
+
"""Wait until all downloads are finished."""
|
100
|
+
print("⏳ Waiting for downloads to finish...")
|
101
|
+
start_time = time.time()
|
102
|
+
while True:
|
103
|
+
downloading = [f for f in os.listdir(self.download_path) if f.endswith(".crdownload")]
|
104
|
+
if not downloading:
|
105
|
+
print("✅ Download completed!")
|
106
|
+
return True
|
107
|
+
if time.time() - start_time > timeout:
|
108
|
+
print("⏰ Timeout: downloads may not have finished.")
|
109
|
+
return False
|
110
|
+
time.sleep(2)
|
111
|
+
|
112
|
+
def get_latest_file(self):
|
113
|
+
"""Return the most recently downloaded file (if any)."""
|
114
|
+
files = [os.path.join(self.download_path, f) for f in os.listdir(self.download_path)]
|
115
|
+
return max(files, key=os.path.getctime) if files else None
|
116
|
+
|
117
|
+
def close(self):
|
118
|
+
"""Close the browser."""
|
119
|
+
self.driver.quit()
|
120
|
+
print("🚪 Browser closed.")
|
121
|
+
|
122
|
+
def run(self):
|
123
|
+
"""Run the complete download process."""
|
124
|
+
print('*'*50)
|
125
|
+
try:
|
126
|
+
self.open_page()
|
127
|
+
self.click_download_button()
|
128
|
+
self.wait_for_downloads()
|
129
|
+
latest = self.get_latest_file()
|
130
|
+
if latest:
|
131
|
+
print(f"π Latest downloaded file: {latest}")
|
132
|
+
except Exception as e:
|
133
|
+
print("❌ Error occurred:", e)
|
134
|
+
finally:
|
135
|
+
self.close()
|
136
|
+
print('*'*50)
|
61
137
|
|
junshan_kit/test.py
CHANGED
@@ -0,0 +1,11 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: junshan_kit
|
3
|
+
Version: 2.1.8
|
4
|
+
Summary: This is an optimization tool.
|
5
|
+
Author-email: Junshan Yin <junshanyin@163.com>
|
6
|
+
Requires-Dist: kaggle==1.7.4.5
|
7
|
+
Requires-Dist: kagglehub==0.3.13
|
8
|
+
Requires-Dist: numpy==2.2.6
|
9
|
+
Requires-Dist: pandas==2.3.3
|
10
|
+
Requires-Dist: scikit-learn==1.7.1
|
11
|
+
Requires-Dist: selenium==4.36.0
|
@@ -0,0 +1,7 @@
|
|
1
|
+
junshan_kit/DataProcessor.py,sha256=AW_1jROexC3s41-RgzqzYVwPI0sOf3tzjiph4qa_Vcw,3882
|
2
|
+
junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
junshan_kit/datahub.py,sha256=mofbkp8ry6_LM_vW1LcZolp5tfkqOp_cUiwjfDFbRqI,5153
|
4
|
+
junshan_kit/test.py,sha256=uSckjcr_Wgj__YPTwD6x0GY8Hfn5GBEXIpRf9vIYBbU,91
|
5
|
+
junshan_kit-2.1.8.dist-info/METADATA,sha256=eFQmrVEUORZRhZqBCOlctfSU3vwCQ2RB4Jpyj1coAmE,329
|
6
|
+
junshan_kit-2.1.8.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
|
7
|
+
junshan_kit-2.1.8.dist-info/RECORD,,
|
@@ -1,31 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: junshan_kit
|
3
|
-
Version: 2.1.7
|
4
|
-
Summary: This is an optimization tool.
|
5
|
-
Author-email: Junshan Yin <junshanyin@163.com>
|
6
|
-
Requires-Dist: kaggle==1.7.4.5
|
7
|
-
Requires-Dist: kagglehub==0.3.13
|
8
|
-
Requires-Dist: numpy==2.2.6
|
9
|
-
Requires-Dist: pandas==2.3.3
|
10
|
-
Requires-Dist: scikit-learn==1.7.1
|
11
|
-
Description-Content-Type: text/markdown
|
12
|
-
|
13
|
-
- For class kaggle_data in datahub
|
14
|
-
- We need to set API of kaggle.
|
15
|
-
|
16
|
-
```python
|
17
|
-
import junshan_kit.datahub
|
18
|
-
data = junshan_kit.datahub.kaggle_data()
|
19
|
-
data.list_user_datasets()
|
20
|
-
data.read_data(data_name='letter-libsvm', copy_path='./exp_data/Letter')
|
21
|
-
```
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
@@ -1,7 +0,0 @@
|
|
1
|
-
junshan_kit/DataProcessor.py,sha256=rp_w325h8EvKcLMSa12w5B-UA8G75O1qP0ogE6GDSE0,3886
|
2
|
-
junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
junshan_kit/datahub.py,sha256=BWcG_TPW1xf_y_GzxRXanuOAB01WugBiO5r53EDbr8s,1815
|
4
|
-
junshan_kit/test.py,sha256=jyZQPgX40HlLM23vGMbuZFwFBk7YiFqzzh9xuOTzbw8,91
|
5
|
-
junshan_kit-2.1.7.dist-info/METADATA,sha256=ePQG7bT7y7yVU7iSI3CxnfNacwJLyAlSB7nEmAG3_NM,599
|
6
|
-
junshan_kit-2.1.7.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
|
7
|
-
junshan_kit-2.1.7.dist-info/RECORD,,
|
File without changes
|