junshan-kit 2.2.3__py2.py3-none-any.whl → 2.2.5__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- junshan_kit/DataProcessor.py +74 -144
- junshan_kit/DataSets.py +3 -0
- junshan_kit/kit.py +208 -1
- {junshan_kit-2.2.3.dist-info → junshan_kit-2.2.5.dist-info}/METADATA +1 -1
- junshan_kit-2.2.5.dist-info/RECORD +7 -0
- junshan_kit/datahub.py +0 -146
- junshan_kit/meta.py +0 -256
- junshan_kit/test.py +0 -8
- junshan_kit-2.2.3.dist-info/RECORD +0 -9
- {junshan_kit-2.2.3.dist-info → junshan_kit-2.2.5.dist-info}/WHEEL +0 -0
junshan_kit/DataProcessor.py
CHANGED
@@ -1,146 +1,68 @@
|
|
1
1
|
"""
|
2
2
|
----------------------------------------------------------------------
|
3
|
-
>>> Author : Junshan Yin
|
3
|
+
>>> Author : Junshan Yin
|
4
4
|
>>> Last Updated : 2025-10-12
|
5
5
|
----------------------------------------------------------------------
|
6
6
|
"""
|
7
7
|
|
8
8
|
import pandas as pd
|
9
|
-
import os
|
10
|
-
from sklearn.preprocessing import StandardScaler
|
11
|
-
import junshan_kit.datahub
|
12
|
-
import zipfile
|
13
|
-
|
14
|
-
class CSVToPandasMeta:
|
15
|
-
def __init__(self):
|
16
|
-
self.data_downloader = junshan_kit.datahub.kaggle_data()
|
17
|
-
|
18
|
-
|
19
|
-
def read_csv(self, data_name):
|
20
|
-
self.csv_path = f'exp_data/{data_name}/{data_name}.csv'
|
21
|
-
if not os.path.exists(self.csv_path):
|
22
|
-
self.data_downloader.download_data(f'{data_name}', f'exp_data/{data_name}')
|
23
|
-
|
24
|
-
# ----------------- ccfd_kaggle ----------------------------------
|
25
|
-
def ccfd_kaggle(self, data_name = 'ccfd-kaggle', show_info = True):
|
26
|
-
# download data if not exist
|
27
|
-
self.read_csv(data_name)
|
28
|
-
|
29
|
-
df = pd.read_csv(self.csv_path)
|
30
|
-
m_before, n_before = df.shape
|
31
|
-
df = df.dropna(axis=0, how='any')
|
32
|
-
m_after, n_after = df.shape
|
33
|
-
df['Class'] = df['Class'].replace(0, -1)
|
34
|
-
|
35
|
-
if show_info:
|
36
|
-
pos_count = (df['Class'] == 1).sum()
|
37
|
-
neg_count = (df['Class'] == -1).sum()
|
38
|
-
|
39
|
-
print('\n' + '='*60)
|
40
|
-
print(f"{'CCFD-Kaggle Dataset Info':^60}")
|
41
|
-
print('='*60)
|
42
|
-
print(f"{'Original size:':<25} {m_before} rows x {n_before} cols")
|
43
|
-
print(f"{'Size after dropping NaNs:':<25} {m_after} rows x {n_after} cols")
|
44
|
-
print(f"{'Positive samples (+1):':<25} {pos_count}")
|
45
|
-
print(f"{'Negative samples (-1):':<25} {neg_count}")
|
46
|
-
print(f"{'Export size:':<25} {m_after} rows x {n_after} cols")
|
47
|
-
print('-'*60)
|
48
|
-
print(f"More details: https://www.jianguoyun.com/p/Dd1clVgQ4ZThCxiwzZQGIAA")
|
49
|
-
print('='*60 + '\n')
|
50
|
-
|
51
|
-
return df
|
52
|
-
|
53
|
-
# ------------------------
|
54
|
-
def ghpdd_kaggle(self, data_name='ghpdd-kaggle', show_info=True):
|
55
|
-
# download data if not exist
|
56
|
-
self.read_csv(data_name)
|
57
|
-
|
58
|
-
# read csv
|
59
|
-
df = pd.read_csv(self.csv_path)
|
60
|
-
m_before, n_before = df.shape
|
61
|
-
|
62
|
-
# drop NaNs
|
63
|
-
df = df.dropna(axis=0, how='any')
|
64
|
-
m_after, n_after = df.shape
|
65
|
-
|
66
|
-
# drop unique identifier
|
67
|
-
if 'property_id' in df.columns:
|
68
|
-
df.drop(columns=['property_id'], inplace=True)
|
69
|
-
|
70
|
-
# Replace label 0 with -1
|
71
|
-
df['decision'] = df['decision'].replace(0, -1)
|
72
|
-
|
73
|
-
# Identify categorical and numerical columns
|
74
|
-
cat_cols = ['country', 'city', 'property_type', 'furnishing_status']
|
75
|
-
num_cols = [col for col in df.columns if col not in cat_cols + ['decision']]
|
76
|
-
|
77
|
-
# One-Hot encode categorical columns
|
78
|
-
df = pd.get_dummies(df, columns=cat_cols)
|
79
|
-
|
80
|
-
# Convert boolean columns to int
|
81
|
-
bool_cols = df.select_dtypes(include='bool').columns
|
82
|
-
for col in bool_cols:
|
83
|
-
df[col] = df[col].astype(int)
|
84
|
-
|
85
|
-
# Standardize numerical columns
|
86
|
-
scaler = StandardScaler()
|
87
|
-
df[num_cols] = scaler.fit_transform(df[num_cols])
|
88
|
-
|
89
|
-
# The size after export
|
90
|
-
m_export, n_export = df.shape
|
91
|
-
|
92
|
-
if show_info:
|
93
|
-
pos_count = (df['decision'] == 1).sum()
|
94
|
-
neg_count = (df['decision'] == -1).sum()
|
95
|
-
|
96
|
-
print('\n' + '='*70)
|
97
|
-
print(f"{'GHPDD-Kaggle Dataset Info':^70}")
|
98
|
-
print('='*70)
|
99
|
-
print(f"{'Original size:':<35} {m_before} rows x {n_before} cols")
|
100
|
-
print(f"{'Size after dropping NaNs:':<35} {m_after} rows x {n_after} cols")
|
101
|
-
print(f"{'Positive samples (+1):':<35} {pos_count}")
|
102
|
-
print(f"{'Negative samples (-1):':<35} {neg_count}")
|
103
|
-
print(f"{'Export size (after encoding & scaling):':<35} {m_export} rows x {n_export} cols")
|
104
|
-
print('-'*70)
|
105
|
-
print(f"{'Note: categorical columns one-hot encoded, numerical standardized.'}")
|
106
|
-
print(f"More details: https://www.jianguoyun.com/p/DU6Lr9oQqdHDDRj5sI0GIAA")
|
107
|
-
print('='*70 + '\n')
|
108
|
-
|
109
|
-
return df
|
110
|
-
|
111
9
|
|
112
10
|
|
113
11
|
class CSV_TO_Pandas:
|
114
12
|
def __init__(self):
|
115
13
|
pass
|
116
14
|
|
117
|
-
def
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
Examples:
|
126
|
-
>>> zip_path = "./downloads/data.zip"
|
127
|
-
>>> unzip_folder = "./exp_data/data"
|
128
|
-
>>> unzip_file(zip_path, unzip_folder)
|
15
|
+
def preprocess_dataset(
|
16
|
+
self,
|
17
|
+
csv_path,
|
18
|
+
drop_cols: list,
|
19
|
+
label_col: str,
|
20
|
+
label_map: dict,
|
21
|
+
print_info=False,
|
22
|
+
):
|
129
23
|
"""
|
24
|
+
Preprocess a CSV dataset by performing data cleaning, label mapping, and feature encoding.
|
130
25
|
|
131
|
-
|
132
|
-
|
26
|
+
This function loads a dataset from a CSV file, removes specified non-feature columns,
|
27
|
+
drops rows with missing values, maps the target label to numerical values, and
|
28
|
+
one-hot encodes categorical features. Optionally, it can print dataset statistics
|
29
|
+
before and after preprocessing.
|
133
30
|
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
31
|
+
Args:
|
32
|
+
csv_path (str):
|
33
|
+
Path to the input CSV dataset.
|
34
|
+
drop_cols (list):
|
35
|
+
List of column names to drop from the dataset.
|
36
|
+
label_col (str):
|
37
|
+
Name of the target label column.
|
38
|
+
label_map (dict):
|
39
|
+
Mapping dictionary for label conversion (e.g., {"yes": 1, "no": -1}).
|
40
|
+
print_info (bool, optional):
|
41
|
+
Whether to print preprocessing information and dataset statistics.
|
42
|
+
Defaults to False.
|
43
|
+
|
44
|
+
Returns:
|
45
|
+
pandas.DataFrame:
|
46
|
+
The cleaned and preprocessed dataset ready for model input.
|
47
|
+
|
48
|
+
Steps:
|
49
|
+
1. Load the dataset from CSV.
|
50
|
+
2. Drop non-informative or irrelevant columns.
|
51
|
+
3. Remove rows with missing values.
|
52
|
+
4. Map label column to numerical values according to `label_map`.
|
53
|
+
5. One-hot encode categorical (non-label) text features.
|
54
|
+
6. Optionally print dataset information and summary statistics.
|
55
|
+
|
56
|
+
Example:
|
57
|
+
>>> label_map = {"positive": 1, "negative": -1}
|
58
|
+
>>> df = data_handler.preprocess_dataset(
|
59
|
+
... csv_path="data/raw.csv",
|
60
|
+
... drop_cols=["id", "timestamp"],
|
61
|
+
... label_col="sentiment",
|
62
|
+
... label_map=label_map,
|
63
|
+
... print_info=True
|
64
|
+
... )
|
65
|
+
"""
|
144
66
|
# Step 0: Load the dataset
|
145
67
|
df = pd.read_csv(csv_path)
|
146
68
|
|
@@ -151,15 +73,19 @@ class CSV_TO_Pandas:
|
|
151
73
|
df = df.drop(columns=drop_cols)
|
152
74
|
|
153
75
|
# Step 2: Remove rows with missing values
|
154
|
-
df = df.dropna(axis=0, how=
|
76
|
+
df = df.dropna(axis=0, how="any")
|
155
77
|
m_encoded, n_encoded = df.shape
|
156
|
-
|
157
|
-
# Step 3: Map target label to -1 and +1
|
78
|
+
|
79
|
+
# Step 3: Map target label (to -1 and +1)
|
158
80
|
df[label_col] = df[label_col].map(label_map)
|
159
81
|
|
160
82
|
# Step 4: Encode categorical features (exclude label column)
|
161
|
-
text_feature_cols = df.select_dtypes(
|
162
|
-
|
83
|
+
text_feature_cols = df.select_dtypes(
|
84
|
+
include=["object", "string", "category"]
|
85
|
+
).columns
|
86
|
+
text_feature_cols = [
|
87
|
+
col for col in text_feature_cols if col != label_col
|
88
|
+
] # ✅ exclude label
|
163
89
|
|
164
90
|
df = pd.get_dummies(df, columns=text_feature_cols, dtype=int)
|
165
91
|
m_cleaned, n_cleaned = df.shape
|
@@ -170,23 +96,27 @@ class CSV_TO_Pandas:
|
|
170
96
|
neg_count = (df[label_col] == -1).sum()
|
171
97
|
|
172
98
|
# Step 6: Print dataset information
|
173
|
-
print(
|
99
|
+
print("\n" + "=" * 80)
|
174
100
|
print(f"{'Dataset Info':^70}")
|
175
|
-
print(
|
101
|
+
print("=" * 80)
|
176
102
|
print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
|
177
|
-
print(
|
103
|
+
print(
|
104
|
+
f"{'Size after dropping NaN & non-feature cols:':<40} {m_cleaned} rows x {n_cleaned} cols"
|
105
|
+
)
|
178
106
|
print(f"{'Positive samples (+1):':<40} {pos_count}")
|
179
107
|
print(f"{'Negative samples (-1):':<40} {neg_count}")
|
180
|
-
print(
|
181
|
-
|
108
|
+
print(
|
109
|
+
f"{'Size after one-hot encoding:':<40} {m_encoded} rows x {n_encoded} cols"
|
110
|
+
)
|
111
|
+
print("-" * 80)
|
182
112
|
print(f"Note:")
|
183
113
|
print(f"{'Label column:':<40} {label_col}")
|
184
|
-
print(
|
185
|
-
|
186
|
-
|
114
|
+
print(
|
115
|
+
f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
|
116
|
+
)
|
117
|
+
print(
|
118
|
+
f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
|
119
|
+
)
|
120
|
+
print("=" * 80 + "\n")
|
187
121
|
|
188
122
|
return df
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
junshan_kit/DataSets.py
ADDED
junshan_kit/kit.py
CHANGED
@@ -6,7 +6,12 @@
|
|
6
6
|
"""
|
7
7
|
|
8
8
|
import zipfile
|
9
|
-
import os
|
9
|
+
import os, time
|
10
|
+
|
11
|
+
from selenium import webdriver
|
12
|
+
from selenium.webdriver.common.by import By
|
13
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
14
|
+
from selenium.webdriver.support import expected_conditions as EC
|
10
15
|
|
11
16
|
def unzip_file(zip_path: str, unzip_folder: str):
|
12
17
|
"""
|
@@ -33,3 +38,205 @@ def unzip_file(zip_path: str, unzip_folder: str):
|
|
33
38
|
print(f"✅ Extracted '{zip_path}' to '{os.path.abspath(unzip_folder)}'")
|
34
39
|
|
35
40
|
|
41
|
+
# =============================================================
|
42
|
+
# JIANGUOYUN (NUTSTORE) CHROME VERSION
|
43
|
+
# =============================================================
|
44
|
+
|
45
|
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
46
|
+
class JianguoyunDownloaderChrome:
|
47
|
+
""" Example:
|
48
|
+
>>> url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
|
49
|
+
>>> downloader = JianguoyunDownloaderChrome(url)
|
50
|
+
>>> downloader.run()
|
51
|
+
"""
|
52
|
+
def __init__(self, url, download_path="./downloads"):
|
53
|
+
self.url = url
|
54
|
+
self.download_path = os.path.abspath(download_path)
|
55
|
+
os.makedirs(self.download_path, exist_ok=True)
|
56
|
+
|
57
|
+
self.chrome_options = ChromeOptions()
|
58
|
+
prefs = {
|
59
|
+
"download.default_directory": self.download_path,
|
60
|
+
"download.prompt_for_download": False,
|
61
|
+
"download.directory_upgrade": True,
|
62
|
+
"safebrowsing.enabled": True,
|
63
|
+
"profile.default_content_setting_values.automatic_downloads": 1,
|
64
|
+
}
|
65
|
+
self.chrome_options.add_experimental_option("prefs", prefs)
|
66
|
+
self.chrome_options.add_argument("--disable-gpu")
|
67
|
+
self.chrome_options.add_argument("--no-sandbox")
|
68
|
+
self.chrome_options.add_argument("--disable-dev-shm-usage")
|
69
|
+
self.chrome_options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
|
70
|
+
# Uncomment for headless mode:
|
71
|
+
# self.chrome_options.add_argument("--headless")
|
72
|
+
|
73
|
+
self.driver = webdriver.Chrome(options=self.chrome_options)
|
74
|
+
|
75
|
+
def open_page(self):
|
76
|
+
print(f"🌐 Opening page: {self.url}")
|
77
|
+
self.driver.get(self.url)
|
78
|
+
print(f"✅ Page loaded: {self.driver.title}")
|
79
|
+
|
80
|
+
def click_download_button(self):
|
81
|
+
"""Find and click the 'Download' button (supports English and Chinese)."""
|
82
|
+
print("🔍 Searching for the download button...")
|
83
|
+
wait = WebDriverWait(self.driver, 30)
|
84
|
+
|
85
|
+
try:
|
86
|
+
# Match both English 'Download' (case-insensitive) and Chinese '下载'
|
87
|
+
xpath = (
|
88
|
+
"//span[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
|
89
|
+
" | //button[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
|
90
|
+
" | //a[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
|
91
|
+
" | //span[contains(text(),'下载')]"
|
92
|
+
" | //button[contains(text(),'下载')]"
|
93
|
+
" | //a[contains(text(),'下载')]"
|
94
|
+
)
|
95
|
+
|
96
|
+
button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
|
97
|
+
|
98
|
+
# Click using JavaScript to avoid overlay or interaction issues
|
99
|
+
self.driver.execute_script("arguments[0].click();", button)
|
100
|
+
print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")
|
101
|
+
|
102
|
+
# If the cloud service opens a new tab, switch to it
|
103
|
+
time.sleep(3)
|
104
|
+
if len(self.driver.window_handles) > 1:
|
105
|
+
self.driver.switch_to.window(self.driver.window_handles[-1])
|
106
|
+
print("📂 Switched to the new download tab.")
|
107
|
+
|
108
|
+
except Exception as e:
|
109
|
+
print("❌ Failed to find or click the download button:", e)
|
110
|
+
raise
|
111
|
+
|
112
|
+
|
113
|
+
def wait_for_downloads(self, timeout=3600):
|
114
|
+
print("⏳ Waiting for downloads to finish...")
|
115
|
+
start_time = time.time()
|
116
|
+
while time.time() - start_time < timeout:
|
117
|
+
downloading = [f for f in os.listdir(self.download_path) if f.endswith(".crdownload")]
|
118
|
+
if not downloading:
|
119
|
+
print("✅ Download completed!")
|
120
|
+
return
|
121
|
+
time.sleep(2)
|
122
|
+
print("⚠️ Timeout: download not completed within 1 hour")
|
123
|
+
|
124
|
+
def close(self):
|
125
|
+
self.driver.quit()
|
126
|
+
print("🚪 Browser closed.")
|
127
|
+
|
128
|
+
def run(self):
|
129
|
+
print('*' * 60)
|
130
|
+
try:
|
131
|
+
self.open_page()
|
132
|
+
self.click_download_button()
|
133
|
+
self.wait_for_downloads()
|
134
|
+
except Exception as e:
|
135
|
+
print("❌ Error:", e)
|
136
|
+
finally:
|
137
|
+
self.close()
|
138
|
+
print('*' * 60)
|
139
|
+
|
140
|
+
|
141
|
+
# =============================================================
|
142
|
+
# JIANGUOYUN (NUTSTORE) FIREFOX VERSION
|
143
|
+
# =============================================================
|
144
|
+
|
145
|
+
from selenium.webdriver.firefox.options import Options as FirefoxOptions
|
146
|
+
from selenium.webdriver.firefox.service import Service
|
147
|
+
|
148
|
+
class JianguoyunDownloaderFirefox:
|
149
|
+
""" Example:
|
150
|
+
>>> url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
|
151
|
+
>>> downloader = JianguoyunDownloaderFirefox(url)
|
152
|
+
>>> downloader.run()
|
153
|
+
"""
|
154
|
+
def __init__(self, url, download_path="./downloads"):
|
155
|
+
self.url = url
|
156
|
+
self.download_path = os.path.abspath(download_path)
|
157
|
+
os.makedirs(self.download_path, exist_ok=True)
|
158
|
+
|
159
|
+
options = FirefoxOptions()
|
160
|
+
options.add_argument("--headless")
|
161
|
+
options.set_preference("browser.download.folderList", 2)
|
162
|
+
options.set_preference("browser.download.manager.showWhenStarting", False)
|
163
|
+
options.set_preference("browser.download.dir", self.download_path)
|
164
|
+
options.set_preference("browser.helperApps.neverAsk.saveToDisk",
|
165
|
+
"application/zip,application/octet-stream,application/x-zip-compressed,multipart/x-zip")
|
166
|
+
options.set_preference("pdfjs.disabled", True)
|
167
|
+
|
168
|
+
service = Service("/snap/bin/geckodriver")
|
169
|
+
self.driver = webdriver.Firefox(service=service, options=options)
|
170
|
+
|
171
|
+
def open_page(self):
|
172
|
+
print(f"🌐 Opening page: {self.url}")
|
173
|
+
self.driver.get(self.url)
|
174
|
+
print(f"✅ Page loaded: {self.driver.title}")
|
175
|
+
|
176
|
+
def click_download_button(self):
|
177
|
+
"""Find and click the 'Download' button (supports English and Chinese)."""
|
178
|
+
print("🔍 Searching for the download button...")
|
179
|
+
wait = WebDriverWait(self.driver, 30)
|
180
|
+
|
181
|
+
try:
|
182
|
+
# Match both English 'Download' (case-insensitive) and Chinese '下载'
|
183
|
+
xpath = (
|
184
|
+
"//span[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
|
185
|
+
" | //button[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
|
186
|
+
" | //a[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
|
187
|
+
" | //span[contains(text(),'下载')]"
|
188
|
+
" | //button[contains(text(),'下载')]"
|
189
|
+
" | //a[contains(text(),'下载')]"
|
190
|
+
)
|
191
|
+
|
192
|
+
button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
|
193
|
+
|
194
|
+
# Click using JavaScript to avoid overlay or interaction issues
|
195
|
+
self.driver.execute_script("arguments[0].click();", button)
|
196
|
+
print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")
|
197
|
+
|
198
|
+
# If the cloud service opens a new tab, switch to it
|
199
|
+
time.sleep(3)
|
200
|
+
if len(self.driver.window_handles) > 1:
|
201
|
+
self.driver.switch_to.window(self.driver.window_handles[-1])
|
202
|
+
print("📂 Switched to the new download tab.")
|
203
|
+
|
204
|
+
except Exception as e:
|
205
|
+
print("❌ Failed to find or click the download button:", e)
|
206
|
+
raise
|
207
|
+
|
208
|
+
def wait_for_download(self, timeout=3600):
|
209
|
+
"""Wait until all downloads are finished (auto-detects browser type)."""
|
210
|
+
print("⏳ Waiting for downloads to finish...")
|
211
|
+
start_time = time.time()
|
212
|
+
|
213
|
+
# Determine the temporary file extension based on the browser type
|
214
|
+
temp_ext = ".crdownload" if "chrome" in self.driver.capabilities["browserName"].lower() else ".part"
|
215
|
+
|
216
|
+
while time.time() - start_time < timeout:
|
217
|
+
downloading = [f for f in os.listdir(self.download_path) if f.endswith(temp_ext)]
|
218
|
+
if not downloading:
|
219
|
+
print("✅ Download completed!")
|
220
|
+
return True
|
221
|
+
time.sleep(2)
|
222
|
+
|
223
|
+
|
224
|
+
def close(self):
|
225
|
+
print("🛑 Closing browser...")
|
226
|
+
self.driver.quit()
|
227
|
+
|
228
|
+
def run(self):
|
229
|
+
print('*' * 60)
|
230
|
+
try:
|
231
|
+
self.open_page()
|
232
|
+
self.click_download_button()
|
233
|
+
self.wait_for_download(timeout=3600)
|
234
|
+
except Exception as e:
|
235
|
+
print("❌ Error:", e)
|
236
|
+
finally:
|
237
|
+
self.close()
|
238
|
+
print('*' * 60)
|
239
|
+
|
240
|
+
|
241
|
+
|
242
|
+
|
@@ -0,0 +1,7 @@
|
|
1
|
+
junshan_kit/DataProcessor.py,sha256=eryVmS5BFZj8wjDN2QWVHqkbFgFuWU0HXV9s6TGf9QM,4442
|
2
|
+
junshan_kit/DataSets.py,sha256=ajz1GSNU9xYVrFEDSz6Xwg7amWQ_yvW75tQa1ZvRIWc,3
|
3
|
+
junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
+
junshan_kit/kit.py,sha256=FaQT0H7FLKUuTNgU0zfU7Qn3MsjeW6C_rsrB_UOEVJ4,9571
|
5
|
+
junshan_kit-2.2.5.dist-info/METADATA,sha256=6eUE_T57eUMtYiE958tpJp7glbX2qXyMObOhW_n0INo,329
|
6
|
+
junshan_kit-2.2.5.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
|
7
|
+
junshan_kit-2.2.5.dist-info/RECORD,,
|
junshan_kit/datahub.py
DELETED
@@ -1,146 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
----------------------------------------------------------------------
|
3
|
-
>>> Author : Junshan Yin
|
4
|
-
>>> Last Updated : 2025-10-12
|
5
|
-
----------------------------------------------------------------------
|
6
|
-
"""
|
7
|
-
|
8
|
-
import kagglehub
|
9
|
-
import os, time
|
10
|
-
import warnings
|
11
|
-
import shutil
|
12
|
-
from kaggle.api.kaggle_api_extended import KaggleApi
|
13
|
-
from selenium import webdriver
|
14
|
-
from selenium.webdriver.common.by import By
|
15
|
-
from selenium.webdriver.chrome.options import Options
|
16
|
-
from selenium.webdriver.support.ui import WebDriverWait
|
17
|
-
from selenium.webdriver.support import expected_conditions as EC
|
18
|
-
|
19
|
-
class kaggle_data:
|
20
|
-
def list_datasets(self):
|
21
|
-
api = KaggleApi()
|
22
|
-
api.authenticate()
|
23
|
-
datasets = api.dataset_list(user='junshan888')
|
24
|
-
print('Available datasets:')
|
25
|
-
print('*' * 60)
|
26
|
-
if datasets is not None:
|
27
|
-
for ds in datasets:
|
28
|
-
if ds is not None:
|
29
|
-
print(ds.title)
|
30
|
-
print('*' * 60)
|
31
|
-
|
32
|
-
def list_user_datasets(self):
|
33
|
-
warnings.warn(
|
34
|
-
"list_user_datasets() is deprecated. Use list_datasets() instead.",
|
35
|
-
DeprecationWarning,
|
36
|
-
stacklevel=2
|
37
|
-
)
|
38
|
-
return self.list_datasets()
|
39
|
-
|
40
|
-
# example: list_user_datasets()
|
41
|
-
|
42
|
-
#---------------------------------------------------------------
|
43
|
-
def download_data(self, data_name = 'letter-libsvm', copy_path = None):
|
44
|
-
path = kagglehub.dataset_download(f'junshan888/{data_name}')
|
45
|
-
# print("Downloaded to:", path)
|
46
|
-
if copy_path is not None:
|
47
|
-
# Create target directory if it doesn't exist
|
48
|
-
os.makedirs(copy_path, exist_ok=True)
|
49
|
-
# Copy dataset to target directory
|
50
|
-
shutil.copytree(path, copy_path, dirs_exist_ok=True)
|
51
|
-
|
52
|
-
print(f"✅ Dataset has been copied to: {copy_path}")
|
53
|
-
|
54
|
-
# example: read_data(copy_path='./exp_data')
|
55
|
-
|
56
|
-
|
57
|
-
class JianguoDownloaderChrome:
|
58
|
-
def __init__(self, url: str, download_path: str = "./downloads"):
|
59
|
-
self.url = url
|
60
|
-
self.download_path = os.path.abspath(download_path)
|
61
|
-
os.makedirs(self.download_path, exist_ok=True)
|
62
|
-
|
63
|
-
# Configure Chrome options
|
64
|
-
self.chrome_options = Options()
|
65
|
-
prefs = {
|
66
|
-
"download.default_directory": self.download_path,
|
67
|
-
"download.prompt_for_download": False,
|
68
|
-
"download.directory_upgrade": True,
|
69
|
-
"safebrowsing.enabled": True,
|
70
|
-
"profile.default_content_setting_values.automatic_downloads": 1,
|
71
|
-
}
|
72
|
-
self.chrome_options.add_experimental_option("prefs", prefs)
|
73
|
-
|
74
|
-
# Optional stability flags
|
75
|
-
self.chrome_options.add_argument("--disable-gpu")
|
76
|
-
self.chrome_options.add_argument("--no-sandbox")
|
77
|
-
self.chrome_options.add_argument("--disable-dev-shm-usage")
|
78
|
-
self.chrome_options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
|
79
|
-
|
80
|
-
# Start Chrome
|
81
|
-
self.driver = webdriver.Chrome(options=self.chrome_options)
|
82
|
-
|
83
|
-
def open_page(self):
|
84
|
-
"""Open the Jianguoyun share page."""
|
85
|
-
print(f"🌐 Opening link: {self.url}")
|
86
|
-
self.driver.get(self.url)
|
87
|
-
|
88
|
-
def click_download_button(self):
|
89
|
-
"""Find and click the download button."""
|
90
|
-
print("🔍 Looking for the download button...")
|
91
|
-
wait = WebDriverWait(self.driver, 30)
|
92
|
-
span = wait.until(
|
93
|
-
EC.presence_of_element_located((By.XPATH, "//span[contains(text(),'下载')]"))
|
94
|
-
)
|
95
|
-
parent = span.find_element(By.XPATH, "./..")
|
96
|
-
self.driver.execute_script("arguments[0].click();", parent)
|
97
|
-
print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")
|
98
|
-
|
99
|
-
# If Jianguoyun opens a new tab, switch to it
|
100
|
-
time.sleep(3)
|
101
|
-
if len(self.driver.window_handles) > 1:
|
102
|
-
self.driver.switch_to.window(self.driver.window_handles[-1])
|
103
|
-
print("📂 Switched to download tab.")
|
104
|
-
|
105
|
-
def wait_for_downloads(self, timeout=30000):
|
106
|
-
"""Wait until all downloads are finished."""
|
107
|
-
print("⏳ Waiting for downloads to finish...")
|
108
|
-
start_time = time.time()
|
109
|
-
while True:
|
110
|
-
downloading = [f for f in os.listdir(self.download_path) if f.endswith(".crdownload")]
|
111
|
-
if not downloading:
|
112
|
-
print("✅ Download completed!")
|
113
|
-
return True
|
114
|
-
if time.time() - start_time > timeout:
|
115
|
-
print("⏰ Timeout: downloads may not have finished.")
|
116
|
-
return False
|
117
|
-
time.sleep(2)
|
118
|
-
|
119
|
-
def get_latest_file(self):
|
120
|
-
"""Return the most recently downloaded file (if any)."""
|
121
|
-
files = [os.path.join(self.download_path, f) for f in os.listdir(self.download_path)]
|
122
|
-
return max(files, key=os.path.getctime) if files else None
|
123
|
-
|
124
|
-
def close(self):
|
125
|
-
"""Close the browser."""
|
126
|
-
self.driver.quit()
|
127
|
-
print("🚪 Browser closed.")
|
128
|
-
|
129
|
-
def run(self):
|
130
|
-
"""Run the complete download process."""
|
131
|
-
print('*'*50)
|
132
|
-
try:
|
133
|
-
self.open_page()
|
134
|
-
self.click_download_button()
|
135
|
-
self.wait_for_downloads()
|
136
|
-
latest = self.get_latest_file()
|
137
|
-
if latest:
|
138
|
-
print(f"📄 Latest downloaded file: {latest}")
|
139
|
-
except Exception as e:
|
140
|
-
print("❌ Error occurred:", e)
|
141
|
-
finally:
|
142
|
-
self.close()
|
143
|
-
print('*'*50)
|
144
|
-
|
145
|
-
|
146
|
-
|
junshan_kit/meta.py
DELETED
@@ -1,256 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
----------------------------------------------------------------------
|
3
|
-
>>> Author : Junshan Yin
|
4
|
-
>>> Last Updated : 2025-10-12
|
5
|
-
----------------------------------------------------------------------
|
6
|
-
"""
|
7
|
-
|
8
|
-
import os
|
9
|
-
import time
|
10
|
-
import shutil
|
11
|
-
import warnings
|
12
|
-
import kagglehub
|
13
|
-
from kaggle.api.kaggle_api_extended import KaggleApi
|
14
|
-
|
15
|
-
from selenium import webdriver
|
16
|
-
from selenium.webdriver.common.by import By
|
17
|
-
from selenium.webdriver.support.ui import WebDriverWait
|
18
|
-
from selenium.webdriver.support import expected_conditions as EC
|
19
|
-
|
20
|
-
|
21
|
-
# =============================================================
|
22
|
-
# KAGGLE DATA MANAGEMENT
|
23
|
-
# =============================================================
|
24
|
-
|
25
|
-
class KaggleData:
|
26
|
-
def list_datasets(self):
|
27
|
-
"""
|
28
|
-
List available datasets from a specific user.
|
29
|
-
"""
|
30
|
-
api = KaggleApi()
|
31
|
-
api.authenticate()
|
32
|
-
datasets = api.dataset_list(user='junshan888')
|
33
|
-
print('Available datasets:')
|
34
|
-
print('*' * 60)
|
35
|
-
if datasets:
|
36
|
-
for ds in datasets:
|
37
|
-
print(ds.title) # type: ignore
|
38
|
-
print('*' * 60)
|
39
|
-
|
40
|
-
def list_user_datasets(self):
|
41
|
-
warnings.warn(
|
42
|
-
"list_user_datasets() is deprecated. Use list_datasets() instead.",
|
43
|
-
DeprecationWarning,
|
44
|
-
stacklevel=2
|
45
|
-
)
|
46
|
-
return self.list_datasets()
|
47
|
-
|
48
|
-
def download_data(self, data_name='letter-libsvm', copy_path=None):
|
49
|
-
"""Download a Kaggle dataset and optionally copy it to a target folder."""
|
50
|
-
path = kagglehub.dataset_download(f'junshan888/{data_name}')
|
51
|
-
if copy_path:
|
52
|
-
os.makedirs(copy_path, exist_ok=True)
|
53
|
-
shutil.copytree(path, copy_path, dirs_exist_ok=True)
|
54
|
-
print(f"✅ Dataset copied to: {copy_path}")
|
55
|
-
|
56
|
-
|
57
|
-
# =============================================================
|
58
|
-
# JIANGUOYUN (NUTSTORE) CHROME VERSION
|
59
|
-
# =============================================================
|
60
|
-
|
61
|
-
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
62
|
-
|
63
|
-
class JianguoyunDownloaderChrome:
|
64
|
-
""" Example:
|
65
|
-
>>> url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
|
66
|
-
>>> downloader = JianguoyunDownloaderChrome(url)
|
67
|
-
>>> downloader.run()
|
68
|
-
"""
|
69
|
-
def __init__(self, url, download_path="./downloads"):
|
70
|
-
self.url = url
|
71
|
-
self.download_path = os.path.abspath(download_path)
|
72
|
-
os.makedirs(self.download_path, exist_ok=True)
|
73
|
-
|
74
|
-
self.chrome_options = ChromeOptions()
|
75
|
-
prefs = {
|
76
|
-
"download.default_directory": self.download_path,
|
77
|
-
"download.prompt_for_download": False,
|
78
|
-
"download.directory_upgrade": True,
|
79
|
-
"safebrowsing.enabled": True,
|
80
|
-
"profile.default_content_setting_values.automatic_downloads": 1,
|
81
|
-
}
|
82
|
-
self.chrome_options.add_experimental_option("prefs", prefs)
|
83
|
-
self.chrome_options.add_argument("--disable-gpu")
|
84
|
-
self.chrome_options.add_argument("--no-sandbox")
|
85
|
-
self.chrome_options.add_argument("--disable-dev-shm-usage")
|
86
|
-
self.chrome_options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
|
87
|
-
# Uncomment for headless mode:
|
88
|
-
# self.chrome_options.add_argument("--headless")
|
89
|
-
|
90
|
-
self.driver = webdriver.Chrome(options=self.chrome_options)
|
91
|
-
|
92
|
-
def open_page(self):
|
93
|
-
print(f"🌐 Opening page: {self.url}")
|
94
|
-
self.driver.get(self.url)
|
95
|
-
print(f"✅ Page loaded: {self.driver.title}")
|
96
|
-
|
97
|
-
def click_download_button(self):
|
98
|
-
"""Find and click the 'Download' button (supports English and Chinese)."""
|
99
|
-
print("🔍 Searching for the download button...")
|
100
|
-
wait = WebDriverWait(self.driver, 30)
|
101
|
-
|
102
|
-
try:
|
103
|
-
# Match both English 'Download' (case-insensitive) and Chinese '下载'
|
104
|
-
xpath = (
|
105
|
-
"//span[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
|
106
|
-
" | //button[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
|
107
|
-
" | //a[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
|
108
|
-
" | //span[contains(text(),'下载')]"
|
109
|
-
" | //button[contains(text(),'下载')]"
|
110
|
-
" | //a[contains(text(),'下载')]"
|
111
|
-
)
|
112
|
-
|
113
|
-
button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
|
114
|
-
|
115
|
-
# Click using JavaScript to avoid overlay or interaction issues
|
116
|
-
self.driver.execute_script("arguments[0].click();", button)
|
117
|
-
print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")
|
118
|
-
|
119
|
-
# If the cloud service opens a new tab, switch to it
|
120
|
-
time.sleep(3)
|
121
|
-
if len(self.driver.window_handles) > 1:
|
122
|
-
self.driver.switch_to.window(self.driver.window_handles[-1])
|
123
|
-
print("📂 Switched to the new download tab.")
|
124
|
-
|
125
|
-
except Exception as e:
|
126
|
-
print("❌ Failed to find or click the download button:", e)
|
127
|
-
raise
|
128
|
-
|
129
|
-
|
130
|
-
def wait_for_downloads(self, timeout=3600):
|
131
|
-
print("⏳ Waiting for downloads to finish...")
|
132
|
-
start_time = time.time()
|
133
|
-
while time.time() - start_time < timeout:
|
134
|
-
downloading = [f for f in os.listdir(self.download_path) if f.endswith(".crdownload")]
|
135
|
-
if not downloading:
|
136
|
-
print("✅ Download completed!")
|
137
|
-
return
|
138
|
-
time.sleep(2)
|
139
|
-
print("⚠️ Timeout: download not completed within 1 hour")
|
140
|
-
|
141
|
-
def close(self):
|
142
|
-
self.driver.quit()
|
143
|
-
print("🚪 Browser closed.")
|
144
|
-
|
145
|
-
def run(self):
|
146
|
-
print('*' * 60)
|
147
|
-
try:
|
148
|
-
self.open_page()
|
149
|
-
self.click_download_button()
|
150
|
-
self.wait_for_downloads()
|
151
|
-
except Exception as e:
|
152
|
-
print("❌ Error:", e)
|
153
|
-
finally:
|
154
|
-
self.close()
|
155
|
-
print('*' * 60)
|
156
|
-
|
157
|
-
|
158
|
-
# =============================================================
|
159
|
-
# JIANGUOYUN (NUTSTORE) FIREFOX VERSION
|
160
|
-
# =============================================================
|
161
|
-
|
162
|
-
from selenium.webdriver.firefox.options import Options as FirefoxOptions
|
163
|
-
from selenium.webdriver.firefox.service import Service
|
164
|
-
|
165
|
-
class JianguoyunDownloaderFirefox:
    """Download a shared Jianguoyun (Nutstore) file using headless Firefox.

    Example:
    >>> url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
    >>> downloader = JianguoyunDownloaderFirefox(url)
    >>> downloader.run()
    """

    def __init__(self, url, download_path="./downloads",
                 geckodriver_path="/snap/bin/geckodriver"):
        """Create the download directory and start a headless Firefox driver.

        Args:
            url: Share-page URL to open.
            download_path: Directory where Firefox saves files; it is made
                absolute and created if missing.
            geckodriver_path: Path to the geckodriver executable. Defaults to
                the snap location that was previously hard-coded.
        """
        self.url = url
        self.download_path = os.path.abspath(download_path)
        os.makedirs(self.download_path, exist_ok=True)

        options = FirefoxOptions()
        options.add_argument("--headless")
        # folderList=2 makes Firefox use the custom directory in browser.download.dir.
        options.set_preference("browser.download.folderList", 2)
        options.set_preference("browser.download.manager.showWhenStarting", False)
        options.set_preference("browser.download.dir", self.download_path)
        # Save these MIME types straight to disk without a confirmation dialog.
        options.set_preference(
            "browser.helperApps.neverAsk.saveToDisk",
            "application/zip,application/octet-stream,application/x-zip-compressed,multipart/x-zip",
        )
        options.set_preference("pdfjs.disabled", True)

        service = Service(geckodriver_path)
        self.driver = webdriver.Firefox(service=service, options=options)

    def open_page(self):
        """Load ``self.url`` in the browser and print the resulting page title."""
        print(f"🌐 Opening page: {self.url}")
        self.driver.get(self.url)
        print(f"✅ Page loaded: {self.driver.title}")

    def click_download_button(self):
        """Find and click the 'Download' button (supports English and Chinese).

        Waits up to 30 seconds for a matching clickable element. If more than
        one window handle exists three seconds after the click, the driver is
        switched to the last one.

        Raises:
            Exception: Whatever the lookup or click raised, re-raised after
                being printed.
        """
        print("🔍 Searching for the download button...")
        wait = WebDriverWait(self.driver, 30)

        try:
            # Match both English 'Download' (case-insensitive) and Chinese '下载'
            xpath = (
                "//span[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
                " | //button[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
                " | //a[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
                " | //span[contains(text(),'下载')]"
                " | //button[contains(text(),'下载')]"
                " | //a[contains(text(),'下载')]"
            )

            button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))

            # Click using JavaScript to avoid overlay or interaction issues
            self.driver.execute_script("arguments[0].click();", button)
            print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")

            # If the cloud service opens a new tab, switch to it
            time.sleep(3)
            if len(self.driver.window_handles) > 1:
                self.driver.switch_to.window(self.driver.window_handles[-1])
                print("📂 Switched to the new download tab.")

        except Exception as e:
            print("❌ Failed to find or click the download button:", e)
            raise

    def wait_for_download(self, timeout=3600):
        """Wait until all downloads are finished (auto-detects browser type).

        The download directory is polled every two seconds for temporary
        files: ``.crdownload`` when the driver reports a Chrome browser,
        ``.part`` otherwise.

        Args:
            timeout: Maximum number of seconds to wait.

        Returns:
            True when no temporary download file remains, False if
            ``timeout`` seconds elapse first.
        """
        print("⏳ Waiting for downloads to finish...")

        # Determine the temporary file extension based on the browser type
        browser_name = self.driver.capabilities["browserName"].lower()
        temp_ext = ".crdownload" if "chrome" in browser_name else ".part"

        # A monotonic clock keeps the deadline valid even if the system clock is adjusted.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            downloading = [
                f for f in os.listdir(self.download_path) if f.endswith(temp_ext)
            ]
            if not downloading:
                print("✅ Download completed!")
                return True
            time.sleep(2)

        # Previously the loop just fell through and returned None without a message.
        print(f"⚠️ Timeout: download not completed within {timeout} seconds")
        return False

    def close(self):
        """Quit the WebDriver session."""
        print("🛑 Closing browser...")
        self.driver.quit()

    def run(self):
        """Run the full workflow: open the page, click download, wait, then close.

        Exceptions from any step are printed rather than propagated, and the
        browser is always closed afterwards.
        """
        print('*' * 60)
        try:
            self.open_page()
            self.click_download_button()
            self.wait_for_download(timeout=3600)
        except Exception as e:
            print("❌ Error:", e)
        finally:
            self.close()
        print('*' * 60)
|
256
|
-
|
junshan_kit/test.py
DELETED
@@ -1,9 +0,0 @@
|
|
1
|
-
junshan_kit/DataProcessor.py,sha256=rDL3NLD-WlT3x6x74XkB_542_sk3BrnIk5p4rYlVn5o,7212
|
2
|
-
junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
junshan_kit/datahub.py,sha256=_Q_3AlZ8vk1Ma6h9I44SxWBA8w9m1CQNvYztMcsxzUo,5377
|
4
|
-
junshan_kit/kit.py,sha256=h4Q_87hEJbXH4A9ryaGMu_nle5RlM8OR_PaW_hWCVBY,1040
|
5
|
-
junshan_kit/meta.py,sha256=SiY9P93aABrksNE6G3ft5gzcuP2cUgc4Vx6LH7ZFmzg,10113
|
6
|
-
junshan_kit/test.py,sha256=FgzG4oG7kkq6rWasxdBSY1qx_B0navRI5Ei-wJ1Dvo0,180
|
7
|
-
junshan_kit-2.2.3.dist-info/METADATA,sha256=h4_Z0LMIigJgrkt2hD5TcYJwOCkArMRySh-OopgZ9Xo,329
|
8
|
-
junshan_kit-2.2.3.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
|
9
|
-
junshan_kit-2.2.3.dist-info/RECORD,,
|
File without changes
|