junshan-kit 2.2.4__py2.py3-none-any.whl → 2.2.6__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- junshan_kit/DataProcessor.py +46 -26
- junshan_kit/DataSets.py +77 -0
- junshan_kit/kit.py +242 -0
- {junshan_kit-2.2.4.dist-info → junshan_kit-2.2.6.dist-info}/METADATA +1 -1
- junshan_kit-2.2.6.dist-info/RECORD +7 -0
- junshan_kit-2.2.4.dist-info/RECORD +0 -5
- {junshan_kit-2.2.4.dist-info → junshan_kit-2.2.6.dist-info}/WHEEL +0 -0
junshan_kit/DataProcessor.py
CHANGED
@@ -1,36 +1,44 @@
|
|
1
1
|
"""
|
2
2
|
----------------------------------------------------------------------
|
3
|
-
>>> Author : Junshan Yin
|
3
|
+
>>> Author : Junshan Yin
|
4
4
|
>>> Last Updated : 2025-10-12
|
5
5
|
----------------------------------------------------------------------
|
6
6
|
"""
|
7
|
+
|
7
8
|
import pandas as pd
|
8
9
|
|
10
|
+
|
9
11
|
class CSV_TO_Pandas:
|
10
12
|
def __init__(self):
|
11
13
|
pass
|
12
14
|
|
13
|
-
|
14
|
-
|
15
|
+
def preprocess_dataset(
|
16
|
+
self,
|
17
|
+
csv_path,
|
18
|
+
drop_cols: list,
|
19
|
+
label_col: str,
|
20
|
+
label_map: dict,
|
21
|
+
print_info=False,
|
22
|
+
):
|
15
23
|
"""
|
16
24
|
Preprocess a CSV dataset by performing data cleaning, label mapping, and feature encoding.
|
17
25
|
|
18
26
|
This function loads a dataset from a CSV file, removes specified non-feature columns,
|
19
|
-
drops rows with missing values, maps the target label to numerical values, and
|
20
|
-
one-hot encodes categorical features. Optionally, it can print dataset statistics
|
27
|
+
drops rows with missing values, maps the target label to numerical values, and
|
28
|
+
one-hot encodes categorical features. Optionally, it can print dataset statistics
|
21
29
|
before and after preprocessing.
|
22
30
|
|
23
31
|
Args:
|
24
|
-
csv_path (str):
|
32
|
+
csv_path (str):
|
25
33
|
Path to the input CSV dataset.
|
26
|
-
drop_cols (list):
|
34
|
+
drop_cols (list):
|
27
35
|
List of column names to drop from the dataset.
|
28
|
-
label_col (str):
|
36
|
+
label_col (str):
|
29
37
|
Name of the target label column.
|
30
|
-
label_map (dict):
|
38
|
+
label_map (dict):
|
31
39
|
Mapping dictionary for label conversion (e.g., {"yes": 1, "no": -1}).
|
32
|
-
print_info (bool, optional):
|
33
|
-
Whether to print preprocessing information and dataset statistics.
|
40
|
+
print_info (bool, optional):
|
41
|
+
Whether to print preprocessing information and dataset statistics.
|
34
42
|
Defaults to False.
|
35
43
|
|
36
44
|
Returns:
|
@@ -65,15 +73,19 @@ class CSV_TO_Pandas:
|
|
65
73
|
df = df.drop(columns=drop_cols)
|
66
74
|
|
67
75
|
# Step 2: Remove rows with missing values
|
68
|
-
df = df.dropna(axis=0, how=
|
76
|
+
df = df.dropna(axis=0, how="any")
|
69
77
|
m_encoded, n_encoded = df.shape
|
70
|
-
|
71
|
-
# Step 3: Map target label to -1 and +1
|
78
|
+
|
79
|
+
# Step 3: Map target label (to -1 and +1)
|
72
80
|
df[label_col] = df[label_col].map(label_map)
|
73
81
|
|
74
82
|
# Step 4: Encode categorical features (exclude label column)
|
75
|
-
text_feature_cols = df.select_dtypes(
|
76
|
-
|
83
|
+
text_feature_cols = df.select_dtypes(
|
84
|
+
include=["object", "string", "category"]
|
85
|
+
).columns
|
86
|
+
text_feature_cols = [
|
87
|
+
col for col in text_feature_cols if col != label_col
|
88
|
+
] # ✅ exclude label
|
77
89
|
|
78
90
|
df = pd.get_dummies(df, columns=text_feature_cols, dtype=int)
|
79
91
|
m_cleaned, n_cleaned = df.shape
|
@@ -84,19 +96,27 @@ class CSV_TO_Pandas:
|
|
84
96
|
neg_count = (df[label_col] == -1).sum()
|
85
97
|
|
86
98
|
# Step 6: Print dataset information
|
87
|
-
print(
|
99
|
+
print("\n" + "=" * 80)
|
88
100
|
print(f"{'Dataset Info':^70}")
|
89
|
-
print(
|
101
|
+
print("=" * 80)
|
90
102
|
print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
|
91
|
-
print(
|
103
|
+
print(
|
104
|
+
f"{'Size after dropping NaN & non-feature cols:':<40} {m_cleaned} rows x {n_cleaned} cols"
|
105
|
+
)
|
92
106
|
print(f"{'Positive samples (+1):':<40} {pos_count}")
|
93
107
|
print(f"{'Negative samples (-1):':<40} {neg_count}")
|
94
|
-
print(
|
95
|
-
|
108
|
+
print(
|
109
|
+
f"{'Size after one-hot encoding:':<40} {m_encoded} rows x {n_encoded} cols"
|
110
|
+
)
|
111
|
+
print("-" * 80)
|
96
112
|
print(f"Note:")
|
97
113
|
print(f"{'Label column:':<40} {label_col}")
|
98
|
-
print(
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
114
|
+
print(
|
115
|
+
f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
|
116
|
+
)
|
117
|
+
print(
|
118
|
+
f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
|
119
|
+
)
|
120
|
+
print("=" * 80 + "\n")
|
121
|
+
|
122
|
+
return df
|
junshan_kit/DataSets.py
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
"""
|
2
|
+
----------------------------------------------------------------------
|
3
|
+
>>> Author : Junshan Yin
|
4
|
+
>>> Last Updated : 2025-xx-xx
|
5
|
+
----------------------------------------------------------------------
|
6
|
+
"""
|
7
|
+
|
8
|
+
import os, time
|
9
|
+
import pandas as pd
|
10
|
+
import junshan_kit.DataProcessor
|
11
|
+
import junshan_kit.kit
|
12
|
+
from sklearn.preprocessing import StandardScaler
|
13
|
+
|
14
|
+
|
15
|
+
def download_data():
    """Interactively download a Jianguoyun share link with Firefox or Chrome.

    Prompts on stdin for the share URL once, then asks which browser to use
    and keeps asking until the answer is ``1`` or ``2``. The selected
    downloader is constructed with the URL and its ``run()`` method is called.

    Returns:
        None.
    """
    from junshan_kit.kit import JianguoyunDownloaderFirefox, JianguoyunDownloaderChrome

    # Menu choice -> (display name, downloader class).
    downloaders = {
        "1": ("Firefox", JianguoyunDownloaderFirefox),
        "2": ("Chrome", JianguoyunDownloaderChrome),
    }

    # Ask for the URL once: a mistyped browser choice should not force the
    # user to enter the URL again.
    url = input("Enter the Jianguoyun download URL: ").strip()

    while True:
        print("Select download method:")
        print("1. Firefox")
        print("2. Chrome")
        choice = input("Enter the number of your choice (1 or 2): ").strip()

        if choice not in downloaders:
            print("❌ Invalid choice. Please enter 1 or 2.\n")
            continue

        browser_name, downloader_cls = downloaders[choice]
        # NOTE(review): run() appears to catch and print its own errors, so the
        # success message below may be printed after a failed download - confirm.
        downloader_cls(url=url).run()
        print(f"✅ Download completed using {browser_name}")
        return
|
38
|
+
|
39
|
+
|
40
|
+
def credit_card_fraud_detection(data_name="Credit Card Fraud Detection"):
    """Load and preprocess the credit-card fraud dataset, downloading it if absent.

    The CSV is expected at ``./exp_data/<data_name>/creditcard.csv``. If it is
    missing, the user is prompted to download the archive, which is then
    extracted into ``./exp_data/<data_name>``.

    Args:
        data_name (str): Dataset folder name under ``./exp_data``; also used as
            the base name of the ZIP archive to extract.

    Returns:
        The value returned by ``CSV_TO_Pandas.preprocess_dataset`` for the CSV
        (the preprocessed DataFrame), with label ``Class`` mapped 0 -> -1 and
        1 -> +1.
    """
    data_dir = f'./exp_data/{data_name}'
    # Join with a separator; plain concatenation produced
    # "<data_name>creditcard.csv", a path that can never exist.
    csv_path = os.path.join(data_dir, 'creditcard.csv')
    drop_cols = []
    label_col = 'Class'
    label_map = {0: -1, 1: 1}

    if not os.path.exists(csv_path):
        print('\n' + '*'*60)
        print(f"Please download the data.")
        print(csv_path)
        download_data()
        # NOTE(review): download_data() builds the downloader with its default
        # download directory; verify the archive really lands at this path.
        junshan_kit.kit.unzip_file(f'{data_dir}/{data_name}.zip', data_dir)

    cleaner = junshan_kit.DataProcessor.CSV_TO_Pandas()
    # Return the processed frame instead of discarding it (the previous
    # version stopped on a leftover debugging `assert False`).
    return cleaner.preprocess_dataset(csv_path, drop_cols, label_col, label_map)
|
72
|
+
|
73
|
+
|
74
|
+
|
75
|
+
def wine_and_food_pairing_dataset():
    """Placeholder for the wine-and-food pairing dataset loader (not implemented yet)."""
    return None
|
77
|
+
|
junshan_kit/kit.py
ADDED
@@ -0,0 +1,242 @@
|
|
1
|
+
"""
|
2
|
+
----------------------------------------------------------------------
|
3
|
+
>>> Author : Junshan Yin
|
4
|
+
>>> Last Updated : 2025-10-13
|
5
|
+
----------------------------------------------------------------------
|
6
|
+
"""
|
7
|
+
|
8
|
+
import zipfile
|
9
|
+
import os, time
|
10
|
+
|
11
|
+
from selenium import webdriver
|
12
|
+
from selenium.webdriver.common.by import By
|
13
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
14
|
+
from selenium.webdriver.support import expected_conditions as EC
|
15
|
+
|
16
|
+
def unzip_file(zip_path: str, unzip_folder: "str | None" = None):
    """Extract every member of a ZIP archive into a folder.

    Args:
        zip_path (str): Path to the ZIP file to extract.
        unzip_folder (str, optional): Folder to extract files into. It is
            created if it does not exist. If None, a folder named after the
            ZIP file (without extension) is used, relative to the current
            working directory.

    Returns:
        None. A confirmation line is printed after extraction.

    Examples:
        >>> zip_path = "./downloads/data.zip"
        >>> unzip_folder = "./exp_data/data"
        >>> unzip_file(zip_path, unzip_folder)
    """
    if unzip_folder is None:
        # "dir/data.zip" -> "data"
        unzip_folder = os.path.splitext(os.path.basename(zip_path))[0]

    os.makedirs(unzip_folder, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(unzip_folder)

    print(f"✅ Extracted '{zip_path}' to '{os.path.abspath(unzip_folder)}'")
|
39
|
+
|
40
|
+
|
41
|
+
# =============================================================
|
42
|
+
# JIANGUOYUN (NUTSTORE) CHROME VERSION
|
43
|
+
# =============================================================
|
44
|
+
|
45
|
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
46
|
+
class JianguoyunDownloaderChrome:
    """Download a file from a Jianguoyun (Nutstore) share page using Chrome.

    Example:
        >>> url = "https://www.jianguoyun.com/p/DSQqUq8QqdHDDRiy6I0GIAA"
        >>> downloader = JianguoyunDownloaderChrome(url)
        >>> downloader.run()
    """

    def __init__(self, url, download_path="./exp_data"):
        """Create the download folder and start a Chrome WebDriver session.

        Args:
            url: Share-page URL to open.
            download_path: Folder downloads are saved to; stored as an
                absolute path and created if missing.
        """
        self.url = url
        self.download_path = os.path.abspath(download_path)
        os.makedirs(self.download_path, exist_ok=True)

        self.chrome_options = ChromeOptions()
        prefs = {
            "download.default_directory": self.download_path,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True,
            "profile.default_content_setting_values.automatic_downloads": 1,
        }
        self.chrome_options.add_experimental_option("prefs", prefs)
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.chrome_options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
        # Uncomment for headless mode:
        # self.chrome_options.add_argument("--headless")

        self.driver = webdriver.Chrome(options=self.chrome_options)

    def open_page(self):
        """Navigate the browser to ``self.url`` and print the page title."""
        print(f"🌐 Opening page: {self.url}")
        self.driver.get(self.url)
        print(f"✅ Page loaded: {self.driver.title}")

    def click_download_button(self):
        """Find and click the 'Download' button (supports English and Chinese).

        Raises:
            Exception: Re-raises whatever Selenium raised if the button cannot
                be found or clicked within 30 seconds.
        """
        print("🔍 Searching for the download button...")
        wait = WebDriverWait(self.driver, 30)

        try:
            # Match both English 'Download' (case-insensitive) and Chinese '下载'
            xpath = (
                "//span[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
                " | //button[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
                " | //a[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
                " | //span[contains(text(),'下载')]"
                " | //button[contains(text(),'下载')]"
                " | //a[contains(text(),'下载')]"
            )

            button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))

            # Click using JavaScript to avoid overlay or interaction issues
            self.driver.execute_script("arguments[0].click();", button)
            print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")

            # If the cloud service opens a new tab, switch to it
            time.sleep(3)
            if len(self.driver.window_handles) > 1:
                self.driver.switch_to.window(self.driver.window_handles[-1])
                print("📂 Switched to the new download tab.")

        except Exception as e:
            print("❌ Failed to find or click the download button:", e)
            raise

    def wait_for_downloads(self, timeout=3600):
        """Poll the download folder until no ``.crdownload`` files remain.

        Args:
            timeout: Maximum number of seconds to wait.

        Returns:
            bool: True once no partial-download file is present, False if the
            timeout elapsed first.
        """
        print("⏳ Waiting for downloads to finish...")
        start_time = time.time()
        while time.time() - start_time < timeout:
            # Chrome keeps an in-progress download as "<name>.crdownload".
            # NOTE(review): an empty listing also looks "finished" if the
            # download has not started writing yet - confirm this is acceptable.
            downloading = [f for f in os.listdir(self.download_path) if f.endswith(".crdownload")]
            if not downloading:
                print("✅ Download completed!")
                return True
            time.sleep(2)
        # Report the actual timeout instead of a hard-coded "1 hour".
        print(f"⚠️ Timeout: download not completed within {timeout} seconds")
        return False

    def close(self):
        """Quit the WebDriver session."""
        self.driver.quit()
        print("🚪 Browser closed.")

    def run(self):
        """Open the page, click download, wait for completion, then close.

        Errors from any step are printed rather than propagated; the browser
        is always closed.
        """
        print('*' * 60)
        try:
            self.open_page()
            self.click_download_button()
            self.wait_for_downloads()
        except Exception as e:
            print("❌ Error:", e)
        finally:
            self.close()
            print('*' * 60)
|
139
|
+
|
140
|
+
|
141
|
+
# =============================================================
|
142
|
+
# JIANGUOYUN (NUTSTORE) FIREFOX VERSION
|
143
|
+
# =============================================================
|
144
|
+
|
145
|
+
from selenium.webdriver.firefox.options import Options as FirefoxOptions
|
146
|
+
from selenium.webdriver.firefox.service import Service
|
147
|
+
|
148
|
+
class JianguoyunDownloaderFirefox:
    """Download a file from a Jianguoyun (Nutstore) share page using headless Firefox.

    Example:
        >>> url = "https://www.jianguoyun.com/p/DSQqUq8QqdHDDRiy6I0GIAA"
        >>> downloader = JianguoyunDownloaderFirefox(url)
        >>> downloader.run()
    """

    # Location used when no explicit geckodriver path is given and this file exists.
    DEFAULT_GECKODRIVER_PATH = "/snap/bin/geckodriver"

    def __init__(self, url, download_path="./exp_data", geckodriver_path=None):
        """Create the download folder and start a headless Firefox session.

        Args:
            url: Share-page URL to open.
            download_path: Folder downloads are saved to; stored as an
                absolute path and created if missing.
            geckodriver_path: Optional path to the geckodriver executable.
                When omitted, ``DEFAULT_GECKODRIVER_PATH`` is used if it
                exists; otherwise Selenium's own driver lookup is used.
        """
        self.url = url
        self.download_path = os.path.abspath(download_path)
        os.makedirs(self.download_path, exist_ok=True)

        options = FirefoxOptions()
        options.add_argument("--headless")
        options.set_preference("browser.download.folderList", 2)
        options.set_preference("browser.download.manager.showWhenStarting", False)
        options.set_preference("browser.download.dir", self.download_path)
        options.set_preference("browser.helperApps.neverAsk.saveToDisk",
                               "application/zip,application/octet-stream,application/x-zip-compressed,multipart/x-zip")
        options.set_preference("pdfjs.disabled", True)

        # Keep the previous snap location as the default, but do not fail on
        # systems where geckodriver lives elsewhere.
        if geckodriver_path is None and os.path.exists(self.DEFAULT_GECKODRIVER_PATH):
            geckodriver_path = self.DEFAULT_GECKODRIVER_PATH
        service = Service(geckodriver_path) if geckodriver_path else Service()
        self.driver = webdriver.Firefox(service=service, options=options)

    def open_page(self):
        """Navigate the browser to ``self.url`` and print the page title."""
        print(f"🌐 Opening page: {self.url}")
        self.driver.get(self.url)
        print(f"✅ Page loaded: {self.driver.title}")

    def click_download_button(self):
        """Find and click the 'Download' button (supports English and Chinese).

        Raises:
            Exception: Re-raises whatever Selenium raised if the button cannot
                be found or clicked within 30 seconds.
        """
        print("🔍 Searching for the download button...")
        wait = WebDriverWait(self.driver, 30)

        try:
            # Match both English 'Download' (case-insensitive) and Chinese '下载'
            xpath = (
                "//span[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
                " | //button[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
                " | //a[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
                " | //span[contains(text(),'下载')]"
                " | //button[contains(text(),'下载')]"
                " | //a[contains(text(),'下载')]"
            )

            button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))

            # Click using JavaScript to avoid overlay or interaction issues
            self.driver.execute_script("arguments[0].click();", button)
            print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")

            # If the cloud service opens a new tab, switch to it
            time.sleep(3)
            if len(self.driver.window_handles) > 1:
                self.driver.switch_to.window(self.driver.window_handles[-1])
                print("📂 Switched to the new download tab.")

        except Exception as e:
            print("❌ Failed to find or click the download button:", e)
            raise

    def wait_for_download(self, timeout=3600):
        """Poll the download folder until no partial-download files remain.

        The partial-file suffix is chosen from the driver's reported browser
        name: ``.crdownload`` for Chrome, ``.part`` otherwise.

        Args:
            timeout: Maximum number of seconds to wait.

        Returns:
            bool: True once no partial-download file is present, False if the
            timeout elapsed first.
        """
        print("⏳ Waiting for downloads to finish...")
        start_time = time.time()

        # Determine the temporary file extension based on the browser type
        temp_ext = ".crdownload" if "chrome" in self.driver.capabilities["browserName"].lower() else ".part"

        while time.time() - start_time < timeout:
            downloading = [f for f in os.listdir(self.download_path) if f.endswith(temp_ext)]
            if not downloading:
                print("✅ Download completed!")
                return True
            time.sleep(2)

        # Previously the timeout path fell through silently and returned None.
        print(f"⚠️ Timeout: download not completed within {timeout} seconds")
        return False

    def close(self):
        """Quit the WebDriver session."""
        print("🛑 Closing browser...")
        self.driver.quit()

    def run(self):
        """Open the page, click download, wait for completion, then close.

        Errors from any step are printed rather than propagated; the browser
        is always closed.
        """
        print('*' * 60)
        try:
            self.open_page()
            self.click_download_button()
            self.wait_for_download(timeout=3600)
        except Exception as e:
            print("❌ Error:", e)
        finally:
            self.close()
            print('*' * 60)
|
239
|
+
|
240
|
+
|
241
|
+
|
242
|
+
|
@@ -0,0 +1,7 @@
|
|
1
|
+
junshan_kit/DataProcessor.py,sha256=eryVmS5BFZj8wjDN2QWVHqkbFgFuWU0HXV9s6TGf9QM,4442
|
2
|
+
junshan_kit/DataSets.py,sha256=jaKB5kh1pOR-o97hab2G2r_YfH69Bs4zR2CkiOfpyss,2085
|
3
|
+
junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
+
junshan_kit/kit.py,sha256=tB1TpW9hW1EweK1RQwHOdUo7uG1QU4vSeyR0fdaSydo,9569
|
5
|
+
junshan_kit-2.2.6.dist-info/METADATA,sha256=CqdL1Yui6UbnUt5teWg0EpYP8ofCdviwWK8EkM_MuKw,329
|
6
|
+
junshan_kit-2.2.6.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
|
7
|
+
junshan_kit-2.2.6.dist-info/RECORD,,
|
@@ -1,5 +0,0 @@
|
|
1
|
-
junshan_kit/DataProcessor.py,sha256=ZysCzYyMxi6uuGb6LyJmyl_QnmqrADHNGcxOdC7_COQ,4232
|
2
|
-
junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
junshan_kit-2.2.4.dist-info/METADATA,sha256=MTVQXdnHWcZYhb4zAbx5rdwniT21Wy6BhO031XfOjMk,329
|
4
|
-
junshan_kit-2.2.4.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
|
5
|
-
junshan_kit-2.2.4.dist-info/RECORD,,
|
File without changes
|