junshan-kit 2.2.4__py2.py3-none-any.whl → 2.2.6__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,36 +1,44 @@
1
1
  """
2
2
  ----------------------------------------------------------------------
3
- >>> Author : Junshan Yin
3
+ >>> Author : Junshan Yin
4
4
  >>> Last Updated : 2025-10-12
5
5
  ----------------------------------------------------------------------
6
6
  """
7
+
7
8
  import pandas as pd
8
9
 
10
+
9
11
  class CSV_TO_Pandas:
10
12
  def __init__(self):
11
13
  pass
12
14
 
13
-
14
- def preprocess_dataset(self, csv_path, drop_cols: list, label_col: str, label_map: dict, print_info = False):
15
+ def preprocess_dataset(
16
+ self,
17
+ csv_path,
18
+ drop_cols: list,
19
+ label_col: str,
20
+ label_map: dict,
21
+ print_info=False,
22
+ ):
15
23
  """
16
24
  Preprocess a CSV dataset by performing data cleaning, label mapping, and feature encoding.
17
25
 
18
26
  This function loads a dataset from a CSV file, removes specified non-feature columns,
19
- drops rows with missing values, maps the target label to numerical values, and
20
- one-hot encodes categorical features. Optionally, it can print dataset statistics
27
+ drops rows with missing values, maps the target label to numerical values, and
28
+ one-hot encodes categorical features. Optionally, it can print dataset statistics
21
29
  before and after preprocessing.
22
30
 
23
31
  Args:
24
- csv_path (str):
32
+ csv_path (str):
25
33
  Path to the input CSV dataset.
26
- drop_cols (list):
34
+ drop_cols (list):
27
35
  List of column names to drop from the dataset.
28
- label_col (str):
36
+ label_col (str):
29
37
  Name of the target label column.
30
- label_map (dict):
38
+ label_map (dict):
31
39
  Mapping dictionary for label conversion (e.g., {"yes": 1, "no": -1}).
32
- print_info (bool, optional):
33
- Whether to print preprocessing information and dataset statistics.
40
+ print_info (bool, optional):
41
+ Whether to print preprocessing information and dataset statistics.
34
42
  Defaults to False.
35
43
 
36
44
  Returns:
@@ -65,15 +73,19 @@ class CSV_TO_Pandas:
65
73
  df = df.drop(columns=drop_cols)
66
74
 
67
75
  # Step 2: Remove rows with missing values
68
- df = df.dropna(axis=0, how='any')
76
+ df = df.dropna(axis=0, how="any")
69
77
  m_encoded, n_encoded = df.shape
70
-
71
- # Step 3: Map target label to -1 and +1
78
+
79
+ # Step 3: Map target label (to -1 and +1)
72
80
  df[label_col] = df[label_col].map(label_map)
73
81
 
74
82
  # Step 4: Encode categorical features (exclude label column)
75
- text_feature_cols = df.select_dtypes(include=['object', 'string', 'category']).columns
76
- text_feature_cols = [col for col in text_feature_cols if col != label_col] # ✅ exclude label
83
+ text_feature_cols = df.select_dtypes(
84
+ include=["object", "string", "category"]
85
+ ).columns
86
+ text_feature_cols = [
87
+ col for col in text_feature_cols if col != label_col
88
+ ] # ✅ exclude label
77
89
 
78
90
  df = pd.get_dummies(df, columns=text_feature_cols, dtype=int)
79
91
  m_cleaned, n_cleaned = df.shape
@@ -84,19 +96,27 @@ class CSV_TO_Pandas:
84
96
  neg_count = (df[label_col] == -1).sum()
85
97
 
86
98
  # Step 6: Print dataset information
87
- print('\n' + '='*80)
99
+ print("\n" + "=" * 80)
88
100
  print(f"{'Dataset Info':^70}")
89
- print('='*80)
101
+ print("=" * 80)
90
102
  print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
91
- print(f"{'Size after dropping NaN & non-feature cols:':<40} {m_cleaned} rows x {n_cleaned} cols")
103
+ print(
104
+ f"{'Size after dropping NaN & non-feature cols:':<40} {m_cleaned} rows x {n_cleaned} cols"
105
+ )
92
106
  print(f"{'Positive samples (+1):':<40} {pos_count}")
93
107
  print(f"{'Negative samples (-1):':<40} {neg_count}")
94
- print(f"{'Size after one-hot encoding:':<40} {m_encoded} rows x {n_encoded} cols")
95
- print('-'*80)
108
+ print(
109
+ f"{'Size after one-hot encoding:':<40} {m_encoded} rows x {n_encoded} cols"
110
+ )
111
+ print("-" * 80)
96
112
  print(f"Note:")
97
113
  print(f"{'Label column:':<40} {label_col}")
98
- print(f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}")
99
- print(f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}")
100
- print('='*80 + '\n')
101
-
102
- return df
114
+ print(
115
+ f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
116
+ )
117
+ print(
118
+ f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
119
+ )
120
+ print("=" * 80 + "\n")
121
+
122
+ return df
@@ -0,0 +1,77 @@
1
+ """
2
+ ----------------------------------------------------------------------
3
+ >>> Author : Junshan Yin
4
+ >>> Last Updated : 2025-xx-xx
5
+ ----------------------------------------------------------------------
6
+ """
7
+
8
+ import os, time
9
+ import pandas as pd
10
+ import junshan_kit.DataProcessor
11
+ import junshan_kit.kit
12
+ from sklearn.preprocessing import StandardScaler
13
+
14
+
15
+ def download_data():
16
+ from junshan_kit.kit import JianguoyunDownloaderFirefox, JianguoyunDownloaderChrome
17
+
18
+ # User selects download method
19
+ while True:
20
+ # User inputs download URL
21
+ url = input("Enter the Jianguoyun download URL: ").strip()
22
+
23
+ print("Select download method:")
24
+ print("1. Firefox")
25
+ print("2. Chrome")
26
+ choice = input("Enter the number of your choice (1 or 2): ").strip()
27
+
28
+ if choice == "1":
29
+ JianguoyunDownloaderFirefox(url=url).run()
30
+ print("✅ Download completed using Firefox")
31
+ break
32
+ elif choice == "2":
33
+ JianguoyunDownloaderChrome(url=url).run()
34
+ print("✅ Download completed using Chrome")
35
+ break
36
+ else:
37
+ print("❌ Invalid choice. Please enter 1 or 2.\n")
38
+
39
+
40
+ def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection"):
41
+
42
+ csv_path = f'./exp_data/{data_name}' + 'creditcard.csv'
43
+ drop_cols = []
44
+ label_col = 'Class'
45
+ label_map = {0: -1, 1: 1}
46
+
47
+ if not os.path.exists(csv_path):
48
+ print('\n' + '*'*60)
49
+ print(f"Please download the data.")
50
+ print(csv_path)
51
+ download_data()
52
+ junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+ cleaner = junshan_kit.DataProcessor.CSV_TO_Pandas()
62
+ cleaner.preprocess_dataset(csv_path, drop_cols, label_col, label_map)
63
+
64
+
65
+ assert False
66
+
67
+ dataloader = junshan_kit.DataProcessor.CSV_TO_Pandas()
68
+ dataloader.preprocess_dataset()
69
+ # Step 0: Load the dataset
70
+ csv_path = "creditcard.csv"
71
+ df = pd.read_csv(csv_path)
72
+
73
+
74
+
75
+ def wine_and_food_pairing_dataset():
76
+ pass
77
+
junshan_kit/kit.py ADDED
@@ -0,0 +1,242 @@
1
+ """
2
+ ----------------------------------------------------------------------
3
+ >>> Author : Junshan Yin
4
+ >>> Last Updated : 2025-10-13
5
+ ----------------------------------------------------------------------
6
+ """
7
+
8
+ import zipfile
9
+ import os, time
10
+
11
+ from selenium import webdriver
12
+ from selenium.webdriver.common.by import By
13
+ from selenium.webdriver.support.ui import WebDriverWait
14
+ from selenium.webdriver.support import expected_conditions as EC
15
+
16
+ def unzip_file(zip_path: str, unzip_folder: str):
17
+ """
18
+ Args:
19
+ zip_path (str): Path to the ZIP file to extract.
20
+ unzip_folder (str | None): Folder to extract files into.
21
+ If None, the function will create a folder with the same
22
+ name as the ZIP file (without extension).
23
+
24
+ Examples:
25
+ >>> zip_path = "./downloads/data.zip"
26
+ >>> unzip_folder = "./exp_data/data"
27
+ >>> unzip_file(zip_path, unzip_folder)
28
+ """
29
+
30
+ if unzip_folder is None:
31
+ unzip_folder = os.path.splitext(os.path.basename(zip_path))[0]
32
+
33
+ os.makedirs(unzip_folder, exist_ok=True)
34
+
35
+ with zipfile.ZipFile(zip_path, 'r') as zip_ref:
36
+ zip_ref.extractall(unzip_folder)
37
+
38
+ print(f"✅ Extracted '{zip_path}' to '{os.path.abspath(unzip_folder)}'")
39
+
40
+
41
+ # =============================================================
42
+ # JIANGUOYUN (NUTSTORE) CHROME VERSION
43
+ # =============================================================
44
+
45
+ from selenium.webdriver.chrome.options import Options as ChromeOptions
46
+ class JianguoyunDownloaderChrome:
47
+ """ Example:
48
+ >>> url = "https://www.jianguoyun.com/p/DSQqUq8QqdHDDRiy6I0GIAA"
49
+ >>> downloader = JianguoyunDownloaderChrome(url)
50
+ >>> downloader.run()
51
+ """
52
+ def __init__(self, url, download_path="./exp_data"):
53
+ self.url = url
54
+ self.download_path = os.path.abspath(download_path)
55
+ os.makedirs(self.download_path, exist_ok=True)
56
+
57
+ self.chrome_options = ChromeOptions()
58
+ prefs = {
59
+ "download.default_directory": self.download_path,
60
+ "download.prompt_for_download": False,
61
+ "download.directory_upgrade": True,
62
+ "safebrowsing.enabled": True,
63
+ "profile.default_content_setting_values.automatic_downloads": 1,
64
+ }
65
+ self.chrome_options.add_experimental_option("prefs", prefs)
66
+ self.chrome_options.add_argument("--disable-gpu")
67
+ self.chrome_options.add_argument("--no-sandbox")
68
+ self.chrome_options.add_argument("--disable-dev-shm-usage")
69
+ self.chrome_options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
70
+ # Uncomment for headless mode:
71
+ # self.chrome_options.add_argument("--headless")
72
+
73
+ self.driver = webdriver.Chrome(options=self.chrome_options)
74
+
75
+ def open_page(self):
76
+ print(f"🌐 Opening page: {self.url}")
77
+ self.driver.get(self.url)
78
+ print(f"✅ Page loaded: {self.driver.title}")
79
+
80
+ def click_download_button(self):
81
+ """Find and click the 'Download' button (supports English and Chinese)."""
82
+ print("🔍 Searching for the download button...")
83
+ wait = WebDriverWait(self.driver, 30)
84
+
85
+ try:
86
+ # Match both English 'Download' (case-insensitive) and Chinese '下载'
87
+ xpath = (
88
+ "//span[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
89
+ " | //button[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
90
+ " | //a[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
91
+ " | //span[contains(text(),'下载')]"
92
+ " | //button[contains(text(),'下载')]"
93
+ " | //a[contains(text(),'下载')]"
94
+ )
95
+
96
+ button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
97
+
98
+ # Click using JavaScript to avoid overlay or interaction issues
99
+ self.driver.execute_script("arguments[0].click();", button)
100
+ print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")
101
+
102
+ # If the cloud service opens a new tab, switch to it
103
+ time.sleep(3)
104
+ if len(self.driver.window_handles) > 1:
105
+ self.driver.switch_to.window(self.driver.window_handles[-1])
106
+ print("📂 Switched to the new download tab.")
107
+
108
+ except Exception as e:
109
+ print("❌ Failed to find or click the download button:", e)
110
+ raise
111
+
112
+
113
+ def wait_for_downloads(self, timeout=3600):
114
+ print("⏳ Waiting for downloads to finish...")
115
+ start_time = time.time()
116
+ while time.time() - start_time < timeout:
117
+ downloading = [f for f in os.listdir(self.download_path) if f.endswith(".crdownload")]
118
+ if not downloading:
119
+ print("✅ Download completed!")
120
+ return
121
+ time.sleep(2)
122
+ print("⚠️ Timeout: download not completed within 1 hour")
123
+
124
+ def close(self):
125
+ self.driver.quit()
126
+ print("🚪 Browser closed.")
127
+
128
+ def run(self):
129
+ print('*' * 60)
130
+ try:
131
+ self.open_page()
132
+ self.click_download_button()
133
+ self.wait_for_downloads()
134
+ except Exception as e:
135
+ print("❌ Error:", e)
136
+ finally:
137
+ self.close()
138
+ print('*' * 60)
139
+
140
+
141
+ # =============================================================
142
+ # JIANGUOYUN (NUTSTORE) FIREFOX VERSION
143
+ # =============================================================
144
+
145
+ from selenium.webdriver.firefox.options import Options as FirefoxOptions
146
+ from selenium.webdriver.firefox.service import Service
147
+
148
+ class JianguoyunDownloaderFirefox:
149
+ """ Example:
150
+ >>> url = "https://www.jianguoyun.com/p/DSQqUq8QqdHDDRiy6I0GIAA"
151
+ >>> downloader = JianguoyunDownloaderFirefox(url)
152
+ >>> downloader.run()
153
+ """
154
+ def __init__(self, url, download_path="./exp_data"):
155
+ self.url = url
156
+ self.download_path = os.path.abspath(download_path)
157
+ os.makedirs(self.download_path, exist_ok=True)
158
+
159
+ options = FirefoxOptions()
160
+ options.add_argument("--headless")
161
+ options.set_preference("browser.download.folderList", 2)
162
+ options.set_preference("browser.download.manager.showWhenStarting", False)
163
+ options.set_preference("browser.download.dir", self.download_path)
164
+ options.set_preference("browser.helperApps.neverAsk.saveToDisk",
165
+ "application/zip,application/octet-stream,application/x-zip-compressed,multipart/x-zip")
166
+ options.set_preference("pdfjs.disabled", True)
167
+
168
+ service = Service("/snap/bin/geckodriver")
169
+ self.driver = webdriver.Firefox(service=service, options=options)
170
+
171
+ def open_page(self):
172
+ print(f"🌐 Opening page: {self.url}")
173
+ self.driver.get(self.url)
174
+ print(f"✅ Page loaded: {self.driver.title}")
175
+
176
+ def click_download_button(self):
177
+ """Find and click the 'Download' button (supports English and Chinese)."""
178
+ print("🔍 Searching for the download button...")
179
+ wait = WebDriverWait(self.driver, 30)
180
+
181
+ try:
182
+ # Match both English 'Download' (case-insensitive) and Chinese '下载'
183
+ xpath = (
184
+ "//span[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
185
+ " | //button[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
186
+ " | //a[contains(translate(text(),'DOWNLOAD下载','download下载'),'download')]"
187
+ " | //span[contains(text(),'下载')]"
188
+ " | //button[contains(text(),'下载')]"
189
+ " | //a[contains(text(),'下载')]"
190
+ )
191
+
192
+ button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
193
+
194
+ # Click using JavaScript to avoid overlay or interaction issues
195
+ self.driver.execute_script("arguments[0].click();", button)
196
+ print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")
197
+
198
+ # If the cloud service opens a new tab, switch to it
199
+ time.sleep(3)
200
+ if len(self.driver.window_handles) > 1:
201
+ self.driver.switch_to.window(self.driver.window_handles[-1])
202
+ print("📂 Switched to the new download tab.")
203
+
204
+ except Exception as e:
205
+ print("❌ Failed to find or click the download button:", e)
206
+ raise
207
+
208
+ def wait_for_download(self, timeout=3600):
209
+ """Wait until all downloads are finished (auto-detects browser type)."""
210
+ print("⏳ Waiting for downloads to finish...")
211
+ start_time = time.time()
212
+
213
+ # Determine the temporary file extension based on the browser type
214
+ temp_ext = ".crdownload" if "chrome" in self.driver.capabilities["browserName"].lower() else ".part"
215
+
216
+ while time.time() - start_time < timeout:
217
+ downloading = [f for f in os.listdir(self.download_path) if f.endswith(temp_ext)]
218
+ if not downloading:
219
+ print("✅ Download completed!")
220
+ return True
221
+ time.sleep(2)
222
+
223
+
224
+ def close(self):
225
+ print("🛑 Closing browser...")
226
+ self.driver.quit()
227
+
228
+ def run(self):
229
+ print('*' * 60)
230
+ try:
231
+ self.open_page()
232
+ self.click_download_button()
233
+ self.wait_for_download(timeout=3600)
234
+ except Exception as e:
235
+ print("❌ Error:", e)
236
+ finally:
237
+ self.close()
238
+ print('*' * 60)
239
+
240
+
241
+
242
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: junshan_kit
3
- Version: 2.2.4
3
+ Version: 2.2.6
4
4
  Summary: This is an optimization tool.
5
5
  Author-email: Junshan Yin <junshanyin@163.com>
6
6
  Requires-Dist: kaggle==1.7.4.5
@@ -0,0 +1,7 @@
1
+ junshan_kit/DataProcessor.py,sha256=eryVmS5BFZj8wjDN2QWVHqkbFgFuWU0HXV9s6TGf9QM,4442
2
+ junshan_kit/DataSets.py,sha256=jaKB5kh1pOR-o97hab2G2r_YfH69Bs4zR2CkiOfpyss,2085
3
+ junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ junshan_kit/kit.py,sha256=tB1TpW9hW1EweK1RQwHOdUo7uG1QU4vSeyR0fdaSydo,9569
5
+ junshan_kit-2.2.6.dist-info/METADATA,sha256=CqdL1Yui6UbnUt5teWg0EpYP8ofCdviwWK8EkM_MuKw,329
6
+ junshan_kit-2.2.6.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
7
+ junshan_kit-2.2.6.dist-info/RECORD,,
@@ -1,5 +0,0 @@
1
- junshan_kit/DataProcessor.py,sha256=ZysCzYyMxi6uuGb6LyJmyl_QnmqrADHNGcxOdC7_COQ,4232
2
- junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- junshan_kit-2.2.4.dist-info/METADATA,sha256=MTVQXdnHWcZYhb4zAbx5rdwniT21Wy6BhO031XfOjMk,329
4
- junshan_kit-2.2.4.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
5
- junshan_kit-2.2.4.dist-info/RECORD,,