junshan-kit 2.2.1__py2.py3-none-any.whl → 2.2.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
junshan_kit/DataProcessor.py CHANGED
@@ -1,10 +1,17 @@
+ """
+ ----------------------------------------------------------------------
+ >>> Author : Junshan Yin
+ >>> Last Updated : 2025-10-12
+ ----------------------------------------------------------------------
+ """
 
  import pandas as pd
  import os
  from sklearn.preprocessing import StandardScaler
  import junshan_kit.datahub
+ import kit
 
- class CSVToPandas:
+ class CSVToPandasMeta:
      def __init__(self):
          self.data_downloader = junshan_kit.datahub.kaggle_data()
 
@@ -79,7 +86,7 @@ class CSVToPandas:
          scaler = StandardScaler()
          df[num_cols] = scaler.fit_transform(df[num_cols])
 
-         # 导出后的大小
+         # The size after export
          m_export, n_export = df.shape
 
          if show_info:
@@ -103,3 +110,57 @@ class CSVToPandas:
103
110
 
104
111
 
105
112
 
113
+ class CSV_TO_Pandas:
114
+ def __init__(self):
115
+ pass
116
+
117
+ def clean_data(self, csv_path, drop_cols: list, label_col: str, label_map: dict, print_info = False):
118
+ # Step 0: Load the dataset
119
+ df = pd.read_csv(csv_path)
120
+
121
+ # Save original size
122
+ m_original, n_original = df.shape
123
+
124
+ # Step 1: Drop non-informative columns
125
+ df = df.drop(columns=drop_cols)
126
+
127
+ # Step 2: Remove rows with missing values
128
+ df = df.dropna(axis=0, how='any')
129
+ m_encoded, n_encoded = df.shape
130
+
131
+ # Step 3: Map target label to -1 and +1
132
+ df[label_col] = df[label_col].map(label_map)
133
+
134
+ # Step 4: Encode categorical features (exclude label column)
135
+ text_feature_cols = df.select_dtypes(include=['object', 'string', 'category']).columns
136
+ text_feature_cols = [col for col in text_feature_cols if col != label_col] # ✅ exclude label
137
+
138
+ df = pd.get_dummies(df, columns=text_feature_cols, dtype=int)
139
+ m_cleaned, n_cleaned = df.shape
140
+
141
+ # print info
142
+ if print_info:
143
+ pos_count = (df[label_col] == 1).sum()
144
+ neg_count = (df[label_col] == -1).sum()
145
+
146
+ # Step 6: Print dataset information
147
+ print('\n' + '='*80)
148
+ print(f"{'Dataset Info':^70}")
149
+ print('='*80)
150
+ print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
151
+ print(f"{'Size after dropping NaN & non-feature cols:':<40} {m_cleaned} rows x {n_cleaned} cols")
152
+ print(f"{'Positive samples (+1):':<40} {pos_count}")
153
+ print(f"{'Negative samples (-1):':<40} {neg_count}")
154
+ print(f"{'Size after one-hot encoding:':<40} {m_encoded} rows x {n_encoded} cols")
155
+ print('-'*80)
156
+ print(f"Note:")
157
+ print(f"{'Label column:':<40} {label_col}")
158
+ print(f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}")
159
+ print(f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}")
160
+ print('='*80 + '\n')
161
+
162
+ return df
163
+
164
+
165
+
166
+
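The new CSV_TO_Pandas.clean_data method added above loads a CSV, drops the listed non-feature columns, removes rows with missing values, maps the label column onto -1/+1 via label_map, and one-hot encodes the remaining text columns. A minimal usage sketch, assuming the class is imported from junshan_kit.DataProcessor (as the RECORD suggests) and using a hypothetical churn.csv with a 'Churn' label and a 'customer_id' ID column; the file and column names are illustrative, not from the package:

    from junshan_kit.DataProcessor import CSV_TO_Pandas

    processor = CSV_TO_Pandas()

    # Hypothetical dataset: 'churn.csv' with a 'Churn' column holding 'Yes'/'No'
    # and an ID column that carries no predictive signal.
    df = processor.clean_data(
        csv_path='churn.csv',
        drop_cols=['customer_id'],
        label_col='Churn',
        label_map={'Yes': 1, 'No': -1},
        print_info=True,
    )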
junshan_kit/datahub.py CHANGED
@@ -1,3 +1,10 @@
+ """
+ ----------------------------------------------------------------------
+ >>> Author : Junshan Yin
+ >>> Last Updated : 2025-10-12
+ ----------------------------------------------------------------------
+ """
+
  import kagglehub
  import os, time
  import warnings
junshan_kit/kit.py CHANGED
@@ -1,22 +1,35 @@
+ """
+ ----------------------------------------------------------------------
+ >>> Author : Junshan Yin
+ >>> Last Updated : 2025-10-13
+ ----------------------------------------------------------------------
+ """
+
  import zipfile
  import os
 
- def unzip_file(zip_path, dest_folder=None):
+ def unzip_file(zip_path: str, unzip_folder: str):
      """
-     Extract a ZIP file to a folder.
-
-     Parameters:
+     Args:
          zip_path (str): Path to the ZIP file to extract.
-         dest_folder (str, optional): Folder to extract files into.
-             If None, the function uses a folder
-             with the same name as the ZIP file.
+         dest_folder (str, optional): Folder to extract files into.
+             If None, the function will create a folder with the same
+             name as the ZIP file (without extension).
+
+     Examples:
+         >>> zip_path = "./downloads/data.zip"
+         >>> unzip_folder = "./exp_data/data"
+         >>> unzip_file(zip_path, unzip_folder)
      """
-     if dest_folder is None:
-         dest_folder = os.path.splitext(os.path.basename(zip_path))[0]
 
-     os.makedirs(dest_folder, exist_ok=True)
+     if unzip_folder is None:
+         unzip_folder = os.path.splitext(os.path.basename(zip_path))[0]
+
+     os.makedirs(unzip_folder, exist_ok=True)
 
      with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-         zip_ref.extractall(dest_folder)
+         zip_ref.extractall(unzip_folder)
+
+     print(f"✅ Extracted '{zip_path}' to '{os.path.abspath(unzip_folder)}'")
+
 
-     print(f"✅ Extracted '{zip_path}' to '{os.path.abspath(dest_folder)}'")
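The kit.py hunk renames dest_folder to unzip_folder and makes it a required positional parameter, while keeping the fallback branch that derives a folder name from the ZIP file when None is passed. A minimal sketch of calls against the new signature, assuming the module is imported as junshan_kit.kit (per the RECORD); the paths are illustrative:

    from junshan_kit import kit

    # Extract into an explicit folder (created if it does not exist).
    kit.unzip_file("./downloads/data.zip", "./exp_data/data")

    # Passing None still triggers the retained fallback: the archive is
    # extracted into a folder named after the ZIP file (without extension).
    kit.unzip_file("./downloads/data.zip", None)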
junshan_kit/meta.py CHANGED
@@ -1,3 +1,10 @@
+ """
+ ----------------------------------------------------------------------
+ >>> Author : Junshan Yin
+ >>> Last Updated : 2025-10-12
+ ----------------------------------------------------------------------
+ """
+
  import os
  import time
  import shutil
@@ -10,13 +17,16 @@ from selenium.webdriver.common.by import By
  from selenium.webdriver.support.ui import WebDriverWait
  from selenium.webdriver.support import expected_conditions as EC
 
+
  # =============================================================
  # KAGGLE DATA MANAGEMENT
  # =============================================================
 
  class KaggleData:
      def list_datasets(self):
-         """List available datasets from a specific user."""
+         """
+         List available datasets from a specific user.
+         """
          api = KaggleApi()
          api.authenticate()
          datasets = api.dataset_list(user='junshan888')
@@ -51,6 +61,11 @@ class KaggleData:
  from selenium.webdriver.chrome.options import Options as ChromeOptions
 
  class JianguoyunDownloaderChrome:
+     """ Example:
+     >>> url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
+     >>> downloader = JianguoyunDownloaderChrome(url)
+     >>> downloader.run()
+     """
      def __init__(self, url, download_path="./downloads"):
          self.url = url
          self.download_path = os.path.abspath(download_path)
@@ -148,6 +163,11 @@ from selenium.webdriver.firefox.options import Options as FirefoxOptions
  from selenium.webdriver.firefox.service import Service
 
  class JianguoyunDownloaderFirefox:
+     """ Example:
+     >>> url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
+     >>> downloader = JianguoyunDownloaderFirefox(url)
+     >>> downloader.run()
+     """
      def __init__(self, url, download_path="./downloads"):
          self.url = url
          self.download_path = os.path.abspath(download_path)
@@ -234,19 +254,3 @@ class JianguoyunDownloaderFirefox:
          self.close()
          print('*' * 60)
 
-
- # =============================================================
- # MAIN
- # =============================================================
-
- if __name__ == "__main__":
-     url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
-
-     use_chrome = True  # Set True to use Chrome, False for Firefox
-
-     if use_chrome:
-         downloader = JianguoyunDownloaderChrome(url)
-     else:
-         downloader = JianguoyunDownloaderFirefox(url)
-
-     downloader.run()
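The meta.py hunk removes the __main__ demo, so browser selection now happens in caller code, with usage documented in the new class docstrings. A minimal caller-side sketch reconstructed from the removed block, assuming the classes are imported from junshan_kit.meta (per the RECORD); the share URL is the one from the original demo:

    from junshan_kit import meta

    url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"

    use_chrome = True  # Set True to use Chrome, False for Firefox

    if use_chrome:
        downloader = meta.JianguoyunDownloaderChrome(url)
    else:
        downloader = meta.JianguoyunDownloaderFirefox(url)

    downloader.run()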
junshan_kit/test.py CHANGED
@@ -1,3 +1,8 @@
- from datahub import JianguoDownloader
+ from DataProcessor import CSV_TO_Pandas
 
- data2 = JianguoDownloader('www.lka.com', './expspe')
+
+ data_ = CSV_TO_Pandas()
+
+
+
+ data_.clean_data('data_csv/Electric Vehicle Population Data/Electric_Vehicle_Population_Data.csv', [], [], {})
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: junshan_kit
- Version: 2.2.1
+ Version: 2.2.2
  Summary: This is an optimization tool.
  Author-email: Junshan Yin <junshanyin@163.com>
  Requires-Dist: kaggle==1.7.4.5
@@ -0,0 +1,9 @@
+ junshan_kit/DataProcessor.py,sha256=S-_QG2ZkHCGyhS8cxYEnO9z1vyKMrNHYd2j1DuAeNG0,6266
+ junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ junshan_kit/datahub.py,sha256=_Q_3AlZ8vk1Ma6h9I44SxWBA8w9m1CQNvYztMcsxzUo,5377
+ junshan_kit/kit.py,sha256=h4Q_87hEJbXH4A9ryaGMu_nle5RlM8OR_PaW_hWCVBY,1040
+ junshan_kit/meta.py,sha256=SiY9P93aABrksNE6G3ft5gzcuP2cUgc4Vx6LH7ZFmzg,10113
+ junshan_kit/test.py,sha256=FgzG4oG7kkq6rWasxdBSY1qx_B0navRI5Ei-wJ1Dvo0,180
+ junshan_kit-2.2.2.dist-info/METADATA,sha256=Qe9kokd4FFGlKhg5NDaMhpQrhRSulPvCAr4wcp9rsEo,329
+ junshan_kit-2.2.2.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
+ junshan_kit-2.2.2.dist-info/RECORD,,
@@ -1,9 +0,0 @@
- junshan_kit/DataProcessor.py,sha256=AW_1jROexC3s41-RgzqzYVwPI0sOf3tzjiph4qa_Vcw,3882
- junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- junshan_kit/datahub.py,sha256=I34e26psFS8WK4X6SNucKPLtdBm0Ujzqa0VDIRACah4,5163
- junshan_kit/kit.py,sha256=Y-GD6rPxi0BG4V_pALcYGUBt4GBCl2jbUE3MKLvGIq0,721
- junshan_kit/meta.py,sha256=5aHyUPVr3P3yoAdC4DzOZv4AtaO9iX8zGjluwpOly6Q,10017
- junshan_kit/test.py,sha256=uSckjcr_Wgj__YPTwD6x0GY8Hfn5GBEXIpRf9vIYBbU,91
- junshan_kit-2.2.1.dist-info/METADATA,sha256=0dNpnGogyiuxiC9JJXuD0aSzMjigCzps2ylaWgl08YM,329
- junshan_kit-2.2.1.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
- junshan_kit-2.2.1.dist-info/RECORD,,