junshan-kit 2.2.0__py2.py3-none-any.whl → 2.2.2__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- junshan_kit/DataProcessor.py +63 -2
- junshan_kit/datahub.py +7 -0
- junshan_kit/kit.py +35 -0
- junshan_kit/meta.py +21 -17
- junshan_kit/test.py +7 -2
- {junshan_kit-2.2.0.dist-info → junshan_kit-2.2.2.dist-info}/METADATA +1 -1
- junshan_kit-2.2.2.dist-info/RECORD +9 -0
- junshan_kit-2.2.0.dist-info/RECORD +0 -8
- {junshan_kit-2.2.0.dist-info → junshan_kit-2.2.2.dist-info}/WHEEL +0 -0
junshan_kit/DataProcessor.py
CHANGED
@@ -1,10 +1,17 @@
|
|
1
|
+
"""
|
2
|
+
----------------------------------------------------------------------
|
3
|
+
>>> Author : Junshan Yin
|
4
|
+
>>> Last Updated : 2025-10-12
|
5
|
+
----------------------------------------------------------------------
|
6
|
+
"""
|
1
7
|
|
2
8
|
import pandas as pd
|
3
9
|
import os
|
4
10
|
from sklearn.preprocessing import StandardScaler
|
5
11
|
import junshan_kit.datahub
|
12
|
+
import kit
|
6
13
|
|
7
|
-
class
|
14
|
+
class CSVToPandasMeta:
|
8
15
|
def __init__(self):
|
9
16
|
self.data_downloader = junshan_kit.datahub.kaggle_data()
|
10
17
|
|
@@ -79,7 +86,7 @@ class CSVToPandas:
|
|
79
86
|
scaler = StandardScaler()
|
80
87
|
df[num_cols] = scaler.fit_transform(df[num_cols])
|
81
88
|
|
82
|
-
#
|
89
|
+
# The size after export
|
83
90
|
m_export, n_export = df.shape
|
84
91
|
|
85
92
|
if show_info:
|
@@ -103,3 +110,57 @@ class CSVToPandas:
|
|
103
110
|
|
104
111
|
|
105
112
|
|
113
|
+
class CSV_TO_Pandas:
|
114
|
+
def __init__(self):
|
115
|
+
pass
|
116
|
+
|
117
|
+
def clean_data(self, csv_path, drop_cols: list, label_col: str, label_map: dict, print_info = False):
|
118
|
+
# Step 0: Load the dataset
|
119
|
+
df = pd.read_csv(csv_path)
|
120
|
+
|
121
|
+
# Save original size
|
122
|
+
m_original, n_original = df.shape
|
123
|
+
|
124
|
+
# Step 1: Drop non-informative columns
|
125
|
+
df = df.drop(columns=drop_cols)
|
126
|
+
|
127
|
+
# Step 2: Remove rows with missing values
|
128
|
+
df = df.dropna(axis=0, how='any')
|
129
|
+
m_encoded, n_encoded = df.shape
|
130
|
+
|
131
|
+
# Step 3: Map target label to -1 and +1
|
132
|
+
df[label_col] = df[label_col].map(label_map)
|
133
|
+
|
134
|
+
# Step 4: Encode categorical features (exclude label column)
|
135
|
+
text_feature_cols = df.select_dtypes(include=['object', 'string', 'category']).columns
|
136
|
+
text_feature_cols = [col for col in text_feature_cols if col != label_col] # ✅ exclude label
|
137
|
+
|
138
|
+
df = pd.get_dummies(df, columns=text_feature_cols, dtype=int)
|
139
|
+
m_cleaned, n_cleaned = df.shape
|
140
|
+
|
141
|
+
# print info
|
142
|
+
if print_info:
|
143
|
+
pos_count = (df[label_col] == 1).sum()
|
144
|
+
neg_count = (df[label_col] == -1).sum()
|
145
|
+
|
146
|
+
# Step 5: Print dataset information
|
147
|
+
print('\n' + '='*80)
|
148
|
+
print(f"{'Dataset Info':^70}")
|
149
|
+
print('='*80)
|
150
|
+
print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
|
151
|
+
print(f"{'Size after dropping NaN & non-feature cols:':<40} {m_cleaned} rows x {n_cleaned} cols")
|
152
|
+
print(f"{'Positive samples (+1):':<40} {pos_count}")
|
153
|
+
print(f"{'Negative samples (-1):':<40} {neg_count}")
|
154
|
+
print(f"{'Size after one-hot encoding:':<40} {m_encoded} rows x {n_encoded} cols")
|
155
|
+
print('-'*80)
|
156
|
+
print(f"Note:")
|
157
|
+
print(f"{'Label column:':<40} {label_col}")
|
158
|
+
print(f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}")
|
159
|
+
print(f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}")
|
160
|
+
print('='*80 + '\n')
|
161
|
+
|
162
|
+
return df
|
163
|
+
|
164
|
+
|
165
|
+
|
166
|
+
|
junshan_kit/datahub.py
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
"""
|
2
|
+
----------------------------------------------------------------------
|
3
|
+
>>> Author : Junshan Yin
|
4
|
+
>>> Last Updated : 2025-10-12
|
5
|
+
----------------------------------------------------------------------
|
6
|
+
"""
|
7
|
+
|
1
8
|
import kagglehub
|
2
9
|
import os, time
|
3
10
|
import warnings
|
junshan_kit/kit.py
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
"""
|
2
|
+
----------------------------------------------------------------------
|
3
|
+
>>> Author : Junshan Yin
|
4
|
+
>>> Last Updated : 2025-10-13
|
5
|
+
----------------------------------------------------------------------
|
6
|
+
"""
|
7
|
+
|
8
|
+
import zipfile
|
9
|
+
import os
|
10
|
+
|
11
|
+
def unzip_file(zip_path: str, unzip_folder: str):
|
12
|
+
"""
|
13
|
+
Args:
|
14
|
+
zip_path (str): Path to the ZIP file to extract.
|
15
|
+
unzip_folder (str, optional): Folder to extract files into.
|
16
|
+
If None, the function will create a folder with the same
|
17
|
+
name as the ZIP file (without extension).
|
18
|
+
|
19
|
+
Examples:
|
20
|
+
>>> zip_path = "./downloads/data.zip"
|
21
|
+
>>> unzip_folder = "./exp_data/data"
|
22
|
+
>>> unzip_file(zip_path, unzip_folder)
|
23
|
+
"""
|
24
|
+
|
25
|
+
if unzip_folder is None:
|
26
|
+
unzip_folder = os.path.splitext(os.path.basename(zip_path))[0]
|
27
|
+
|
28
|
+
os.makedirs(unzip_folder, exist_ok=True)
|
29
|
+
|
30
|
+
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
31
|
+
zip_ref.extractall(unzip_folder)
|
32
|
+
|
33
|
+
print(f"✅ Extracted '{zip_path}' to '{os.path.abspath(unzip_folder)}'")
|
34
|
+
|
35
|
+
|
junshan_kit/meta.py
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
"""
|
2
|
+
----------------------------------------------------------------------
|
3
|
+
>>> Author : Junshan Yin
|
4
|
+
>>> Last Updated : 2025-10-12
|
5
|
+
----------------------------------------------------------------------
|
6
|
+
"""
|
7
|
+
|
1
8
|
import os
|
2
9
|
import time
|
3
10
|
import shutil
|
@@ -10,13 +17,16 @@ from selenium.webdriver.common.by import By
|
|
10
17
|
from selenium.webdriver.support.ui import WebDriverWait
|
11
18
|
from selenium.webdriver.support import expected_conditions as EC
|
12
19
|
|
20
|
+
|
13
21
|
# =============================================================
|
14
22
|
# KAGGLE DATA MANAGEMENT
|
15
23
|
# =============================================================
|
16
24
|
|
17
25
|
class KaggleData:
|
18
26
|
def list_datasets(self):
|
19
|
-
"""
|
27
|
+
"""
|
28
|
+
List available datasets from a specific user.
|
29
|
+
"""
|
20
30
|
api = KaggleApi()
|
21
31
|
api.authenticate()
|
22
32
|
datasets = api.dataset_list(user='junshan888')
|
@@ -51,6 +61,11 @@ class KaggleData:
|
|
51
61
|
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
52
62
|
|
53
63
|
class JianguoyunDownloaderChrome:
|
64
|
+
""" Example:
|
65
|
+
>>> url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
|
66
|
+
>>> downloader = JianguoyunDownloaderChrome(url)
|
67
|
+
>>> downloader.run()
|
68
|
+
"""
|
54
69
|
def __init__(self, url, download_path="./downloads"):
|
55
70
|
self.url = url
|
56
71
|
self.download_path = os.path.abspath(download_path)
|
@@ -148,6 +163,11 @@ from selenium.webdriver.firefox.options import Options as FirefoxOptions
|
|
148
163
|
from selenium.webdriver.firefox.service import Service
|
149
164
|
|
150
165
|
class JianguoyunDownloaderFirefox:
|
166
|
+
""" Example:
|
167
|
+
>>> url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
|
168
|
+
>>> downloader = JianguoyunDownloaderFirefox(url)
|
169
|
+
>>> downloader.run()
|
170
|
+
"""
|
151
171
|
def __init__(self, url, download_path="./downloads"):
|
152
172
|
self.url = url
|
153
173
|
self.download_path = os.path.abspath(download_path)
|
@@ -234,19 +254,3 @@ class JianguoyunDownloaderFirefox:
|
|
234
254
|
self.close()
|
235
255
|
print('*' * 60)
|
236
256
|
|
237
|
-
|
238
|
-
# =============================================================
|
239
|
-
# MAIN
|
240
|
-
# =============================================================
|
241
|
-
|
242
|
-
if __name__ == "__main__":
|
243
|
-
url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
|
244
|
-
|
245
|
-
use_chrome = True # Set True to use Chrome, False for Firefox
|
246
|
-
|
247
|
-
if use_chrome:
|
248
|
-
downloader = JianguoyunDownloaderChrome(url)
|
249
|
-
else:
|
250
|
-
downloader = JianguoyunDownloaderFirefox(url)
|
251
|
-
|
252
|
-
downloader.run()
|
junshan_kit/test.py
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
-
from
|
1
|
+
from DataProcessor import CSV_TO_Pandas
|
2
2
|
|
3
|
-
|
3
|
+
|
4
|
+
data_ = CSV_TO_Pandas()
|
5
|
+
|
6
|
+
|
7
|
+
|
8
|
+
data_.clean_data('data_csv/Electric Vehicle Population Data/Electric_Vehicle_Population_Data.csv', [], [], {})
|
@@ -0,0 +1,9 @@
|
|
1
|
+
junshan_kit/DataProcessor.py,sha256=S-_QG2ZkHCGyhS8cxYEnO9z1vyKMrNHYd2j1DuAeNG0,6266
|
2
|
+
junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
junshan_kit/datahub.py,sha256=_Q_3AlZ8vk1Ma6h9I44SxWBA8w9m1CQNvYztMcsxzUo,5377
|
4
|
+
junshan_kit/kit.py,sha256=h4Q_87hEJbXH4A9ryaGMu_nle5RlM8OR_PaW_hWCVBY,1040
|
5
|
+
junshan_kit/meta.py,sha256=SiY9P93aABrksNE6G3ft5gzcuP2cUgc4Vx6LH7ZFmzg,10113
|
6
|
+
junshan_kit/test.py,sha256=FgzG4oG7kkq6rWasxdBSY1qx_B0navRI5Ei-wJ1Dvo0,180
|
7
|
+
junshan_kit-2.2.2.dist-info/METADATA,sha256=Qe9kokd4FFGlKhg5NDaMhpQrhRSulPvCAr4wcp9rsEo,329
|
8
|
+
junshan_kit-2.2.2.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
|
9
|
+
junshan_kit-2.2.2.dist-info/RECORD,,
|
@@ -1,8 +0,0 @@
|
|
1
|
-
junshan_kit/DataProcessor.py,sha256=AW_1jROexC3s41-RgzqzYVwPI0sOf3tzjiph4qa_Vcw,3882
|
2
|
-
junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
junshan_kit/datahub.py,sha256=I34e26psFS8WK4X6SNucKPLtdBm0Ujzqa0VDIRACah4,5163
|
4
|
-
junshan_kit/meta.py,sha256=5aHyUPVr3P3yoAdC4DzOZv4AtaO9iX8zGjluwpOly6Q,10017
|
5
|
-
junshan_kit/test.py,sha256=uSckjcr_Wgj__YPTwD6x0GY8Hfn5GBEXIpRf9vIYBbU,91
|
6
|
-
junshan_kit-2.2.0.dist-info/METADATA,sha256=aWDiR4w_Z7sVVrLcqjQNYgt3L-iFWSydzcoiUPqDsg8,329
|
7
|
-
junshan_kit-2.2.0.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
|
8
|
-
junshan_kit-2.2.0.dist-info/RECORD,,
|
File without changes
|