junshan-kit 2.4.7__py2.py3-none-any.whl → 2.4.9__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of junshan-kit might be problematic.
- junshan_kit/DataHub.py +114 -0
- junshan_kit/DataProcessor.py +114 -24
- junshan_kit/DataSets.py +186 -37
- junshan_kit/{Models.py → ModelsHub.py} +5 -0
- junshan_kit/ParametersHub.py +404 -0
- junshan_kit/Print_Info.py +6 -2
- junshan_kit/TrainingHub.py +75 -0
- junshan_kit/kit.py +94 -30
- {junshan_kit-2.4.7.dist-info → junshan_kit-2.4.9.dist-info}/METADATA +2 -2
- junshan_kit-2.4.9.dist-info/RECORD +12 -0
- junshan_kit/ComOptimizers.py +0 -126
- junshan_kit/ExperimentHub.py +0 -338
- junshan_kit/SPBM.py +0 -350
- junshan_kit/SPBM_func.py +0 -601
- junshan_kit/TrainingParas.py +0 -470
- junshan_kit/check_args.py +0 -116
- junshan_kit/datahub.py +0 -281
- junshan_kit-2.4.7.dist-info/RECORD +0 -16
- {junshan_kit-2.4.7.dist-info → junshan_kit-2.4.9.dist-info}/WHEEL +0 -0
junshan_kit/DataHub.py
ADDED
@@ -0,0 +1,114 @@
+import torchvision, torch
+import torchvision.transforms as transforms
+import pandas as pd
+
+from junshan_kit import DataSets, DataProcessor
+
+
+def Adult_Income_Prediction(Paras):
+
+    df = DataSets.adult_income_prediction()
+    transform = {
+        "train_size": 0.7,
+        "normalization": True
+    }
+    label_col = 'income'
+
+    train_dataset, test_dataset, transform = DataProcessor.Pandas_TO_Torch(df, label_col).to_torch(transform, Paras)
+
+    return train_dataset, test_dataset, transform
+
+
+def Credit_Card_Fraud_Detection(Paras):
+    df = DataSets.credit_card_fraud_detection()
+    transform = {
+        "train_size": 0.7,
+        "normalization": True
+    }
+    label_col = 'Class'
+
+    train_dataset, test_dataset, transform = DataProcessor.Pandas_TO_Torch(df, label_col).to_torch(transform, Paras)
+
+    return train_dataset, test_dataset, transform
+
+
+def MNIST(Paras, model_name):
+    """
+    Load the MNIST dataset and return both the training and test sets,
+    along with the transformation applied (ToTensor).
+    """
+    transform = torchvision.transforms.ToTensor()
+
+    train_dataset = torchvision.datasets.MNIST(
+        root='./exp_data/MNIST',
+        train=True,
+        download=True,
+        transform=transform
+    )
+
+    test_dataset = torchvision.datasets.MNIST(
+        root='./exp_data/MNIST',
+        train=False,
+        download=True,
+        transform=transform
+    )
+
+    if Paras["model_type"][model_name] == "binary":
+        train_mask = (train_dataset.targets == 0) | (train_dataset.targets == 1)
+        test_mask = (test_dataset.targets == 0) | (test_dataset.targets == 1)
+
+        train_indices = torch.nonzero(train_mask, as_tuple=True)[0]
+        test_indices = torch.nonzero(test_mask, as_tuple=True)[0]
+
+        train_dataset = torch.utils.data.Subset(train_dataset, train_indices.tolist())
+        test_dataset = torch.utils.data.Subset(test_dataset, test_indices.tolist())
+
+    return train_dataset, test_dataset, transform
+
+
+def CIFAR100(Paras, model_name):
+    """
+    Load the CIFAR-100 dataset with standard normalization and return both
+    the training and test sets, along with the transformation applied.
+    """
+    transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.5071, 0.4867, 0.4408],
+                             std=[0.2675, 0.2565, 0.2761])
+    ])
+
+    train_dataset = torchvision.datasets.CIFAR100(
+        root='./exp_data/CIFAR100',
+        train=True,
+        download=True,
+        transform=transform
+    )
+
+    test_dataset = torchvision.datasets.CIFAR100(
+        root='./exp_data/CIFAR100',
+        train=False,
+        download=True,
+        transform=transform
+    )
+
+    if Paras["model_type"][model_name] == "binary":
+        train_mask = (torch.tensor(train_dataset.targets) == 0) | (torch.tensor(train_dataset.targets) == 1)
+        test_mask = (torch.tensor(test_dataset.targets) == 0) | (torch.tensor(test_dataset.targets) == 1)
+
+        train_indices = torch.nonzero(train_mask, as_tuple=True)[0]
+        test_indices = torch.nonzero(test_mask, as_tuple=True)[0]
+
+        train_dataset = torch.utils.data.Subset(train_dataset, train_indices.tolist())
+        test_dataset = torch.utils.data.Subset(test_dataset, test_indices.tolist())
+
+    return train_dataset, test_dataset, transform
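
For orientation, a minimal usage sketch of the new DataHub loaders (not part of the diff). The Paras dict below is hypothetical: "seed" feeds the stratified train/test split in DataProcessor.Pandas_TO_Torch.to_torch, "model_type" controls the binary filtering in MNIST/CIFAR100, and "LogReg" is a stand-in model name.

from junshan_kit import DataHub

# Hypothetical parameter dict (presumably populated via ParametersHub).
Paras = {"seed": 42, "model_type": {"LogReg": "binary"}}

# Tabular loader: 70/30 split with standardized features.
train_ds, test_ds, transform = DataHub.Adult_Income_Prediction(Paras)

# Image loader: with a "binary" model type, only digits 0 and 1 are kept.
mnist_train, mnist_test, _ = DataHub.MNIST(Paras, "LogReg")
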
junshan_kit/DataProcessor.py
CHANGED
@@ -6,6 +6,7 @@
 """
 
 import pandas as pd
+import torch
 from sklearn.preprocessing import StandardScaler
 
 
@@ -13,7 +14,6 @@ class CSV_TO_Pandas:
     def __init__(self):
         pass
 
-
     def _trans_time_fea(self, df, time_info: dict):
        """
        Transform and extract time-based features from a specified datetime column.
@@ -82,6 +82,15 @@ class CSV_TO_Pandas:
 
             user_text_fea = ['hour','dayofweek','is_weekend']
             df = pd.get_dummies(df, columns=user_text_fea, dtype=int)
+
+        elif trans_type == 2:
+            df.loc[:, "year"] = df[time_col_name].dt.year
+            df.loc[:, "month"] = df[time_col_name].dt.month
+            df.loc[:, "day"] = df[time_col_name].dt.day
+
+
+            user_text_fea = ['year','month','day']
+            df = pd.get_dummies(df, columns=user_text_fea, dtype=int)
         else:
             print("error!")
 
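
The new trans_type == 2 branch one-hot encodes calendar parts of the datetime column. The same pandas pattern in isolation, on a toy frame (not package code):

import pandas as pd

df = pd.DataFrame({"ts": pd.to_datetime(["2023-01-05", "2024-07-09"])})
df["year"] = df["ts"].dt.year
df["month"] = df["ts"].dt.month
df["day"] = df["ts"].dt.day
# Expands into indicator columns such as year_2023, year_2024, month_1, month_7, ...
df = pd.get_dummies(df, columns=["year", "month", "day"], dtype=int)
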
@@ -91,15 +100,15 @@ class CSV_TO_Pandas:
 
     def preprocess_dataset(
         self,
-
+        df,
         drop_cols: list,
         label_col: str,
         label_map: dict,
         title_name: str,
         user_one_hot_cols=[],
         print_info=False,
-
-
+        time_info: dict | None = None,
+        missing_strategy = 'drop', # [drop, mode]
     ):
         """
         Preprocess a CSV dataset by performing data cleaning, label mapping, and feature encoding.
@@ -121,6 +130,9 @@ class CSV_TO_Pandas:
             print_info (bool, optional):
                 Whether to print preprocessing information and dataset statistics.
                 Defaults to False.
+            title_name (str):
+                Title used for the summary table or report that documents
+                the preprocessing steps and dataset statistics.
 
         Returns:
             pandas.DataFrame:
@@ -145,7 +157,7 @@
         ... )
         """
         # Step 0: Load the dataset
-        df = pd.read_csv(csv_path)
+        # df = pd.read_csv(csv_path)
         columns = df.columns
 
         # Save original size
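
Since pd.read_csv is now commented out, preprocess_dataset expects a ready-made DataFrame rather than a CSV path. A hedged sketch of the new call shape (file name and column names are placeholders):

import pandas as pd
from junshan_kit.DataProcessor import CSV_TO_Pandas

df = pd.read_csv("some_dataset.csv")  # the caller loads the file now
df = CSV_TO_Pandas().preprocess_dataset(
    df,
    drop_cols=["id"],                  # placeholder column names
    label_col="label",
    label_map={"no": 0, "yes": 1},
    title_name="demo",
)
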
@@ -155,7 +167,15 @@
         df = df.drop(columns=drop_cols)
 
         # Step 2: Remove rows with missing values
-        df = df.dropna(axis=0, how="any")
+        if missing_strategy == 'drop':
+            df = df.dropna(axis=0, how="any")
+
+        elif missing_strategy == 'mode':
+            for col in df.columns:
+                if df[col].notna().any():
+                    mode_val = df[col].mode()[0]
+                    df[col] = df[col].fillna(mode_val)
+
         m_encoded, n_encoded = df.shape
 
         if time_info is not None:
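
The 'mode' strategy fills each column's missing entries with that column's most frequent value instead of dropping rows. The same idiom on a toy frame (not package code):

import pandas as pd

df = pd.DataFrame({"a": [1, None, 1], "b": ["x", "y", None]})
for col in df.columns:
    if df[col].notna().any():          # skip columns that are entirely NaN
        df[col] = df[col].fillna(df[col].mode()[0])
# a -> [1.0, 1.0, 1.0]; b -> ["x", "y", "x"] ("x" wins the tie in mode())
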
@@ -177,17 +197,6 @@
         )
         m_cleaned, n_cleaned = df.shape
 
-        if Standard:
-            # Identify numerical columns Standardize numerical columns
-            num_cols = [
-                col
-                for col in df.columns
-                if col
-                not in list(text_feature_cols) + [label_col] + [user_one_hot_cols]
-            ]
-            scaler = StandardScaler()
-            df[num_cols] = scaler.fit_transform(df[num_cols])
-
         # print info
         if print_info:
             pos_count = (df[label_col] == 1).sum()
@@ -201,6 +210,7 @@
             print(
                 f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
             )
+            print(f"{'missing_strategy:':<40} {missing_strategy}")
             print(
                 f"{'Dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
             )
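
The final hunk below adds the Pandas_TO_Torch dataset class, which takes over the standardization removed from preprocess_dataset above. A minimal usage sketch with a hypothetical toy DataFrame:

import pandas as pd
from junshan_kit.DataProcessor import Pandas_TO_Torch

df = pd.DataFrame({"f1": [0.1, 0.4, 0.2, 0.9], "y": [0, 1, 0, 1]})
transform = {"train_size": 0.5, "normalization": True}
Paras = {"seed": 0}  # "seed" drives the stratified train/test split

train_ds, test_ds, _ = Pandas_TO_Torch(df, "y").to_torch(transform, Paras)
x, y = train_ds[0]   # x: float32 feature tensor, y: int64 label tensor
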
@@ -216,19 +226,97 @@
 
             print(f"{'label_map:':<40} {label_map}")
             print(f"{'time column:':<40} {time_info}")
             if time_info is not None:
-
-
-
+                if time_info["trans_type"] == 0:
+                    print("- 0 : Extract ['year', 'month', 'day', 'hour']")
+                elif time_info["trans_type"] == 1:
+                    print("- 1 : Extract ['hour', 'dayofweek', 'is_weekend']")
+                elif time_info["trans_type"] == 2:
+                    print("- 2 : Extract ['year', 'month', 'day']")
+                else:
+                    assert False
             print(
                 f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
             )
-            print("-" * 80)
-            print("all columns:")
-            print(list(columns))
-            print("=" * 80 + "\n")
+            # print("-" * 80)
+            # print("all columns:")
+            # print(list(columns))
+            # print("=" * 80 + "\n")
 
         return df
+
+
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from torch.utils.data import Dataset
+
+class Pandas_TO_Torch(Dataset):
+
+    def __init__(self, df: pd.DataFrame,
+                 label_col: str,
+                 ):
+        self.df = df
+        self.label_col = label_col
 
+        # Identify feature columns automatically (all except the label)
+        self.label_col = label_col
+        self.feature_cols = [col for col in self.df.columns if col != label_col]
+
+        # Extract features and labels
+        self.features = self.df[self.feature_cols].values.astype("float32")
+        self.labels = self.df[self.label_col].values.astype("int64")
+
+
+    def __len__(self):
+        """Return the total number of samples."""
+        return len(self.features)
+
+    def __getitem__(self, idx):
+        x = torch.tensor(self.features[idx], dtype=torch.float32)
+        y = torch.tensor(self.labels[idx], dtype=torch.long)
+
+        return x, y
+
+    def __repr__(self):
+        info = (
+            f"Dataset CustomNumericDataset\n"
+            f"    Number of datapoints: {len(self)}\n"
+            f"    Features: {self.features.shape[1]}\n"
+        )
+        return info
+
+    def to_torch(self, transform, Paras):
+        fea_cols = [col for col in self.df.columns if col != self.label_col]
+
+        if transform["normalization"]:
+            scaler = StandardScaler()
+            self.df[fea_cols] = scaler.fit_transform(self.df[fea_cols])
+
+        # Train/test split
+        train_df, test_df = train_test_split(self.df, train_size=transform["train_size"], random_state=Paras["seed"], stratify=self.df[self.label_col])
+
+        # Create datasets
+        train_dataset = Pandas_TO_Torch(train_df, self.label_col)
+        test_dataset = Pandas_TO_Torch(test_df, self.label_col)
+
+        return train_dataset, test_dataset, transform
+
+
+class TXT_TO_Numpy:
+    def __init__(self):
+        pass
+
+
+class bz2_To_Numpy:
+    def __init__(self):
+        pass
+
+
+
+
 
 class StepByStep:
     def __init__(self):
@@ -242,3 +330,5 @@ class StepByStep:
         print(
             f"Unique values ({len(df[col].unique())}): {df[col].unique().tolist()}"
         )
+
+