junshan-kit 2.2.3__py2.py3-none-any.whl → 2.5.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- junshan_kit/DataHub.py +114 -0
- junshan_kit/DataProcessor.py +280 -138
- junshan_kit/DataSets.py +377 -0
- junshan_kit/Evaluate_Metrics.py +40 -0
- junshan_kit/ModelsHub.py +212 -0
- junshan_kit/ParametersHub.py +419 -0
- junshan_kit/Print_Info.py +63 -0
- junshan_kit/TrainingHub.py +174 -0
- junshan_kit/kit.py +279 -2
- {junshan_kit-2.2.3.dist-info → junshan_kit-2.5.1.dist-info}/METADATA +2 -4
- junshan_kit-2.5.1.dist-info/RECORD +13 -0
- junshan_kit/datahub.py +0 -146
- junshan_kit/meta.py +0 -256
- junshan_kit/test.py +0 -8
- junshan_kit-2.2.3.dist-info/RECORD +0 -9
- {junshan_kit-2.2.3.dist-info → junshan_kit-2.5.1.dist-info}/WHEEL +0 -0
junshan_kit/DataHub.py
ADDED
@@ -0,0 +1,114 @@
+import torchvision, torch
+import torchvision.transforms as transforms
+import pandas as pd
+
+from junshan_kit import DataSets, DataProcessor
+
+
+
+
+
+
+
+
+
+
+
+def Adult_Income_Prediction(Paras):
+
+    df = DataSets.adult_income_prediction()
+    transform = {
+        "train_size": 0.7,
+        "normalization": True
+    }
+    label_col='income'
+
+    train_dataset, test_dataset, transform = DataProcessor.Pandas_TO_Torch(df, label_col).to_torch(transform, Paras)
+
+    return train_dataset, test_dataset, transform
+
+
+def Credit_Card_Fraud_Detection(Paras):
+    df = DataSets.credit_card_fraud_detection()
+    transform = {
+        "train_size": 0.7,
+        "normalization": True
+    }
+    label_col='Class'
+
+    train_dataset, test_dataset, transform = DataProcessor.Pandas_TO_Torch(df, label_col).to_torch(transform, Paras)
+
+    return train_dataset, test_dataset, transform
+
+
+
+def MNIST(Paras, model_name):
+    """
+    Load the MNIST dataset and return both the training and test sets,
+    along with the transformation applied (ToTensor).
+    """
+    transform = torchvision.transforms.ToTensor()
+
+    train_dataset = torchvision.datasets.MNIST(
+        root='./exp_data/MNIST',
+        train=True,
+        download=True,
+        transform=transform
+    )
+
+    test_dataset = torchvision.datasets.MNIST(
+        root='./exp_data/MNIST',
+        train=False,
+        download=True,
+        transform=transform
+    )
+
+    if Paras["model_type"][model_name] == "binary":
+
+        train_mask = (train_dataset.targets == 0) | (train_dataset.targets == 1)
+        test_mask = (test_dataset.targets == 0) | (test_dataset.targets == 1)
+
+        train_indices = torch.nonzero(train_mask, as_tuple=True)[0]
+        test_indices = torch.nonzero(test_mask, as_tuple=True)[0]
+
+        train_dataset = torch.utils.data.Subset(train_dataset, train_indices.tolist())
+        test_dataset = torch.utils.data.Subset(test_dataset, test_indices.tolist())
+
+    return train_dataset, test_dataset, transform
+
+
+def CIFAR100(Paras, model_name):
+    """
+    Load the CIFAR-100 dataset with standard normalization and return both
+    the training and test sets, along with the transformation applied.
+    """
+    transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.5071, 0.4867, 0.4408],
+                             std=[0.2675, 0.2565, 0.2761])
+    ])
+
+    train_dataset = torchvision.datasets.CIFAR100(
+        root='./exp_data/CIFAR100',
+        train=True,
+        download=True,
+        transform=transform
+    )
+
+    test_dataset = torchvision.datasets.CIFAR100(
+        root='./exp_data/CIFAR100',
+        train=False,
+        download=True,
+        transform=transform
+    )
+    if Paras["model_type"][model_name] == "binary":
+        train_mask = (torch.tensor(train_dataset.targets) == 0) | (torch.tensor(train_dataset.targets) == 1)
+        test_mask = (torch.tensor(test_dataset.targets) == 0) | (torch.tensor(test_dataset.targets) == 1)
+
+        train_indices = torch.nonzero(train_mask, as_tuple=True)[0]
+        test_indices = torch.nonzero(test_mask, as_tuple=True)[0]
+
+        train_dataset = torch.utils.data.Subset(train_dataset, train_indices.tolist())
+        test_dataset = torch.utils.data.Subset(test_dataset, test_indices.tolist())
+
+    return train_dataset, test_dataset, transform
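
All four loaders added in DataHub.py share one calling convention: each takes a `Paras` dict (the image loaders also take a model name) and returns `(train_dataset, test_dataset, transform)`. A minimal driver sketch follows; the `Paras` layout (a `"model_type"` mapping plus a `"seed"` key) is inferred from how the diff reads it, not documented by the package, so treat it as an assumption.

# Hypothetical driver for the new DataHub loaders -- a sketch, not package code.
from torch.utils.data import DataLoader
from junshan_kit import DataHub

Paras = {
    "seed": 42,                          # consumed by the tabular loaders' train/test split
    "model_type": {"logreg": "binary"},  # "binary" restricts MNIST/CIFAR100 to classes 0 and 1
}

train_ds, test_ds, transform = DataHub.MNIST(Paras, "logreg")
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)

for x, y in train_loader:
    print(x.shape, y.shape)  # e.g. torch.Size([128, 1, 28, 28]) torch.Size([128])
    break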
junshan_kit/DataProcessor.py
CHANGED
@@ -1,148 +1,164 @@
 """
 ----------------------------------------------------------------------
->>> Author       : Junshan Yin
+>>> Author       : Junshan Yin
 >>> Last Updated : 2025-10-12
 ----------------------------------------------------------------------
 """

 import pandas as pd
-import
+import torch
 from sklearn.preprocessing import StandardScaler
-import junshan_kit.datahub
-import zipfile

-
+
+class CSV_TO_Pandas:
     def __init__(self):
-
-
+        pass
+
+    def _trans_time_fea(self, df, time_info: dict):
+        """
+        Transform and extract time-based features from a specified datetime column.
+
+        This function converts a given column to pandas datetime format and
+        extracts different time-related features based on the specified mode.
+        It supports two extraction modes:
+        - type = 0: Extracts basic components (year, month, day, hour)
+        - type = 1: Extracts hour, day of week, and weekend indicator
+
+        Parameters
+        ----------
+        df : pandas.DataFrame
+            Input DataFrame containing the datetime column.
+        time_info:
+            - time_col_name : str
+                Name of the column containing time or datetime values.
+            - trans_type : int, optional, default=1
+                - 0 : Extract ['year', 'month', 'day', 'hour']
+                - 1 : Extract ['hour', 'dayofweek', 'is_weekend']
+
+        Returns
+        -------
+        pandas.DataFrame
+            The DataFrame with newly added time-based feature columns.
+
+        Notes
+        -----
+        - Rows that cannot be parsed as valid datetime will be dropped automatically.
+        - 'dayofweek' ranges from 0 (Monday) to 6 (Sunday).
+        - 'is_weekend' equals 1 if the day is Saturday or Sunday, otherwise 0.
+
+        Examples
+        --------
+        >>> import pandas as pd
+        >>> data = pd.DataFrame({
+        ...     'timestamp': ['2023-08-01 12:30:00', '2023-08-05 08:15:00', 'invalid_time']
+        ... })
+        >>> df = handler._trans_time_fea(data, {"time_col_name": "timestamp", "trans_type": 1})
+        >>> print(df)
+                    timestamp  hour  dayofweek  is_weekend
+        0 2023-08-01 12:30:00    12          1           0
+        1 2023-08-05 08:15:00     8          5           1
+        """
+
+        time_col_name, trans_type = time_info['time_col_name'], time_info['trans_type']

-
-        self.csv_path = f'exp_data/{data_name}/{data_name}.csv'
-        if not os.path.exists(self.csv_path):
-            self.data_downloader.download_data(f'{data_name}', f'exp_data/{data_name}')
+        df[time_col_name] = pd.to_datetime(df[time_col_name], errors="coerce")

-
-
-        # download data if not exist
-        self.read_csv(data_name)
-
-        df = pd.read_csv(self.csv_path)
-        m_before, n_before = df.shape
-        df = df.dropna(axis=0, how='any')
-        m_after, n_after = df.shape
-        df['Class'] = df['Class'].replace(0, -1)
-
-        if show_info:
-            pos_count = (df['Class'] == 1).sum()
-            neg_count = (df['Class'] == -1).sum()
-
-            print('\n' + '='*60)
-            print(f"{'CCFD-Kaggle Dataset Info':^60}")
-            print('='*60)
-            print(f"{'Original size:':<25} {m_before} rows x {n_before} cols")
-            print(f"{'Size after dropping NaNs:':<25} {m_after} rows x {n_after} cols")
-            print(f"{'Positive samples (+1):':<25} {pos_count}")
-            print(f"{'Negative samples (-1):':<25} {neg_count}")
-            print(f"{'Export size:':<25} {m_after} rows x {n_after} cols")
-            print('-'*60)
-            print(f"More details: https://www.jianguoyun.com/p/Dd1clVgQ4ZThCxiwzZQGIAA")
-            print('='*60 + '\n')
+        # Drop rows where the datetime conversion failed, and make an explicit copy
+        df = df.dropna(subset=[time_col_name]).copy()

-
-
-
-
-
-        self.read_csv(data_name)
-
-        # read csv
-        df = pd.read_csv(self.csv_path)
-        m_before, n_before = df.shape
-
-        # drop NaNs
-        df = df.dropna(axis=0, how='any')
-        m_after, n_after = df.shape
-
-        # drop unique identifier
-        if 'property_id' in df.columns:
-            df.drop(columns=['property_id'], inplace=True)
-
-        # Replace label 0 with -1
-        df['decision'] = df['decision'].replace(0, -1)
-
-        # Identify categorical and numerical columns
-        cat_cols = ['country', 'city', 'property_type', 'furnishing_status']
-        num_cols = [col for col in df.columns if col not in cat_cols + ['decision']]
-
-        # One-Hot encode categorical columns
-        df = pd.get_dummies(df, columns=cat_cols)
-
-        # Convert boolean columns to int
-        bool_cols = df.select_dtypes(include='bool').columns
-        for col in bool_cols:
-            df[col] = df[col].astype(int)
-
-        # Standardize numerical columns
-        scaler = StandardScaler()
-        df[num_cols] = scaler.fit_transform(df[num_cols])
-
-        # The size after export
-        m_export, n_export = df.shape
-
-        if show_info:
-            pos_count = (df['decision'] == 1).sum()
-            neg_count = (df['decision'] == -1).sum()
-
-            print('\n' + '='*70)
-            print(f"{'GHPDD-Kaggle Dataset Info':^70}")
-            print('='*70)
-            print(f"{'Original size:':<35} {m_before} rows x {n_before} cols")
-            print(f"{'Size after dropping NaNs:':<35} {m_after} rows x {n_after} cols")
-            print(f"{'Positive samples (+1):':<35} {pos_count}")
-            print(f"{'Negative samples (-1):':<35} {neg_count}")
-            print(f"{'Export size (after encoding & scaling):':<35} {m_export} rows x {n_export} cols")
-            print('-'*70)
-            print(f"{'Note: categorical columns one-hot encoded, numerical standardized.'}")
-            print(f"More details: https://www.jianguoyun.com/p/DU6Lr9oQqdHDDRj5sI0GIAA")
-            print('='*70 + '\n')
+        if trans_type == 0:
+            df.loc[:, "year"] = df[time_col_name].dt.year
+            df.loc[:, "month"] = df[time_col_name].dt.month
+            df.loc[:, "day"] = df[time_col_name].dt.day
+            df.loc[:, "hour"] = df[time_col_name].dt.hour

-
+            user_text_fea = ['year','month','day', 'hour']
+            df = pd.get_dummies(df, columns=user_text_fea, dtype=int)

+        elif trans_type == 1:
+            df.loc[:, "hour"] = df[time_col_name].dt.hour
+            df.loc[:, "dayofweek"] = df[time_col_name].dt.dayofweek
+            df.loc[:, "is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)

+            user_text_fea = ['hour','dayofweek','is_weekend']
+            df = pd.get_dummies(df, columns=user_text_fea, dtype=int)

-
-
-
+        elif trans_type == 2:
+            df.loc[:, "year"] = df[time_col_name].dt.year
+            df.loc[:, "month"] = df[time_col_name].dt.month
+            df.loc[:, "day"] = df[time_col_name].dt.day

-    def unzip_file(self, zip_path: str, unzip_folder: str):
-        """
-        Args:
-            zip_path (str): Path to the ZIP file to extract.
-            dest_folder (str, optional): Folder to extract files into.
-                If None, the function will create a folder with the same
-                name as the ZIP file (without extension).
-
-        Examples:
-            >>> zip_path = "./downloads/data.zip"
-            >>> unzip_folder = "./exp_data/data"
-            >>> unzip_file(zip_path, unzip_folder)
-        """

-
-
+            user_text_fea = ['year','month','day']
+            df = pd.get_dummies(df, columns=user_text_fea, dtype=int)
+        else:
+            print("error!")

-
+        df = df.drop(columns=[time_col_name])

-
-            zip_ref.extractall(unzip_folder)
+        return df

-
-
-
+    def preprocess_dataset(
+        self,
+        df,
+        drop_cols: list,
+        label_col: str,
+        label_map: dict,
+        title_name: str,
+        user_one_hot_cols=[],
+        print_info=False,
+        time_info: dict | None = None,
+        missing_strategy = 'drop',  # [drop, mode]
+    ):
+        """
+        Preprocess a CSV dataset by performing data cleaning, label mapping, and feature encoding.
+
+        This function loads a dataset from a CSV file, removes specified non-feature columns,
+        drops rows with missing values, maps the target label to numerical values, and
+        one-hot encodes categorical features. Optionally, it can print dataset statistics
+        before and after preprocessing.

-
+        Args:
+            csv_path (str):
+                Path to the input CSV dataset.
+            drop_cols (list):
+                List of column names to drop from the dataset.
+            label_col (str):
+                Name of the target label column.
+            label_map (dict):
+                Mapping dictionary for label conversion (e.g., {"yes": 1, "no": -1}).
+            print_info (bool, optional):
+                Whether to print preprocessing information and dataset statistics.
+                Defaults to False.
+            title_name (str):
+                Title used for the summary table or report that documents
+                the preprocessing steps and dataset statistics.
+
+        Returns:
+            pandas.DataFrame:
+                The cleaned and preprocessed dataset ready for model input.
+
+        Steps:
+            1. Load the dataset from CSV.
+            2. Drop non-informative or irrelevant columns.
+            3. Remove rows with missing values.
+            4. Map label column to numerical values according to `label_map`.
+            5. One-hot encode categorical (non-label) text features.
+            6. Optionally print dataset information and summary statistics.
+
+        Example:
+            >>> label_map = {"positive": 1, "negative": -1}
+            >>> df = data_handler.preprocess_dataset(
+            ...     csv_path="data/raw.csv",
+            ...     drop_cols=["id", "timestamp"],
+            ...     label_col="sentiment",
+            ...     label_map=label_map,
+            ...     print_info=True
+            ... )
+        """
         # Step 0: Load the dataset
-        df = pd.read_csv(csv_path)
+        # df = pd.read_csv(csv_path)
+        columns = df.columns

         # Save original size
         m_original, n_original = df.shape

@@ -151,17 +167,34 @@ class CSV_TO_Pandas:
         df = df.drop(columns=drop_cols)

         # Step 2: Remove rows with missing values
-
+        if missing_strategy == 'drop':
+            df = df.dropna(axis=0, how="any")
+
+        elif missing_strategy == 'mode':
+            for col in df.columns:
+                if df[col].notna().any():
+                    mode_val = df[col].mode()[0]
+                    df[col] = df[col].fillna(mode_val)
+
         m_encoded, n_encoded = df.shape
-
-
+
+        if time_info is not None:
+            df = self._trans_time_fea(df, time_info)
+
+        # Step 3: Map target label (to -1 and +1)
         df[label_col] = df[label_col].map(label_map)

         # Step 4: Encode categorical features (exclude label column)
-        text_feature_cols = df.select_dtypes(
-
-
-
+        text_feature_cols = df.select_dtypes(
+            include=["object", "string", "category"]
+        ).columns
+        text_feature_cols = [
+            col for col in text_feature_cols if col != label_col
+        ]  # ✅ exclude label
+
+        df = pd.get_dummies(
+            df, columns=text_feature_cols + user_one_hot_cols, dtype=int
+        )
         m_cleaned, n_cleaned = df.shape

         # print info

@@ -170,23 +203,132 @@ class CSV_TO_Pandas:
         neg_count = (df[label_col] == -1).sum()

         # Step 6: Print dataset information
-            print(
-            print(f"{'
-            print(
+            print("\n" + "=" * 80)
+            print(f"{f'{title_name} - Summary':^70}")
+            print("=" * 80)
             print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
-            print(
+            print(
+                f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
+            )
+            print(f"{'missing_strategy:':<40} {missing_strategy}")
+            print(
+                f"{'Dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
+            )
             print(f"{'Positive samples (+1):':<40} {pos_count}")
             print(f"{'Negative samples (-1):':<40} {neg_count}")
-            print(
-
-
+            print(
+                f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
+            )
+            print("-" * 80)
+            print(f"{'More details about preprocessing':^70}")
+            print("-" * 80)
             print(f"{'Label column:':<40} {label_col}")
-            print(f"{'
-            print(f"{'
-
+            print(f"{'label_map:':<40} {label_map}")
+            print(f"{'time column:':<40} {time_info}")
+            if time_info is not None:
+                if time_info["trans_type"] == 0:
+                    print("- 0 : Extract ['year', 'month', 'day', 'hour']")
+                elif time_info["trans_type"] == 1:
+                    print("- 1 : Extract ['hour', 'dayofweek', 'is_weekend']")
+                elif time_info["trans_type"] == 2:
+                    print("- 2 : Extract ['year', 'month', 'day']")
+                else:
+                    assert False
+            print(
+                f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
+            )
+            # print("-" * 80)
+            # print("all columns:")
+            # print(list(columns))
+            # print("=" * 80 + "\n")

         return df
+
+
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from torch.utils.data import Dataset
+
+class Pandas_TO_Torch(Dataset):
+
+    def __init__(self, df: pd.DataFrame,
+                 label_col: str,
+                 ):
+        self.df = df
+        self.label_col = label_col
+
+        # Identify feature columns automatically (all except the label)
+        self.label_col = label_col
+        self.feature_cols = [col for col in self.df.columns if col != label_col]

+        # Extract features and labels
+        self.features = self.df[self.feature_cols].values.astype("float32")
+        self.labels = self.df[self.label_col].values.astype("int64")
+
+
+    def __len__(self):
+        """Return the total number of samples."""
+        return len(self.features)
+
+    def __getitem__(self, idx):
+        x = torch.tensor(self.features[idx], dtype=torch.float32)
+        y = torch.tensor(self.labels[idx], dtype=torch.long)

+        return x, y
+
+    def __repr__(self):
+        info = (
+            f"Dataset CustomNumericDataset\n"
+            f"    Number of datapoints: {len(self)}\n"
+            f"    Features: {self.features.shape[1]}\n"
+        )
+        return info
+
+    def to_torch(self, transform, Paras):
+        fea_cols = [col for col in self.df.columns if col != self.label_col]

+        if transform["normalization"]:
+            scaler = StandardScaler()
+            self.df[fea_cols] = scaler.fit_transform(self.df[fea_cols])

+        # Train/test split
+
+        train_df, test_df = train_test_split(self.df, train_size=transform["train_size"], random_state=Paras["seed"], stratify=self.df[self.label_col])
+
+        # Create datasets
+        train_dataset = Pandas_TO_Torch(train_df, self.label_col)
+        test_dataset = Pandas_TO_Torch(test_df, self.label_col)
+
+        return train_dataset, test_dataset, transform
+
+
+class TXT_TO_Numpy:
+    def __init__(self):
+        pass
+
+
+class bz2_To_Numpy:
+    def __init__(self):
+        pass
+
+
+
+
+
+
+
+
+class StepByStep:
+    def __init__(self):
+        pass
+
+    def print_text_fea(self, df, text_feature_cols):
+        for col in text_feature_cols:
+            print(f"\n{'-'*80}")
+            print(f'Feature: "{col}"')
+            print(f"{'-'*80}")
+            print(
+                f"Unique values ({len(df[col].unique())}): {df[col].unique().tolist()}"
+            )
+
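
Taken together, the DataProcessor rewrite replaces the old per-dataset routines (CCFD, GHPDD) with a generic two-stage pipeline: CSV_TO_Pandas.preprocess_dataset cleans an in-memory DataFrame (column drops, missing-value handling, optional time features, label mapping, one-hot encoding), and Pandas_TO_Torch.to_torch standardizes, splits, and wraps the result as PyTorch datasets. An end-to-end sketch on a toy DataFrame; the data and column names here are illustrative, only the class and method names come from the diff:

# Illustrative run of the new two-stage pipeline -- toy data, not from the package.
import pandas as pd
from junshan_kit import DataProcessor

df = pd.DataFrame({
    "age":   [25, 32, 47, 51, 38, 29],
    "city":  ["NY", "LA", "NY", "SF", "LA", "SF"],
    "label": ["yes", "no", "yes", "no", "no", "yes"],
})

clean = DataProcessor.CSV_TO_Pandas().preprocess_dataset(
    df,
    drop_cols=[],                    # nothing to drop in this toy frame
    label_col="label",
    label_map={"yes": 1, "no": -1},  # map labels to +1 / -1
    title_name="Toy Dataset",
    print_info=True,                 # prints the summary table added in this version
)

# Standardize features, stratified 70/30 split, wrap as torch Datasets
transform = {"train_size": 0.7, "normalization": True}
train_ds, test_ds, _ = DataProcessor.Pandas_TO_Torch(clean, "label").to_torch(
    transform, {"seed": 0}
)

x, y = train_ds[0]
print(len(train_ds), len(test_ds), x.shape, y.item())  # e.g. 4 2 torch.Size([4]) 1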