junshan-kit 2.4.8__py2.py3-none-any.whl → 2.4.9__py2.py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

This version of junshan-kit has been flagged as potentially problematic.

junshan_kit/DataHub.py ADDED
@@ -0,0 +1,114 @@
+ import torchvision, torch
+ import torchvision.transforms as transforms
+ import pandas as pd
+
+ from junshan_kit import DataSets, DataProcessor
+
+
+
+
+
+
+
+
+
+
+
+
+ def Adult_Income_Prediction(Paras):
+
+     df = DataSets.adult_income_prediction()
+     transform = {
+         "train_size": 0.7,
+         "normalization": True
+     }
+     label_col='income'
+
+     train_dataset, test_dataset, transform = DataProcessor.Pandas_TO_Torch(df, label_col).to_torch(transform, Paras)
+
+     return train_dataset, test_dataset, transform
+
+
+ def Credit_Card_Fraud_Detection(Paras):
+     df = DataSets.credit_card_fraud_detection()
+     transform = {
+         "train_size": 0.7,
+         "normalization": True
+     }
+     label_col='Class'
+
+     train_dataset, test_dataset, transform = DataProcessor.Pandas_TO_Torch(df, label_col).to_torch(transform, Paras)
+
+     return train_dataset, test_dataset, transform
+
+
+
+ def MNIST(Paras, model_name):
+     """
+     Load the MNIST dataset and return both the training and test sets,
+     along with the transformation applied (ToTensor).
+     """
+     transform = torchvision.transforms.ToTensor()
+
+     train_dataset = torchvision.datasets.MNIST(
+         root='./exp_data/MNIST',
+         train=True,
+         download=True,
+         transform=transform
+     )
+
+     test_dataset = torchvision.datasets.MNIST(
+         root='./exp_data/MNIST',
+         train=False,
+         download=True,
+         transform=transform
+     )
+
+     if Paras["model_type"][model_name] == "binary":
+
+         train_mask = (train_dataset.targets == 0) | (train_dataset.targets == 1)
+         test_mask = (test_dataset.targets == 0) | (test_dataset.targets == 1)
+
+         train_indices = torch.nonzero(train_mask, as_tuple=True)[0]
+         test_indices = torch.nonzero(test_mask, as_tuple=True)[0]
+
+         train_dataset = torch.utils.data.Subset(train_dataset, train_indices.tolist())
+         test_dataset = torch.utils.data.Subset(test_dataset, test_indices.tolist())
+
+     return train_dataset, test_dataset, transform
+
+
+ def CIFAR100(Paras, model_name):
+     """
+     Load the CIFAR-100 dataset with standard normalization and return both
+     the training and test sets, along with the transformation applied.
+     """
+     transform = transforms.Compose([
+         transforms.ToTensor(),
+         transforms.Normalize(mean=[0.5071, 0.4867, 0.4408],
+                              std=[0.2675, 0.2565, 0.2761])
+     ])
+
+     train_dataset = torchvision.datasets.CIFAR100(
+         root='./exp_data/CIFAR100',
+         train=True,
+         download=True,
+         transform=transform
+     )
+
+     test_dataset = torchvision.datasets.CIFAR100(
+         root='./exp_data/CIFAR100',
+         train=False,
+         download=True,
+         transform=transform
+     )
+     if Paras["model_type"][model_name] == "binary":
+         train_mask = (torch.tensor(train_dataset.targets) == 0) | (torch.tensor(train_dataset.targets) == 1)
+         test_mask = (torch.tensor(test_dataset.targets) == 0) | (torch.tensor(test_dataset.targets) == 1)
+
+         train_indices = torch.nonzero(train_mask, as_tuple=True)[0]
+         test_indices = torch.nonzero(test_mask, as_tuple=True)[0]
+
+         train_dataset = torch.utils.data.Subset(train_dataset, train_indices.tolist())
+         test_dataset = torch.utils.data.Subset(test_dataset, test_indices.tolist())
+
+     return train_dataset, test_dataset, transform
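
Every loader in the new DataHub.py follows the same contract: it takes a `Paras` dict and returns `(train_dataset, test_dataset, transform)`. A minimal usage sketch, assuming a hypothetical `Paras` with the `seed` and `model_type` keys this code reads (the model name "logreg" is made up for the example):

    from junshan_kit import DataHub

    # Hypothetical Paras; "seed" is used for the train/test split and
    # "model_type" is looked up per model name in MNIST/CIFAR100.
    Paras = {"seed": 42, "model_type": {"logreg": "binary"}}

    # Tabular data: 70/30 stratified split with standardized features.
    train_ds, test_ds, transform = DataHub.Adult_Income_Prediction(Paras)

    # Images: keeps only digits 0 and 1 because the model type is "binary".
    train_mnist, test_mnist, _ = DataHub.MNIST(Paras, "logreg")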
@@ -6,6 +6,7 @@
  """

  import pandas as pd
+ import torch
  from sklearn.preprocessing import StandardScaler


@@ -13,7 +14,6 @@ class CSV_TO_Pandas:
      def __init__(self):
          pass

-
      def _trans_time_fea(self, df, time_info: dict):
          """
          Transform and extract time-based features from a specified datetime column.
@@ -82,6 +82,15 @@ class CSV_TO_Pandas:

              user_text_fea = ['hour','dayofweek','is_weekend']
              df = pd.get_dummies(df, columns=user_text_fea, dtype=int)
+
+         elif trans_type == 2:
+             df.loc[:, "year"] = df[time_col_name].dt.year
+             df.loc[:, "month"] = df[time_col_name].dt.month
+             df.loc[:, "day"] = df[time_col_name].dt.day
+
+
+             user_text_fea = ['year','month','day']
+             df = pd.get_dummies(df, columns=user_text_fea, dtype=int)
          else:
              print("error!")

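
The new `trans_type == 2` branch mirrors the existing ones: it derives calendar fields from the datetime column and one-hot encodes them. A standalone sketch of the same transformation on a toy frame (the column name `ts` is illustrative):

    import pandas as pd

    df = pd.DataFrame({"ts": pd.to_datetime(["2021-01-15", "2022-06-30"])})
    df["year"] = df["ts"].dt.year
    df["month"] = df["ts"].dt.month
    df["day"] = df["ts"].dt.day
    # One column per observed value: year_2021, year_2022, month_1, month_6, ...
    df = pd.get_dummies(df, columns=["year", "month", "day"], dtype=int)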
@@ -91,15 +100,15 @@ class CSV_TO_Pandas:

      def preprocess_dataset(
          self,
-         csv_path,
+         df,
          drop_cols: list,
          label_col: str,
          label_map: dict,
          title_name: str,
          user_one_hot_cols=[],
          print_info=False,
-         Standard=False,
-         time_info: dict | None = None
+         time_info: dict | None = None,
+         missing_strategy = 'drop', # [drop, mode]
      ):
          """
          Preprocess a CSV dataset by performing data cleaning, label mapping, and feature encoding.
@@ -121,6 +130,9 @@ class CSV_TO_Pandas:
              print_info (bool, optional):
                  Whether to print preprocessing information and dataset statistics.
                  Defaults to False.
+             title_name (str):
+                 Title used for the summary table or report that documents
+                 the preprocessing steps and dataset statistics.

          Returns:
              pandas.DataFrame:
@@ -145,7 +157,7 @@ class CSV_TO_Pandas:
          ... )
          """
          # Step 0: Load the dataset
-         df = pd.read_csv(csv_path)
+         # df = pd.read_csv(csv_path)
          columns = df.columns

          # Save original size
@@ -155,7 +167,15 @@ class CSV_TO_Pandas:
          df = df.drop(columns=drop_cols)

          # Step 2: Remove rows with missing values
-         df = df.dropna(axis=0, how="any")
+         if missing_strategy == 'drop':
+             df = df.dropna(axis=0, how="any")
+
+         elif missing_strategy == 'mode':
+             for col in df.columns:
+                 if df[col].notna().any():
+                     mode_val = df[col].mode()[0]
+                     df[col] = df[col].fillna(mode_val)
+
          m_encoded, n_encoded = df.shape

          if time_info is not None:
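
The `'mode'` strategy imputes each column's missing values with its most frequent value rather than dropping rows; `mode()[0]` takes the first mode when several values tie, and the `notna().any()` guard skips columns that are entirely NaN. The same logic in isolation:

    import pandas as pd

    df = pd.DataFrame({"color": ["red", None, "red", "blue"],
                       "size": [1.0, 2.0, None, 2.0]})
    for col in df.columns:
        if df[col].notna().any():
            df[col] = df[col].fillna(df[col].mode()[0])
    # NaN in "color" becomes "red"; NaN in "size" becomes 2.0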
@@ -177,17 +197,6 @@ class CSV_TO_Pandas:
          )
          m_cleaned, n_cleaned = df.shape

-         if Standard:
-             # Identify numerical columns Standardize numerical columns
-             num_cols = [
-                 col
-                 for col in df.columns
-                 if col
-                 not in list(text_feature_cols) + [label_col] + [user_one_hot_cols]
-             ]
-             scaler = StandardScaler()
-             df[num_cols] = scaler.fit_transform(df[num_cols])
-
          # print info
          if print_info:
              pos_count = (df[label_col] == 1).sum()
@@ -201,6 +210,7 @@ class CSV_TO_Pandas:
              print(
                  f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
              )
+             print(f"{'missing_strategy:':<40} {missing_strategy}")
              print(
                  f"{'Dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
              )
@@ -216,19 +226,97 @@ class CSV_TO_Pandas:
              print(f"{'label_map:':<40} {label_map}")
              print(f"{'time column:':<40} {time_info}")
              if time_info is not None:
-                 print(f"{'trans_type : int, optional, default=1'}")
-                 print("- 0 : Extract ['year', 'month', 'day', 'hour']")
-                 print("- 1 : Extract ['hour', 'dayofweek', 'is_weekend']")
+                 if time_info["trans_type"] == 0:
+                     print("- 0 : Extract ['year', 'month', 'day', 'hour']")
+                 elif time_info["trans_type"] == 1:
+                     print("- 1 : Extract ['hour', 'dayofweek', 'is_weekend']")
+                 elif time_info["trans_type"] == 2:
+                     print("- 2 : Extract ['year', 'month', 'day']")
+                 else:
+                     assert False
              print(
                  f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
              )
-             print("-" * 80)
-             print("all columns:")
-             print(list(columns))
-             print("=" * 80 + "\n")
+             # print("-" * 80)
+             # print("all columns:")
+             # print(list(columns))
+             # print("=" * 80 + "\n")

          return df
+
+
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import StandardScaler
+ from torch.utils.data import Dataset
+
+ class Pandas_TO_Torch(Dataset):
+
+     def __init__(self, df: pd.DataFrame,
+                  label_col: str,
+                  ):
+         self.df = df
+         self.label_col = label_col

+         # Identify feature columns automatically (all except the label)
+         self.label_col = label_col
+         self.feature_cols = [col for col in self.df.columns if col != label_col]
+
+         # Extract features and labels
+         self.features = self.df[self.feature_cols].values.astype("float32")
+         self.labels = self.df[self.label_col].values.astype("int64")
+
+
+     def __len__(self):
+         """Return the total number of samples."""
+         return len(self.features)
+
+     def __getitem__(self, idx):
+         x = torch.tensor(self.features[idx], dtype=torch.float32)
+         y = torch.tensor(self.labels[idx], dtype=torch.long)
+
+         return x, y
+
+     def __repr__(self):
+         info = (
+             f"Dataset CustomNumericDataset\n"
+             f"    Number of datapoints: {len(self)}\n"
+             f"    Features: {self.features.shape[1]}\n"
+         )
+         return info
+
+     def to_torch(self, transform, Paras):
+         fea_cols = [col for col in self.df.columns if col != self.label_col]
+
+         if transform["normalization"]:
+             scaler = StandardScaler()
+             self.df[fea_cols] = scaler.fit_transform(self.df[fea_cols])
+
+         # Train/test split
+
+         train_df, test_df = train_test_split(self.df, train_size=transform["train_size"], random_state=Paras["seed"], stratify=self.df[self.label_col])
+
+         # Create datasets
+         train_dataset = Pandas_TO_Torch(train_df, self.label_col)
+         test_dataset = Pandas_TO_Torch(test_df, self.label_col)
+
+         return train_dataset, test_dataset, transform
+
+
+ class TXT_TO_Numpy:
+     def __init__(self):
+         pass
+
+
+ class bz2_To_Numpy:
+     def __init__(self):
+         pass
+
+
+
+
+
+
+

  class StepByStep:
      def __init__(self):
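
Since `Pandas_TO_Torch` subclasses `torch.utils.data.Dataset`, the datasets returned by `to_torch` plug straight into a `DataLoader`. A minimal sketch, assuming a numeric frame with a 0/1 label column and accessing the class through `DataProcessor`, as DataHub.py does (the data and column names are made up):

    import pandas as pd
    from torch.utils.data import DataLoader
    from junshan_kit import DataProcessor

    df = pd.DataFrame({"f1": [0.1, 0.5, 0.9, 0.3],
                       "f2": [1.0, 0.0, 1.0, 0.0],
                       "label": [0, 1, 1, 0]})
    transform = {"train_size": 0.5, "normalization": True}
    train_ds, test_ds, _ = DataProcessor.Pandas_TO_Torch(df, "label").to_torch(
        transform, {"seed": 0})

    for x, y in DataLoader(train_ds, batch_size=2):
        print(x.shape, y)  # float32 features, int64 labels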
@@ -242,3 +330,5 @@ class StepByStep:
              print(
                  f"Unique values ({len(df[col].unique())}): {df[col].unique().tolist()}"
              )
+
+