junshan-kit 2.2.8 → 2.5.1 (py2.py3-none-any.whl)

This diff shows the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
junshan_kit/DataHub.py ADDED
@@ -0,0 +1,114 @@
+ import torch
+ import torchvision
+ import torchvision.transforms as transforms
+
+ from junshan_kit import DataSets, DataProcessor
+
+
+ def Adult_Income_Prediction(Paras):
+     """Load the Adult Income dataset and return train/test sets plus the transform dict used."""
+     df = DataSets.adult_income_prediction()
+     transform = {
+         "train_size": 0.7,
+         "normalization": True
+     }
+     label_col = 'income'
+
+     train_dataset, test_dataset, transform = DataProcessor.Pandas_TO_Torch(df, label_col).to_torch(transform, Paras)
+
+     return train_dataset, test_dataset, transform
+
+
+ def Credit_Card_Fraud_Detection(Paras):
+     """Load the Credit Card Fraud dataset and return train/test sets plus the transform dict used."""
+     df = DataSets.credit_card_fraud_detection()
+     transform = {
+         "train_size": 0.7,
+         "normalization": True
+     }
+     label_col = 'Class'
+
+     train_dataset, test_dataset, transform = DataProcessor.Pandas_TO_Torch(df, label_col).to_torch(transform, Paras)
+
+     return train_dataset, test_dataset, transform
+
+
+ def MNIST(Paras, model_name):
+     """
+     Load the MNIST dataset and return both the training and test sets,
+     along with the transformation applied (ToTensor).
+     """
+     transform = torchvision.transforms.ToTensor()
+
+     train_dataset = torchvision.datasets.MNIST(
+         root='./exp_data/MNIST',
+         train=True,
+         download=True,
+         transform=transform
+     )
+
+     test_dataset = torchvision.datasets.MNIST(
+         root='./exp_data/MNIST',
+         train=False,
+         download=True,
+         transform=transform
+     )
+
+     # For binary models, keep only the digits 0 and 1.
+     if Paras["model_type"][model_name] == "binary":
+         train_mask = (train_dataset.targets == 0) | (train_dataset.targets == 1)
+         test_mask = (test_dataset.targets == 0) | (test_dataset.targets == 1)
+
+         train_indices = torch.nonzero(train_mask, as_tuple=True)[0]
+         test_indices = torch.nonzero(test_mask, as_tuple=True)[0]
+
+         train_dataset = torch.utils.data.Subset(train_dataset, train_indices.tolist())
+         test_dataset = torch.utils.data.Subset(test_dataset, test_indices.tolist())
+
+     return train_dataset, test_dataset, transform
+
+
+ def CIFAR100(Paras, model_name):
+     """
+     Load the CIFAR-100 dataset with standard normalization and return both
+     the training and test sets, along with the transformation applied.
+     """
+     transform = transforms.Compose([
+         transforms.ToTensor(),
+         transforms.Normalize(mean=[0.5071, 0.4867, 0.4408],
+                              std=[0.2675, 0.2565, 0.2761])
+     ])
+
+     train_dataset = torchvision.datasets.CIFAR100(
+         root='./exp_data/CIFAR100',
+         train=True,
+         download=True,
+         transform=transform
+     )
+
+     test_dataset = torchvision.datasets.CIFAR100(
+         root='./exp_data/CIFAR100',
+         train=False,
+         download=True,
+         transform=transform
+     )
+
+     # For binary models, keep only classes 0 and 1 (targets is a plain list here).
+     if Paras["model_type"][model_name] == "binary":
+         train_targets = torch.tensor(train_dataset.targets)
+         test_targets = torch.tensor(test_dataset.targets)
+         train_mask = (train_targets == 0) | (train_targets == 1)
+         test_mask = (test_targets == 0) | (test_targets == 1)
+
+         train_indices = torch.nonzero(train_mask, as_tuple=True)[0]
+         test_indices = torch.nonzero(test_mask, as_tuple=True)[0]
+
+         train_dataset = torch.utils.data.Subset(train_dataset, train_indices.tolist())
+         test_dataset = torch.utils.data.Subset(test_dataset, test_indices.tolist())
+
+     return train_dataset, test_dataset, transform
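
Each loader above returns a `(train_dataset, test_dataset, transform)` triple. A minimal usage sketch follows; the `Paras` contents and the `"logreg"` model name are illustrative assumptions, not fixed by the package:

```python
import torch
from junshan_kit import DataHub

# Hypothetical parameter dict: the loaders read "model_type" here,
# and the to_torch-based loaders also read "seed".
Paras = {
    "seed": 42,
    "model_type": {"logreg": "binary"},  # "binary" keeps only classes 0 and 1
}

train_set, test_set, _ = DataHub.MNIST(Paras, "logreg")
loader = torch.utils.data.DataLoader(train_set, batch_size=64, shuffle=True)
x, y = next(iter(loader))
print(x.shape, y.shape)  # torch.Size([64, 1, 28, 28]) torch.Size([64])
```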
junshan_kit/DataProcessor.py CHANGED
@@ -6,19 +6,109 @@
  """
 
  import pandas as pd
+ import torch
+ from sklearn.preprocessing import StandardScaler
 
 
  class CSV_TO_Pandas:
      def __init__(self):
          pass
+
+     def _trans_time_fea(self, df, time_info: dict):
+         """
+         Transform and extract time-based features from a specified datetime column.
+
+         This function converts the given column to pandas datetime format and
+         extracts time-related features according to the selected mode. It
+         supports three extraction modes:
+         - trans_type = 0: basic components (year, month, day, hour)
+         - trans_type = 1: hour, day of week, and weekend indicator
+         - trans_type = 2: year, month, day
+
+         Parameters
+         ----------
+         df : pandas.DataFrame
+             Input DataFrame containing the datetime column.
+         time_info : dict
+             - time_col_name : str
+                 Name of the column containing time or datetime values.
+             - trans_type : int
+                 - 0 : extract ['year', 'month', 'day', 'hour']
+                 - 1 : extract ['hour', 'dayofweek', 'is_weekend']
+                 - 2 : extract ['year', 'month', 'day']
+
+         Returns
+         -------
+         pandas.DataFrame
+             The DataFrame with the extracted features one-hot encoded and the
+             original datetime column dropped.
+
+         Notes
+         -----
+         - Rows that cannot be parsed as valid datetimes are dropped automatically.
+         - 'dayofweek' ranges from 0 (Monday) to 6 (Sunday).
+         - 'is_weekend' equals 1 if the day is Saturday or Sunday, otherwise 0.
+
+         Examples
+         --------
+         >>> import pandas as pd
+         >>> data = pd.DataFrame({
+         ...     'timestamp': ['2023-08-01 12:30:00', '2023-08-05 08:15:00', 'invalid_time']
+         ... })
+         >>> df = CSV_TO_Pandas()._trans_time_fea(data, {"time_col_name": "timestamp", "trans_type": 1})
+         >>> df.columns.tolist()
+         ['hour_8', 'hour_12', 'dayofweek_1', 'dayofweek_5', 'is_weekend_0', 'is_weekend_1']
+         """
+
+         time_col_name, trans_type = time_info['time_col_name'], time_info['trans_type']
+
+         df[time_col_name] = pd.to_datetime(df[time_col_name], errors="coerce")
+
+         # Drop rows where the datetime conversion failed, and make an explicit copy
+         df = df.dropna(subset=[time_col_name]).copy()
+
+         if trans_type == 0:
+             df.loc[:, "year"] = df[time_col_name].dt.year
+             df.loc[:, "month"] = df[time_col_name].dt.month
+             df.loc[:, "day"] = df[time_col_name].dt.day
+             df.loc[:, "hour"] = df[time_col_name].dt.hour
+
+             time_fea = ['year', 'month', 'day', 'hour']
+             df = pd.get_dummies(df, columns=time_fea, dtype=int)
+
+         elif trans_type == 1:
+             df.loc[:, "hour"] = df[time_col_name].dt.hour
+             df.loc[:, "dayofweek"] = df[time_col_name].dt.dayofweek
+             df.loc[:, "is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)
+
+             time_fea = ['hour', 'dayofweek', 'is_weekend']
+             df = pd.get_dummies(df, columns=time_fea, dtype=int)
+
+         elif trans_type == 2:
+             df.loc[:, "year"] = df[time_col_name].dt.year
+             df.loc[:, "month"] = df[time_col_name].dt.month
+             df.loc[:, "day"] = df[time_col_name].dt.day
+
+             time_fea = ['year', 'month', 'day']
+             df = pd.get_dummies(df, columns=time_fea, dtype=int)
+         else:
+             raise ValueError(f"Unsupported trans_type: {trans_type}")
+
+         df = df.drop(columns=[time_col_name])
+
+         return df
 
      def preprocess_dataset(
          self,
-         csv_path,
+         df,
          drop_cols: list,
          label_col: str,
          label_map: dict,
+         title_name: str,
+         user_one_hot_cols=[],
          print_info=False,
+         time_info: dict | None = None,
+         missing_strategy='drop',  # 'drop' or 'mode'
      ):
          """
          Preprocess a CSV dataset by performing data cleaning, label mapping, and feature encoding.
@@ -40,6 +130,9 @@ class CSV_TO_Pandas:
              print_info (bool, optional):
                  Whether to print preprocessing information and dataset statistics.
                  Defaults to False.
+             title_name (str):
+                 Title used for the summary table or report that documents
+                 the preprocessing steps and dataset statistics.
 
          Returns:
              pandas.DataFrame:
@@ -64,7 +157,8 @@ class CSV_TO_Pandas:
          ... )
          """
-         # Step 0: Load the dataset
-         df = pd.read_csv(csv_path)
+         # Step 0: The dataset arrives as a DataFrame; the caller handles loading.
+         columns = df.columns
 
          # Save original size
          m_original, n_original = df.shape
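
With this change, `preprocess_dataset` receives a DataFrame instead of a CSV path, so the caller owns the I/O step. A hedged sketch of the new calling convention (file name, columns, and label mapping are illustrative, not from the package):

```python
import pandas as pd
from junshan_kit.DataProcessor import CSV_TO_Pandas

df = pd.read_csv("adult.csv")  # the caller loads the CSV now
df = CSV_TO_Pandas().preprocess_dataset(
    df,
    drop_cols=["fnlwgt"],
    label_col="income",
    label_map={"<=50K": -1, ">50K": 1},
    title_name="Adult Income",
    print_info=True,
)
```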
@@ -73,9 +167,20 @@ class CSV_TO_Pandas:
          df = df.drop(columns=drop_cols)
 
-         # Step 2: Remove rows with missing values
-         df = df.dropna(axis=0, how="any")
+         # Step 2: Handle missing values
+         if missing_strategy == 'drop':
+             df = df.dropna(axis=0, how="any")
+
+         elif missing_strategy == 'mode':
+             # Fill each column's NaNs with that column's most frequent value.
+             for col in df.columns:
+                 if df[col].notna().any():
+                     mode_val = df[col].mode()[0]
+                     df[col] = df[col].fillna(mode_val)
+
          m_encoded, n_encoded = df.shape
 
+         if time_info is not None:
+             df = self._trans_time_fea(df, time_info)
+
          # Step 3: Map target label (to -1 and +1)
          df[label_col] = df[label_col].map(label_map)
 
@@ -87,7 +192,9 @@ class CSV_TO_Pandas:
              col for col in text_feature_cols if col != label_col
          ]  # ✅ exclude label
 
-         df = pd.get_dummies(df, columns=text_feature_cols, dtype=int)
+         df = pd.get_dummies(
+             df, columns=text_feature_cols + user_one_hot_cols, dtype=int
+         )
          m_cleaned, n_cleaned = df.shape
 
          # print info
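
The new `user_one_hot_cols` argument lets callers force one-hot encoding of columns that the object-dtype scan above would miss, such as integer-coded categories. A brief sketch (column names are illustrative):

```python
import pandas as pd

df = pd.DataFrame({"color": ["red", "blue"], "grade": [1, 2]})

# "color" is object dtype and is picked up automatically; "grade" is numeric,
# so it must be listed explicitly -- the role of user_one_hot_cols.
encoded = pd.get_dummies(df, columns=["color", "grade"], dtype=int)
print(encoded.columns.tolist())
# ['color_blue', 'color_red', 'grade_1', 'grade_2']
```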
@@ -97,26 +204,131 @@ class CSV_TO_Pandas:
 
          # Step 6: Print dataset information
          print("\n" + "=" * 80)
-         print(f"{'Dataset Info':^70}")
+         print(f"{f'{title_name} - Summary':^70}")
          print("=" * 80)
          print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
          print(
-             f"{'Size after dropping NaN & non-feature cols:':<40} {m_cleaned} rows x {n_cleaned} cols"
+             f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
+         )
+         print(f"{'missing_strategy:':<40} {missing_strategy}")
+         print(
+             f"{'Size after NaN handling & col drops:':<40} {m_encoded} rows x {n_encoded} cols"
          )
          print(f"{'Positive samples (+1):':<40} {pos_count}")
          print(f"{'Negative samples (-1):':<40} {neg_count}")
          print(
-             f"{'Size after one-hot encoding:':<40} {m_encoded} rows x {n_encoded} cols"
+             f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
          )
          print("-" * 80)
-         print(f"Note:")
+         print(f"{'More details about preprocessing':^70}")
+         print("-" * 80)
          print(f"{'Label column:':<40} {label_col}")
-         print(
-             f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
-         )
+         print(f"{'label_map:':<40} {label_map}")
+         print(f"{'time_info:':<40} {time_info}")
+         if time_info is not None:
+             if time_info["trans_type"] == 0:
+                 print("- 0 : Extract ['year', 'month', 'day', 'hour']")
+             elif time_info["trans_type"] == 1:
+                 print("- 1 : Extract ['hour', 'dayofweek', 'is_weekend']")
+             elif time_info["trans_type"] == 2:
+                 print("- 2 : Extract ['year', 'month', 'day']")
+             else:
+                 raise ValueError(f"Unsupported trans_type: {time_info['trans_type']}")
          print(
              f"{'Text feature columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
          )
-         print("=" * 80 + "\n")
+         # print("-" * 80)
+         # print("all columns:")
+         # print(list(columns))
+         # print("=" * 80 + "\n")
 
          return df
+
+
+ from sklearn.model_selection import train_test_split
+ from torch.utils.data import Dataset
+
+
+ class Pandas_TO_Torch(Dataset):
+
+     def __init__(self, df: pd.DataFrame,
+                  label_col: str,
+                  ):
+         self.df = df
+         self.label_col = label_col
+
+         # Identify feature columns automatically (all except the label)
+         self.feature_cols = [col for col in self.df.columns if col != label_col]
+
+         # Extract features and labels
+         self.features = self.df[self.feature_cols].values.astype("float32")
+         self.labels = self.df[self.label_col].values.astype("int64")
+
+     def __len__(self):
+         """Return the total number of samples."""
+         return len(self.features)
+
+     def __getitem__(self, idx):
+         x = torch.tensor(self.features[idx], dtype=torch.float32)
+         y = torch.tensor(self.labels[idx], dtype=torch.long)
+
+         return x, y
+
+     def __repr__(self):
+         info = (
+             f"Dataset Pandas_TO_Torch\n"
+             f"    Number of datapoints: {len(self)}\n"
+             f"    Features: {self.features.shape[1]}\n"
+         )
+         return info
+
+     def to_torch(self, transform, Paras):
+         fea_cols = [col for col in self.df.columns if col != self.label_col]
+
+         if transform["normalization"]:
+             # NOTE: the scaler is fit on the full dataset before the split.
+             scaler = StandardScaler()
+             self.df[fea_cols] = scaler.fit_transform(self.df[fea_cols])
+
+         # Stratified train/test split, seeded for reproducibility
+         train_df, test_df = train_test_split(
+             self.df,
+             train_size=transform["train_size"],
+             random_state=Paras["seed"],
+             stratify=self.df[self.label_col],
+         )
+
+         # Create datasets
+         train_dataset = Pandas_TO_Torch(train_df, self.label_col)
+         test_dataset = Pandas_TO_Torch(test_df, self.label_col)
+
+         return train_dataset, test_dataset, transform
+
+
+ class TXT_TO_Numpy:
+     def __init__(self):
+         pass
+
+
+ class bz2_To_Numpy:
+     def __init__(self):
+         pass
+
+
+ class StepByStep:
+     def __init__(self):
+         pass
+
+     def print_text_fea(self, df, text_feature_cols):
+         for col in text_feature_cols:
+             print(f"\n{'-'*80}")
+             print(f'Feature: "{col}"')
+             print(f"{'-'*80}")
+             print(
+                 f"Unique values ({len(df[col].unique())}): {df[col].unique().tolist()}"
+             )
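
`Pandas_TO_Torch` doubles as a `Dataset` and a factory: `to_torch` optionally normalizes, performs a stratified split, and returns two new instances. A minimal end-to-end sketch (toy data; only the `seed` key of `Paras` is read here):

```python
import pandas as pd
from junshan_kit.DataProcessor import Pandas_TO_Torch

# Toy frame with labels already mapped to class indices.
df = pd.DataFrame({
    "f1": [0.1, 0.5, 0.9, 0.3] * 5,
    "f2": [1.0, 0.0, 1.0, 0.0] * 5,
    "label": [0, 1, 0, 1] * 5,
})

transform = {"train_size": 0.7, "normalization": True}
Paras = {"seed": 0}

train_ds, test_ds, _ = Pandas_TO_Torch(df, "label").to_torch(transform, Paras)
x, y = train_ds[0]
print(len(train_ds), len(test_ds), x.shape)  # 14 6 torch.Size([2])
```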