junshan-kit 2.2.3__py2.py3-none-any.whl → 2.5.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
junshan_kit/DataHub.py ADDED
@@ -0,0 +1,114 @@
+ import torchvision, torch
+ import torchvision.transforms as transforms
+ import pandas as pd
+
+ from junshan_kit import DataSets, DataProcessor
+
+
+ def Adult_Income_Prediction(Paras):
+     # Load the Adult Income dataset as a pandas DataFrame
+     df = DataSets.adult_income_prediction()
+     transform = {
+         "train_size": 0.7,
+         "normalization": True
+     }
+     label_col = 'income'
+
+     train_dataset, test_dataset, transform = DataProcessor.Pandas_TO_Torch(df, label_col).to_torch(transform, Paras)
+
+     return train_dataset, test_dataset, transform
+
+
+ def Credit_Card_Fraud_Detection(Paras):
+     df = DataSets.credit_card_fraud_detection()
+     transform = {
+         "train_size": 0.7,
+         "normalization": True
+     }
+     label_col = 'Class'
+
+     train_dataset, test_dataset, transform = DataProcessor.Pandas_TO_Torch(df, label_col).to_torch(transform, Paras)
+
+     return train_dataset, test_dataset, transform
+
+
+ def MNIST(Paras, model_name):
+     """
+     Load the MNIST dataset and return both the training and test sets,
+     along with the transformation applied (ToTensor).
+     """
+     transform = torchvision.transforms.ToTensor()
+
+     train_dataset = torchvision.datasets.MNIST(
+         root='./exp_data/MNIST',
+         train=True,
+         download=True,
+         transform=transform
+     )
+
+     test_dataset = torchvision.datasets.MNIST(
+         root='./exp_data/MNIST',
+         train=False,
+         download=True,
+         transform=transform
+     )
+
+     if Paras["model_type"][model_name] == "binary":
+         # Keep only digits 0 and 1 for binary classification
+         train_mask = (train_dataset.targets == 0) | (train_dataset.targets == 1)
+         test_mask = (test_dataset.targets == 0) | (test_dataset.targets == 1)
+
+         train_indices = torch.nonzero(train_mask, as_tuple=True)[0]
+         test_indices = torch.nonzero(test_mask, as_tuple=True)[0]
+
+         train_dataset = torch.utils.data.Subset(train_dataset, train_indices.tolist())
+         test_dataset = torch.utils.data.Subset(test_dataset, test_indices.tolist())
+
+     return train_dataset, test_dataset, transform
+
+
+ def CIFAR100(Paras, model_name):
+     """
+     Load the CIFAR-100 dataset with standard normalization and return both
+     the training and test sets, along with the transformation applied.
+     """
+     transform = transforms.Compose([
+         transforms.ToTensor(),
+         transforms.Normalize(mean=[0.5071, 0.4867, 0.4408],
+                              std=[0.2675, 0.2565, 0.2761])
+     ])
+
+     train_dataset = torchvision.datasets.CIFAR100(
+         root='./exp_data/CIFAR100',
+         train=True,
+         download=True,
+         transform=transform
+     )
+
+     test_dataset = torchvision.datasets.CIFAR100(
+         root='./exp_data/CIFAR100',
+         train=False,
+         download=True,
+         transform=transform
+     )
+
+     if Paras["model_type"][model_name] == "binary":
+         # Keep only classes 0 and 1 for binary classification
+         train_mask = (torch.tensor(train_dataset.targets) == 0) | (torch.tensor(train_dataset.targets) == 1)
+         test_mask = (torch.tensor(test_dataset.targets) == 0) | (torch.tensor(test_dataset.targets) == 1)
+
+         train_indices = torch.nonzero(train_mask, as_tuple=True)[0]
+         test_indices = torch.nonzero(test_mask, as_tuple=True)[0]
+
+         train_dataset = torch.utils.data.Subset(train_dataset, train_indices.tolist())
+         test_dataset = torch.utils.data.Subset(test_dataset, test_indices.tolist())
+
+     return train_dataset, test_dataset, transform
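
For orientation, here is a minimal sketch of how these loaders are meant to be called. The `Paras` dict below is hypothetical; the loaders only read the keys shown (`model_type` here, plus `seed` inside `Pandas_TO_Torch.to_torch` for the tabular datasets), and the dataset downloads must succeed for the code to run end to end.

    import torch
    from junshan_kit import DataHub

    # Hypothetical parameter dict: maps a model name to its task type.
    Paras = {
        "model_type": {"logistic": "binary"},
        "seed": 42,  # consumed by Pandas_TO_Torch.to_torch for the tabular loaders
    }

    # "binary" restricts MNIST to digits 0 and 1 via torch.utils.data.Subset
    train_ds, test_ds, transform = DataHub.MNIST(Paras, model_name="logistic")
    loader = torch.utils.data.DataLoader(train_ds, batch_size=64, shuffle=True)
    x, y = next(iter(loader))
    print(x.shape, y.shape)  # torch.Size([64, 1, 28, 28]) torch.Size([64])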
junshan_kit/DataProcessor.py CHANGED
@@ -1,148 +1,164 @@
  """
  ----------------------------------------------------------------------
- >>> Author : Junshan Yin
+ >>> Author : Junshan Yin
  >>> Last Updated : 2025-10-12
  ----------------------------------------------------------------------
  """

  import pandas as pd
- import os
+ import torch
  from sklearn.preprocessing import StandardScaler
- import junshan_kit.datahub
- import zipfile

- class CSVToPandasMeta:
+
+ class CSV_TO_Pandas:
      def __init__(self):
-         self.data_downloader = junshan_kit.datahub.kaggle_data()
-
+         pass
+
+     def _trans_time_fea(self, df, time_info: dict):
+         """
+         Transform and extract time-based features from a specified datetime column.
+
+         This function converts a given column to pandas datetime format and
+         extracts different time-related features based on the specified mode.
+         It supports three extraction modes:
+         - type = 0: Extracts basic components (year, month, day, hour)
+         - type = 1: Extracts hour, day of week, and weekend indicator
+         - type = 2: Extracts date components (year, month, day)
+
+         Parameters
+         ----------
+         df : pandas.DataFrame
+             Input DataFrame containing the datetime column.
+         time_info : dict
+             - time_col_name : str
+                 Name of the column containing time or datetime values.
+             - trans_type : int
+                 - 0 : Extract ['year', 'month', 'day', 'hour']
+                 - 1 : Extract ['hour', 'dayofweek', 'is_weekend']
+                 - 2 : Extract ['year', 'month', 'day']
+
+         Returns
+         -------
+         pandas.DataFrame
+             The DataFrame with newly added time-based feature columns.
+
+         Notes
+         -----
+         - Rows that cannot be parsed as valid datetime will be dropped automatically.
+         - 'dayofweek' ranges from 0 (Monday) to 6 (Sunday).
+         - 'is_weekend' equals 1 if the day is Saturday or Sunday, otherwise 0.
+         - The extracted features are one-hot encoded, and the original datetime
+           column is dropped before the DataFrame is returned.
+
+         Examples
+         --------
+         >>> import pandas as pd
+         >>> data = pd.DataFrame({
+         ...     'timestamp': ['2023-08-01 12:30:00', '2023-08-05 08:15:00', 'invalid_time']
+         ... })
+         >>> df = handler._trans_time_fea(data, {"time_col_name": "timestamp", "trans_type": 1})
+         >>> list(df.columns)
+         ['hour_8', 'hour_12', 'dayofweek_1', 'dayofweek_5', 'is_weekend_0', 'is_weekend_1']
+         """
+
+         time_col_name, trans_type = time_info['time_col_name'], time_info['trans_type']

-     def read_csv(self, data_name):
-         self.csv_path = f'exp_data/{data_name}/{data_name}.csv'
-         if not os.path.exists(self.csv_path):
-             self.data_downloader.download_data(f'{data_name}', f'exp_data/{data_name}')
+         df[time_col_name] = pd.to_datetime(df[time_col_name], errors="coerce")

-     # ----------------- ccfd_kaggle ----------------------------------
-     def ccfd_kaggle(self, data_name = 'ccfd-kaggle', show_info = True):
-         # download data if not exist
-         self.read_csv(data_name)
-
-         df = pd.read_csv(self.csv_path)
-         m_before, n_before = df.shape
-         df = df.dropna(axis=0, how='any')
-         m_after, n_after = df.shape
-         df['Class'] = df['Class'].replace(0, -1)
-
-         if show_info:
-             pos_count = (df['Class'] == 1).sum()
-             neg_count = (df['Class'] == -1).sum()
-
-             print('\n' + '='*60)
-             print(f"{'CCFD-Kaggle Dataset Info':^60}")
-             print('='*60)
-             print(f"{'Original size:':<25} {m_before} rows x {n_before} cols")
-             print(f"{'Size after dropping NaNs:':<25} {m_after} rows x {n_after} cols")
-             print(f"{'Positive samples (+1):':<25} {pos_count}")
-             print(f"{'Negative samples (-1):':<25} {neg_count}")
-             print(f"{'Export size:':<25} {m_after} rows x {n_after} cols")
-             print('-'*60)
-             print(f"More details: https://www.jianguoyun.com/p/Dd1clVgQ4ZThCxiwzZQGIAA")
-             print('='*60 + '\n')
+         # Drop rows where the datetime conversion failed, and make an explicit copy
+         df = df.dropna(subset=[time_col_name]).copy()

-         return df
-
-     # ------------------------
-     def ghpdd_kaggle(self, data_name='ghpdd-kaggle', show_info=True):
-         # download data if not exist
-         self.read_csv(data_name)
-
-         # read csv
-         df = pd.read_csv(self.csv_path)
-         m_before, n_before = df.shape
-
-         # drop NaNs
-         df = df.dropna(axis=0, how='any')
-         m_after, n_after = df.shape
-
-         # drop unique identifier
-         if 'property_id' in df.columns:
-             df.drop(columns=['property_id'], inplace=True)
-
-         # Replace label 0 with -1
-         df['decision'] = df['decision'].replace(0, -1)
-
-         # Identify categorical and numerical columns
-         cat_cols = ['country', 'city', 'property_type', 'furnishing_status']
-         num_cols = [col for col in df.columns if col not in cat_cols + ['decision']]
-
-         # One-Hot encode categorical columns
-         df = pd.get_dummies(df, columns=cat_cols)
-
-         # Convert boolean columns to int
-         bool_cols = df.select_dtypes(include='bool').columns
-         for col in bool_cols:
-             df[col] = df[col].astype(int)
-
-         # Standardize numerical columns
-         scaler = StandardScaler()
-         df[num_cols] = scaler.fit_transform(df[num_cols])
-
-         # The size after export
-         m_export, n_export = df.shape
-
-         if show_info:
-             pos_count = (df['decision'] == 1).sum()
-             neg_count = (df['decision'] == -1).sum()
-
-             print('\n' + '='*70)
-             print(f"{'GHPDD-Kaggle Dataset Info':^70}")
-             print('='*70)
-             print(f"{'Original size:':<35} {m_before} rows x {n_before} cols")
-             print(f"{'Size after dropping NaNs:':<35} {m_after} rows x {n_after} cols")
-             print(f"{'Positive samples (+1):':<35} {pos_count}")
-             print(f"{'Negative samples (-1):':<35} {neg_count}")
-             print(f"{'Export size (after encoding & scaling):':<35} {m_export} rows x {n_export} cols")
-             print('-'*70)
-             print(f"{'Note: categorical columns one-hot encoded, numerical standardized.'}")
-             print(f"More details: https://www.jianguoyun.com/p/DU6Lr9oQqdHDDRj5sI0GIAA")
-             print('='*70 + '\n')
+         if trans_type == 0:
+             df.loc[:, "year"] = df[time_col_name].dt.year
+             df.loc[:, "month"] = df[time_col_name].dt.month
+             df.loc[:, "day"] = df[time_col_name].dt.day
+             df.loc[:, "hour"] = df[time_col_name].dt.hour

-         return df
+             user_text_fea = ['year', 'month', 'day', 'hour']
+             df = pd.get_dummies(df, columns=user_text_fea, dtype=int)

+         elif trans_type == 1:
+             df.loc[:, "hour"] = df[time_col_name].dt.hour
+             df.loc[:, "dayofweek"] = df[time_col_name].dt.dayofweek
+             df.loc[:, "is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)
+
+             user_text_fea = ['hour', 'dayofweek', 'is_weekend']
+             df = pd.get_dummies(df, columns=user_text_fea, dtype=int)

- class CSV_TO_Pandas:
-     def __init__(self):
-         pass
+         elif trans_type == 2:
+             df.loc[:, "year"] = df[time_col_name].dt.year
+             df.loc[:, "month"] = df[time_col_name].dt.month
+             df.loc[:, "day"] = df[time_col_name].dt.day

-     def unzip_file(self, zip_path: str, unzip_folder: str):
-         """
-         Args:
-             zip_path (str): Path to the ZIP file to extract.
-             dest_folder (str, optional): Folder to extract files into.
-                 If None, the function will create a folder with the same
-                 name as the ZIP file (without extension).
-
-         Examples:
-             >>> zip_path = "./downloads/data.zip"
-             >>> unzip_folder = "./exp_data/data"
-             >>> unzip_file(zip_path, unzip_folder)
-         """
-
-         if unzip_folder is None:
-             unzip_folder = os.path.splitext(os.path.basename(zip_path))[0]
+             user_text_fea = ['year', 'month', 'day']
+             df = pd.get_dummies(df, columns=user_text_fea, dtype=int)
+         else:
+             print("error!")

-         os.makedirs(unzip_folder, exist_ok=True)
+         df = df.drop(columns=[time_col_name])

-         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-             zip_ref.extractall(unzip_folder)
+         return df

-         print(f"✅ Extracted '{zip_path}' to '{os.path.abspath(unzip_folder)}'")
-
-     # -----------------------------------------------------
+     def preprocess_dataset(
+         self,
+         df,
+         drop_cols: list,
+         label_col: str,
+         label_map: dict,
+         title_name: str,
+         user_one_hot_cols=[],
+         print_info=False,
+         time_info: dict | None = None,
+         missing_strategy='drop',  # one of ['drop', 'mode']
+     ):
+         """
+         Preprocess a dataset by performing data cleaning, label mapping, and feature encoding.
+
+         This function takes a loaded dataset, removes specified non-feature columns,
+         handles rows with missing values, maps the target label to numerical values, and
+         one-hot encodes categorical features. Optionally, it can print dataset statistics
+         before and after preprocessing.

-     def clean_data(self, csv_path, drop_cols: list, label_col: str, label_map: dict, print_info = False):
+         Args:
+             df (pandas.DataFrame):
+                 The input dataset, already loaded (e.g., from a CSV file).
+             drop_cols (list):
+                 List of column names to drop from the dataset.
+             label_col (str):
+                 Name of the target label column.
+             label_map (dict):
+                 Mapping dictionary for label conversion (e.g., {"yes": 1, "no": -1}).
+             title_name (str):
+                 Title used for the summary table or report that documents
+                 the preprocessing steps and dataset statistics.
+             user_one_hot_cols (list, optional):
+                 Extra columns to one-hot encode in addition to the detected
+                 text feature columns. Defaults to [].
+             print_info (bool, optional):
+                 Whether to print preprocessing information and dataset statistics.
+                 Defaults to False.
+             time_info (dict, optional):
+                 If given, passed to `_trans_time_fea` to extract time-based features.
+             missing_strategy (str, optional):
+                 'drop' removes rows with missing values; 'mode' imputes each
+                 column with its most frequent value. Defaults to 'drop'.
+
+         Returns:
+             pandas.DataFrame:
+                 The cleaned and preprocessed dataset ready for model input.
+
+         Steps:
+             1. Take the loaded dataset.
+             2. Drop non-informative or irrelevant columns.
+             3. Handle missing values according to `missing_strategy`.
+             4. Map the label column to numerical values according to `label_map`.
+             5. One-hot encode categorical (non-label) text features.
+             6. Optionally print dataset information and summary statistics.
+
+         Example:
+             >>> label_map = {"positive": 1, "negative": -1}
+             >>> df = data_handler.preprocess_dataset(
+             ...     df=raw_df,
+             ...     drop_cols=["id", "timestamp"],
+             ...     label_col="sentiment",
+             ...     label_map=label_map,
+             ...     title_name="Sentiment Dataset",
+             ...     print_info=True
+             ... )
+         """
          # Step 0: Load the dataset
-         df = pd.read_csv(csv_path)
+         # df = pd.read_csv(csv_path)
+         columns = df.columns

          # Save original size
          m_original, n_original = df.shape
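
The docstring above only demonstrates `trans_type=1`, so here is a short sketch of all three modes. The module path `junshan_kit.DataProcessor` is assumed from DataHub.py's import, and the column name `ts` is hypothetical; each mode one-hot encodes the extracted fields and drops the original datetime column:

    import pandas as pd
    from junshan_kit.DataProcessor import CSV_TO_Pandas  # module path assumed

    handler = CSV_TO_Pandas()
    data = pd.DataFrame({"ts": ["2023-08-01 12:30:00", "2023-08-05 08:15:00"]})

    # 0 -> year/month/day/hour, 1 -> hour/dayofweek/is_weekend, 2 -> year/month/day
    for mode in (0, 1, 2):
        out = handler._trans_time_fea(data.copy(), {"time_col_name": "ts", "trans_type": mode})
        print(mode, list(out.columns))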
@@ -151,17 +167,34 @@ class CSV_TO_Pandas:
          df = df.drop(columns=drop_cols)

          # Step 2: Remove rows with missing values
-         df = df.dropna(axis=0, how='any')
+         if missing_strategy == 'drop':
+             df = df.dropna(axis=0, how="any")
+
+         elif missing_strategy == 'mode':
+             # Impute each column with its most frequent value
+             for col in df.columns:
+                 if df[col].notna().any():
+                     mode_val = df[col].mode()[0]
+                     df[col] = df[col].fillna(mode_val)
+
          m_encoded, n_encoded = df.shape
-
-         # Step 3: Map target label to -1 and +1
+
+         if time_info is not None:
+             df = self._trans_time_fea(df, time_info)
+
+         # Step 3: Map target label (to -1 and +1)
          df[label_col] = df[label_col].map(label_map)

          # Step 4: Encode categorical features (exclude label column)
-         text_feature_cols = df.select_dtypes(include=['object', 'string', 'category']).columns
-         text_feature_cols = [col for col in text_feature_cols if col != label_col]  # ✅ exclude label
-
-         df = pd.get_dummies(df, columns=text_feature_cols, dtype=int)
+         text_feature_cols = df.select_dtypes(
+             include=["object", "string", "category"]
+         ).columns
+         text_feature_cols = [
+             col for col in text_feature_cols if col != label_col
+         ]  # ✅ exclude label
+
+         df = pd.get_dummies(
+             df, columns=text_feature_cols + user_one_hot_cols, dtype=int
+         )
          m_cleaned, n_cleaned = df.shape

          # print info
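
The `missing_strategy` switch above is the whole imputation story: 'drop' discards incomplete rows, while 'mode' fills each column with its most frequent value. A standalone pandas sketch of the same two strategies, on hypothetical data:

    import pandas as pd

    df = pd.DataFrame({"age": [25, None, 31], "city": ["NY", "LA", None]})

    # missing_strategy == 'drop': remove any row containing a NaN
    dropped = df.dropna(axis=0, how="any")

    # missing_strategy == 'mode': impute each column with its most frequent value
    imputed = df.copy()
    for col in imputed.columns:
        if imputed[col].notna().any():
            imputed[col] = imputed[col].fillna(imputed[col].mode()[0])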
@@ -170,23 +203,132 @@ class CSV_TO_Pandas:
              neg_count = (df[label_col] == -1).sum()

              # Step 6: Print dataset information
-             print('\n' + '='*80)
-             print(f"{'Dataset Info':^70}")
-             print('='*80)
+             print("\n" + "=" * 80)
+             print(f"{f'{title_name} - Summary':^70}")
+             print("=" * 80)
              print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
-             print(f"{'Size after dropping NaN & non-feature cols:':<40} {m_cleaned} rows x {n_cleaned} cols")
+             print(
+                 f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
+             )
+             print(f"{'missing_strategy:':<40} {missing_strategy}")
+             print(
+                 f"{'Size after handling missing values:':<40} {m_encoded} rows x {n_encoded} cols"
+             )
              print(f"{'Positive samples (+1):':<40} {pos_count}")
              print(f"{'Negative samples (-1):':<40} {neg_count}")
-             print(f"{'Size after one-hot encoding:':<40} {m_encoded} rows x {n_encoded} cols")
-             print('-'*80)
-             print(f"Note:")
+             print(
+                 f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
+             )
+             print("-" * 80)
+             print(f"{'More details about preprocessing':^70}")
+             print("-" * 80)
              print(f"{'Label column:':<40} {label_col}")
-             print(f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}")
-             print(f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}")
-             print('='*80 + '\n')
+             print(f"{'label_map:':<40} {label_map}")
+             print(f"{'time column:':<40} {time_info}")
+             if time_info is not None:
+                 if time_info["trans_type"] == 0:
+                     print("- 0 : Extract ['year', 'month', 'day', 'hour']")
+                 elif time_info["trans_type"] == 1:
+                     print("- 1 : Extract ['hour', 'dayofweek', 'is_weekend']")
+                 elif time_info["trans_type"] == 2:
+                     print("- 2 : Extract ['year', 'month', 'day']")
+                 else:
+                     assert False
+             print(
+                 f"{'text feature columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
+             )
+             # print("-" * 80)
+             # print("all columns:")
+             # print(list(columns))
+             # print("=" * 80 + "\n")

          return df
+
+
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import StandardScaler
+ from torch.utils.data import Dataset
+
+
+ class Pandas_TO_Torch(Dataset):
+
+     def __init__(self, df: pd.DataFrame,
+                  label_col: str,
+                  ):
+         self.df = df
+         self.label_col = label_col
+
+         # Identify feature columns automatically (all except the label)
+         self.feature_cols = [col for col in self.df.columns if col != label_col]
+
+         # Extract features and labels
+         self.features = self.df[self.feature_cols].values.astype("float32")
+         self.labels = self.df[self.label_col].values.astype("int64")
+
+     def __len__(self):
+         """Return the total number of samples."""
+         return len(self.features)
+
+     def __getitem__(self, idx):
+         x = torch.tensor(self.features[idx], dtype=torch.float32)
+         y = torch.tensor(self.labels[idx], dtype=torch.long)
+
+         return x, y
+
+     def __repr__(self):
+         info = (
+             f"Dataset Pandas_TO_Torch\n"
+             f"    Number of datapoints: {len(self)}\n"
+             f"    Features: {self.features.shape[1]}\n"
+         )
+         return info
+
+     def to_torch(self, transform, Paras):
+         fea_cols = [col for col in self.df.columns if col != self.label_col]
+
+         if transform["normalization"]:
+             scaler = StandardScaler()
+             self.df[fea_cols] = scaler.fit_transform(self.df[fea_cols])
+
+         # Stratified train/test split
+         train_df, test_df = train_test_split(
+             self.df,
+             train_size=transform["train_size"],
+             random_state=Paras["seed"],
+             stratify=self.df[self.label_col],
+         )
+
+         # Create datasets
+         train_dataset = Pandas_TO_Torch(train_df, self.label_col)
+         test_dataset = Pandas_TO_Torch(test_df, self.label_col)
+
+         return train_dataset, test_dataset, transform
+
+
+ class TXT_TO_Numpy:
+     def __init__(self):
+         pass
+
+
+ class bz2_To_Numpy:
+     def __init__(self):
+         pass
+
+
+ class StepByStep:
+     def __init__(self):
+         pass
+
+     def print_text_fea(self, df, text_feature_cols):
+         # Show the unique values of each text feature, one block per column
+         for col in text_feature_cols:
+             print(f"\n{'-'*80}")
+             print(f'Feature: "{col}"')
+             print(f"{'-'*80}")
+             print(
+                 f"Unique values ({len(df[col].unique())}): {df[col].unique().tolist()}"
+             )
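
Taken together, the new classes form a small tabular pipeline: `CSV_TO_Pandas.preprocess_dataset` cleans and encodes a DataFrame, and `Pandas_TO_Torch.to_torch` normalizes, splits, and wraps it for PyTorch. A hedged end-to-end sketch (the file name, column names, and label values are hypothetical, and the module path `junshan_kit.DataProcessor` is assumed from DataHub.py's import):

    import pandas as pd
    import torch
    from junshan_kit.DataProcessor import CSV_TO_Pandas, Pandas_TO_Torch

    raw_df = pd.read_csv("data/raw.csv")  # hypothetical CSV with a 'label' column
    df = CSV_TO_Pandas().preprocess_dataset(
        raw_df,
        drop_cols=["id"],
        label_col="label",
        label_map={"yes": 1, "no": -1},
        title_name="Demo Dataset",
        missing_strategy="mode",
        print_info=True,
    )

    transform = {"train_size": 0.7, "normalization": True}
    train_ds, test_ds, _ = Pandas_TO_Torch(df, "label").to_torch(transform, {"seed": 0})
    loader = torch.utils.data.DataLoader(train_ds, batch_size=32, shuffle=True)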