junshan-kit 2.4.8__py2.py3-none-any.whl → 2.4.9__py2.py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of junshan-kit might be problematic. Click here for more details.

junshan_kit/DataSets.py CHANGED
@@ -5,14 +5,53 @@
5
5
  ----------------------------------------------------------------------
6
6
  """
7
7
 
8
- import os, time
8
+ import os
9
9
  import pandas as pd
10
+ from scipy.sparse import csr_matrix
11
+ from scipy.io import savemat
10
12
  import junshan_kit.DataProcessor
11
13
  import junshan_kit.kit
12
14
  from sklearn.preprocessing import StandardScaler
13
15
 
14
16
  #----------------------------------------------------------
15
17
  def _download_data(data_name, data_type):
18
+ """
19
+ Download and extract a dataset from Jianguoyun using either Firefox or Chrome automation.
20
+
21
+ This helper function allows the user to manually provide a Jianguoyun download link,
22
+ choose a browser (Firefox or Chrome) for automated downloading, and automatically unzip the downloaded dataset into a structured local directory.
23
+
24
+ Args:
25
+ data_name (str):
26
+ The name of the dataset (used as a folder name for storage).
27
+
28
+ data_type (str):
29
+ The dataset category, e.g., "binary" or "multi".
30
+ Determines the subdirectory under './exp_data/'.
31
+
32
+ Raises:
33
+ ValueError:
34
+ If `data_type` is not one of the allowed options: ["binary", "multi"].
35
+
36
+ Behavior:
37
+ - Prompts the user to input a Jianguoyun download URL.
38
+ - Lets the user select a download method (Firefox or Chrome).
39
+ - Downloads the `.zip` file into `./exp_data/{data_name}/`.
40
+ - Automatically extracts the zip file in the same directory.
41
+ - Prints progress and completion messages.
42
+
43
+ Example:
44
+ >>> _download_data("mnist", "binary")
45
+ Enter the Jianguoyun download URL: https://www.jianguoyun.com/p/abcd1234
46
+ Select download method:
47
+ 1. Firefox
48
+ 2. Chrome
49
+ Enter the number of your choice (1 or 2):
50
+
51
+ Note:
52
+ Requires `junshan_kit` with `JianguoyunDownloaderFirefox`,
53
+ `JianguoyunDownloaderChrome`, and `unzip_file` utilities available.
54
+ """
16
55
  allowed_types = ["binary", "multi"]
17
56
  if data_type not in allowed_types:
18
57
  raise ValueError(f"Invalid data_type: {data_type!r}. Must be one of {allowed_types}.")
@@ -29,43 +68,78 @@ def _download_data(data_name, data_type):
29
68
  choice = input("Enter the number of your choice (1 or 2): ").strip()
30
69
 
31
70
  if choice == "1":
32
- JianguoyunDownloaderFirefox(url, f"./exp_data/{data_type}/{data_name}").run()
33
- print(" Download completed using Firefox")
71
+ JianguoyunDownloaderFirefox(url, f"./exp_data/{data_name}").run()
72
+ print("*** Download completed using Firefox ***")
34
73
  break
35
74
  elif choice == "2":
36
- JianguoyunDownloaderChrome(url, f"./exp_data/{data_type}/{data_name}").run()
37
- print(" Download completed using Chrome")
75
+ JianguoyunDownloaderChrome(url, f"./exp_data/{data_name}").run()
76
+ print("*** Download completed using Chrome ***")
38
77
  break
39
78
  else:
40
- print(" Invalid choice. Please enter 1 or 2.\n")
79
+ print("*** Invalid choice. Please enter 1 or 2 ***\n")
41
80
 
42
81
  # unzip file
43
- junshan_kit.kit.unzip_file(f'./exp_data/{data_type}/{data_name}/{data_name}.zip', f'./exp_data/{data_type}/{data_name}')
82
+ junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
44
83
 
45
84
  def _export_csv(df, data_name, data_type):
46
- path = f'./exp_data/{data_type}/{data_name}/'
85
+ path = f'./exp_data/{data_name}/'
47
86
  os.makedirs(path, exist_ok=True)
48
87
  df.to_csv(path + f'{data_name}_num.csv', index=False)
49
88
  print(path + f'{data_name}.csv')
50
89
 
51
90
 
52
- def _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, user_one_hot_cols = [], export_csv = False, time_info = None, Standard = False):
53
- if not os.path.exists(csv_path):
91
+ def _export_mat(df, data_name, label_col):
92
+ # Extract label and feature matrices
93
+ y = df[label_col].values # Target column
94
+ X = df.drop(columns=[label_col]).values # Feature matrix
95
+
96
+ # Convert to sparse matrices
97
+ X_sparse = csr_matrix(X)
98
+ Y_sparse = csr_matrix(y.reshape(-1, 1)) # Convert target to column sparse matrix
99
+
100
+ # Get number of samples and features
101
+ m, n = X.shape
102
+
103
+ # Save as a MAT file (supports large datasets)
104
+ save_path = f'exp_data/{data_name}/{data_name}.mat'
105
+ savemat(save_path, {'X': X_sparse, 'Y': Y_sparse, 'm': m, 'n': n}, do_compression=True)
106
+
107
+ # Print confirmation
108
+ print("Sparse MAT file saved to:", save_path)
109
+ print("Number of samples (m):", m)
110
+ print("Number of features (n):", n)
111
+
112
+
113
+ def _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, user_one_hot_cols = [], export_csv = False, time_info = None, df = None, missing_strategy = 'drop', Paras = None):
114
+
115
+ if csv_path is not None and not os.path.exists(csv_path):
54
116
  print('\n' + '*'*60)
55
117
  print(f"Please download the data.")
56
118
  print(csv_path)
57
- _download_data(data_name, data_type=data_type)
58
- # junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
59
-
119
+ _download_data(data_name, data_type=data_type)
120
+
121
+ if not os.path.exists(f"./exp_data/{data_name}"):
122
+ print('\n' + '*'*60)
123
+ print(f"Please download the data.")
124
+ print(f"./exp_data/{data_name}")
125
+ _download_data(data_name, data_type=data_type)
126
+
127
+ if df is None:
128
+ df = pd.read_csv(csv_path)
129
+
60
130
  cleaner = junshan_kit.DataProcessor.CSV_TO_Pandas()
61
- df = cleaner.preprocess_dataset(csv_path, drop_cols, label_col, label_map, data_name, user_one_hot_cols, print_info=print_info, time_info = time_info, Standard=Standard)
131
+ df = cleaner.preprocess_dataset(df, drop_cols, label_col, label_map, title_name=data_name, user_one_hot_cols=user_one_hot_cols, print_info=print_info, time_info = time_info, missing_strategy = missing_strategy)
62
132
 
63
133
  if export_csv:
64
134
  _export_csv(df, data_name, data_type)
135
+
136
+ if Paras is not None and Paras["export_mat"]:
137
+ _export_mat(df, data_name, label_col)
65
138
 
66
139
  return df
67
140
 
68
141
 
142
+
69
143
  # ********************************************************************
70
144
  """
71
145
  ----------------------------------------------------------------------
@@ -73,15 +147,15 @@ def _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_
73
147
  ----------------------------------------------------------------------
74
148
  """
75
149
 
76
- def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print_info = False, export_csv=False, drop_cols = [], Standard = False):
150
+ def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print_info = False, export_csv=False, drop_cols = []):
77
151
 
78
152
  data_type = "binary"
79
- csv_path = f'./exp_data/{data_type}/{data_name}/creditcard.csv'
153
+ csv_path = f'./exp_data/{data_name}/creditcard.csv'
80
154
  label_col = 'Class'
81
155
  label_map = {0: -1, 1: 1}
82
156
 
83
157
 
84
- df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, Standard = Standard)
158
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
85
159
 
86
160
 
87
161
  return df
@@ -89,11 +163,11 @@ def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print
89
163
 
90
164
  def diabetes_health_indicators(data_name = "Diabetes Health Indicators", print_info = False, export_csv = False, drop_cols = [], Standard = False):
91
165
  data_type = "binary"
92
- csv_path = f'./exp_data/{data_type}/{data_name}/diabetes_dataset.csv'
166
+ csv_path = f'./exp_data/{data_name}/diabetes_dataset.csv'
93
167
  label_col = 'diagnosed_diabetes'
94
168
  label_map = {0: -1, 1: 1}
95
169
 
96
- df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, Standard = Standard)
170
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
97
171
 
98
172
  return df
99
173
 
@@ -101,7 +175,7 @@ def diabetes_health_indicators(data_name = "Diabetes Health Indicators", print_i
101
175
  def electric_vehicle_population(data_name = "Electric Vehicle Population", print_info = False, export_csv = False, drop_cols = ['VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location'], Standard = False):
102
176
 
103
177
  data_type = "binary"
104
- csv_path = f'./exp_data/{data_type}/{data_name}/Electric_Vehicle_Population_Data.csv'
178
+ csv_path = f'./exp_data/{data_name}/Electric_Vehicle_Population_Data.csv'
105
179
  # drop_cols = ['VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location']
106
180
  label_col = 'Electric Vehicle Type'
107
181
  label_map = {
@@ -110,19 +184,19 @@ def electric_vehicle_population(data_name = "Electric Vehicle Population", print
110
184
  }
111
185
 
112
186
 
113
- df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, Standard = Standard)
187
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
114
188
 
115
189
  return df
116
190
 
117
191
  def global_house_purchase(data_name = "Global House Purchase", print_info = False, export_csv = False, drop_cols = ['property_id'], Standard =False):
118
192
 
119
193
  data_type = "binary"
120
- csv_path = f'./exp_data/{data_type}/{data_name}/global_house_purchase_dataset.csv'
194
+ csv_path = f'./exp_data/{data_name}/global_house_purchase_dataset.csv'
121
195
  label_col = 'decision'
122
196
  label_map = {0: -1, 1: 1}
123
197
 
124
198
 
125
- df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, Standard = Standard)
199
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
126
200
 
127
201
  return df
128
202
 
@@ -130,13 +204,13 @@ def global_house_purchase(data_name = "Global House Purchase", print_info = Fals
130
204
  def health_lifestyle(data_name = "Health Lifestyle", print_info = False, export_csv = False, drop_cols = ['id'], Standard =False):
131
205
 
132
206
  data_type = "binary"
133
- csv_path = f'./exp_data/{data_type}/{data_name}/health_lifestyle_dataset.csv'
207
+ csv_path = f'./exp_data/{data_name}/health_lifestyle_dataset.csv'
134
208
 
135
209
  label_col = 'disease_risk'
136
210
  label_map = {0: -1, 1: 1}
137
211
 
138
212
 
139
- df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, Standard = Standard)
213
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
140
214
 
141
215
  return df
142
216
 
@@ -149,13 +223,13 @@ def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Predic
149
223
  """
150
224
 
151
225
  data_type = "binary"
152
- csv_path = f'./exp_data/{data_type}/{data_name}/medical_insurance.csv'
226
+ csv_path = f'./exp_data/{data_name}/medical_insurance.csv'
153
227
 
154
228
  label_col = 'is_high_risk'
155
229
  label_map = {0: -1, 1: 1}
156
230
 
157
231
 
158
- df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, Standard = Standard)
232
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
159
233
 
160
234
  return df
161
235
 
@@ -163,13 +237,13 @@ def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Predic
163
237
  def particle_physics_event_classification(data_name = "Particle Physics Event Classification", print_info = False, export_csv = False, drop_cols = [], Standard =False):
164
238
 
165
239
  data_type = "binary"
166
- csv_path = f'./exp_data/{data_type}/{data_name}/Particle Physics Event Classification.csv'
240
+ csv_path = f'./exp_data/{data_name}/Particle Physics Event Classification.csv'
167
241
 
168
242
  label_col = 'Label'
169
243
  label_map = {'s': -1, 'b': 1}
170
244
 
171
245
 
172
- df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, Standard = Standard)
246
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
173
247
 
174
248
  return df
175
249
 
@@ -178,13 +252,13 @@ def particle_physics_event_classification(data_name = "Particle Physics Event Cl
178
252
  def adult_income_prediction(data_name = "Adult Income Prediction", print_info = False, export_csv=False, drop_cols = [], Standard = False):
179
253
 
180
254
  data_type = "binary"
181
- csv_path = f'./exp_data/{data_type}/{data_name}/adult.csv'
255
+ csv_path = f'./exp_data/{data_name}/adult.csv'
182
256
 
183
257
  label_col = 'income'
184
258
  label_map = {'<=50K': -1, '>50K': 1}
185
259
 
186
260
 
187
- df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, Standard = Standard)
261
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
188
262
 
189
263
  return df
190
264
 
@@ -192,7 +266,7 @@ def adult_income_prediction(data_name = "Adult Income Prediction", print_info =
192
266
  def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info = False, export_csv = False, drop_cols = ['Unnamed: 0'], Standard = False):
193
267
 
194
268
  data_type = "binary"
195
- csv_path = f'./exp_data/{data_type}/{data_name}/TNweather_1.8M.csv'
269
+ csv_path = f'./exp_data/{data_name}/TNweather_1.8M.csv'
196
270
 
197
271
  label_col = 'rain_tomorrow'
198
272
  label_map = {0: -1, 1: 1}
@@ -202,15 +276,15 @@ def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info =
202
276
  'trans_type': 0
203
277
  }
204
278
 
205
- df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, time_info=time_info, Standard = Standard)
279
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, time_info=time_info)
206
280
 
207
281
 
208
282
  return df
209
283
 
210
- def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = False, export_csv = False, drop_cols = ['user_id'], Standard = False):
284
+ def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = False, export_csv = False, drop_cols = ['user_id']):
211
285
 
212
286
  data_type = "binary"
213
- csv_path = f'./exp_data/{data_type}/{data_name}/youtube recommendation dataset.csv'
287
+ csv_path = f'./exp_data/{data_name}/youtube recommendation dataset.csv'
214
288
 
215
289
  label_col = 'subscribed_after'
216
290
  label_map = {0: -1, 1: 1}
@@ -218,11 +292,86 @@ def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = Fa
218
292
  # Extraction mode.
219
293
  # - 0 : Extract ['year', 'month', 'day', 'hour']
220
294
  # - 1 : Extract ['hour', 'dayofweek', 'is_weekend']
295
+ # - 2 : Extract ['year', 'month', 'day']
221
296
  time_info = {
222
297
  'time_col_name': 'timestamp',
223
298
  'trans_type': 1
224
299
  }
225
300
 
226
- df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, time_info=time_info, Standard = Standard)
301
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, time_info=time_info)
302
+
303
+ return df
304
+
305
+
306
+ def Santander_Customer_Satisfaction(data_name = "SantanderCustomerSatisfaction", print_info = False, export_csv = False):
307
+ data_type = "binary"
308
+ csv_path = None
309
+
310
+ drop_cols = ['ID_code']
311
+ label_col = 'target'
312
+ label_map = {False: -1, True: 1}
313
+
314
+ df, y, categorical_indicator, attribute_names = junshan_kit.kit.download_openml_data(data_name)
315
+
316
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, df=df)
317
+
318
+ return df
319
+
320
+
321
+ def newsgroups_drift(data_name = "20_newsgroups.drift", print_info = False, export_csv = False):
322
+ data_type = "binary"
323
+ csv_path = None
324
+
325
+ drop_cols = ['ID_code']
326
+ label_col = 'target'
327
+ label_map = {False: -1, True: 1}
328
+
329
+ df, y, categorical_indicator, attribute_names = junshan_kit.kit.download_openml_data(data_name)
330
+
331
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, df=df)
332
+
333
+ return df
334
+
335
+
336
+ def Homesite_Quote_Conversion(data_name = "Homesite_Quote_Conversion", print_info = False, export_csv = False):
337
+ data_type = "binary"
338
+ csv_path = None
339
+ missing_strategy = 'mode'
340
+
341
+ drop_cols = ['QuoteNumber']
342
+ label_col = 'QuoteConversion_Flag'
343
+ label_map = {0: -1, 1: 1}
344
+
345
+ time_info = {
346
+ 'time_col_name': 'Original_Quote_Date',
347
+ 'trans_type': 2
348
+ }
349
+
350
+ df, y, categorical_indicator, attribute_names = junshan_kit.kit.download_openml_data(data_name)
351
+
352
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, df=df, time_info = time_info, missing_strategy = missing_strategy)
353
+
354
+ return df
355
+
356
+
357
+ def IEEE_CIS_Fraud_Detection(data_name = "IEEE-CIS_Fraud_Detection", print_info = False, export_csv = False, export_mat = False):
358
+ data_type = "binary"
359
+ csv_path = None
360
+ missing_strategy = 'mode'
361
+
362
+ drop_cols = ['TransactionID']
363
+ label_col = 'isFraud'
364
+ label_map = {0: -1, 1: 1}
365
+
366
+ Paras = {
367
+ "export_mat": export_mat
368
+ }
369
+
370
+ df, y, categorical_indicator, attribute_names = junshan_kit.kit.download_openml_data(data_name)
371
+
372
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, df=df, missing_strategy = missing_strategy, Paras = Paras)
373
+
374
+ return df
375
+
376
+
227
377
 
228
- return df
@@ -202,6 +202,11 @@ def Build_LogRegressionBinaryL2_w8a():
202
202
  nn.Flatten(),
203
203
  nn.Linear(300, 1))
204
204
 
205
+ # ---------------------------------------------------------
206
+ def Build_LogRegressionBinaryL2_Adult_Income_Prediction():
207
+ pass
205
208
 
206
209
 
210
+ def Build_LogRegressionBinaryL2_Credit_Card_Fraud_Detection():
211
+ pass
207
212