mlgear-0.3-py3-none-any.whl → mlgear-0.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mlgear/cv.py CHANGED
@@ -3,8 +3,10 @@ import pandas as pd
 
 from sklearn.model_selection import KFold
 
+from mlgear.utils import print_step
 
-def run_cv_model(train, test=None, target=None, model_fn=None, params={}, eval_fn=None, label='model', n_folds=5, fold_splits=None, classes=1, stop_on_fold=None, train_on_full=False):
+
+def run_cv_model(train, test=None, target=None, model_fn=None, params={}, eval_fn=None, label='model', n_folds=5, fold_splits=None, classes=1, stop_on_fold=None, train_on_full=False, verbose=True):
     if target is None:
         raise ValueError('Target is needed.')
     if model_fn is None:
@@ -25,57 +27,64 @@ def run_cv_model(train, test=None, target=None, model_fn=None, params={}, eval_f
     feature_importance_df = pd.DataFrame()
     i = 1
     for dev_index, val_index in fold_splits:
-        print('Started ' + label + ' fold ' + str(i) + '/' + str(n_folds))
-        if isinstance(train, pd.DataFrame):
-            dev_X, val_X = train.iloc[dev_index], train.iloc[val_index]
-        else:
-            dev_X, val_X = train[dev_index], train[val_index]
-        dev_y, val_y = target[dev_index], target[val_index]
-        params2 = params.copy()
-        meta = {'dev_index': dev_index,
-                'val_index': val_index,
-                'fold': i,
-                'label': label}
-        pred_val_y, pred_test_y, importances, model = model_fn(dev_X, dev_y, val_X, val_y, test, params2, meta)
-        if test is not None:
-            pred_full_test = pred_full_test + pred_test_y
-        pred_train[val_index] = pred_val_y
-        if eval_fn is not None:
-            cv_score = eval_fn(val_y, pred_val_y)
-            cv_scores.append(cv_score)
-            print(label + ' cv score {}: {}'.format(i, cv_score))
-        models[i] = model
-        if importances is not None and isinstance(train, pd.DataFrame):
-            fold_importance_df = pd.DataFrame()
-            fold_importance_df['feature'] = train.columns.values
-            fold_importance_df['importance'] = importances
-            fold_importance_df['fold'] = i
-            feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
-        if stop_on_fold and stop_on_fold == i:
-            results = {'label': label,
-                       'train': pred_train,
-                       'cv': cv_scores,
-                       'importance': feature_importance_df,
-                       'model': models}
-            if test is not None:
-                results['test'] = pred_full_test
-            return results
-        i += 1
+        if verbose:
+            print_step('Started ' + label + ' fold ' + str(i) + '/' + str(n_folds))
+        if isinstance(train, pd.DataFrame):
+            dev_X, val_X = train.iloc[dev_index], train.iloc[val_index]
+        else:
+            dev_X, val_X = train[dev_index], train[val_index]
+        dev_y, val_y = target[dev_index], target[val_index]
+        params2 = params.copy()
+        meta = {'dev_index': dev_index,
+                'val_index': val_index,
+                'fold': i,
+                'label': label}
+        pred_val_y, pred_test_y, importances, model = model_fn(dev_X, dev_y, val_X, val_y, test, params2, meta, verbose=verbose)
+        if test is not None:
+            pred_full_test = pred_full_test + pred_test_y
+        pred_train[val_index] = pred_val_y
+        if eval_fn is not None:
+            cv_score = eval_fn(val_y, pred_val_y)
+            cv_scores.append(cv_score)
+            if verbose:
+                print_step(label + ' cv score {}: {}'.format(i, cv_score))
+        models[i] = model
+        if importances is not None and isinstance(train, pd.DataFrame):
+            fold_importance_df = pd.DataFrame()
+            if params.get('group') is None:
+                fold_importance_df['feature'] = train.columns.values
+            else:
+                fold_importance_df['feature'] = [c for c in train.columns.values if c != params['group']]
+            fold_importance_df['importance'] = importances
+            fold_importance_df['fold'] = i
+            feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
+        if stop_on_fold and stop_on_fold == i:
+            results = {'label': label,
+                       'train': pred_train,
+                       'cv': cv_scores,
+                       'importance': feature_importance_df,
+                       'model': models}
+            if test is not None:
+                results['test'] = pred_full_test
+            return results
+        i += 1
 
     if train_on_full:
-        print('## Training on full ##')
-        params2 = params.copy()
-        _, pred_full_test, importances, model = model_fn(train, target, None, None, test, params2)
-        models['full'] = model
+        if verbose:
+            print_step('## Training on full ##')
+        params2 = params.copy()
+        _, pred_full_test, importances, model = model_fn(train, target, None, None, test, params2, verbose=verbose)
+        models['full'] = model
     elif test is not None:
-        pred_full_test = pred_full_test / n_folds
+        pred_full_test = pred_full_test / n_folds
 
     final_cv = eval_fn(target, pred_train) if eval_fn else None
 
-    print('{} cv scores : {}'.format(label, cv_scores))
-    print('{} cv mean score : {}'.format(label, np.mean(cv_scores)))
-    print('{} cv total score : {}'.format(label, final_cv))
-    print('{} cv std score : {}'.format(label, np.std(cv_scores)))
+    if verbose:
+        print_step('{} cv scores : {}'.format(label, cv_scores))
+        print_step('{} cv mean score : {}'.format(label, np.mean(cv_scores)))
+        print_step('{} cv total score : {}'.format(label, final_cv))
+        print_step('{} cv std score : {}'.format(label, np.std(cv_scores)))
 
     results = {'label': label,
                'train': pred_train,
@@ -84,5 +93,5 @@ def run_cv_model(train, test=None, target=None, model_fn=None, params={}, eval_f
                'importance': feature_importance_df,
                'model': models}
     if test is not None:
-        results['test'] = pred_full_test
+        results['test'] = pred_full_test
     return results
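The new `verbose` flag threads from `run_cv_model` into each `model_fn`, so a custom model function now has to accept a `verbose` keyword. Below is a minimal sketch of the calling convention implied by the diff; the `run_ridge` and `rmse` helpers are illustrative stand-ins, not part of mlgear.

```python
# Sketch of calling run_cv_model with the verbose flag added in 0.5.
# Assumes only what the diff shows: model_fn receives
# (dev_X, dev_y, val_X, val_y, test, params, meta, verbose=...) and returns
# (val_preds, test_preds, importances, model).
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge

from mlgear.cv import run_cv_model


def rmse(y_true, y_pred):
    return float(np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2)))


def run_ridge(dev_X, dev_y, val_X, val_y, test_X, params, meta=None, verbose=True):
    model = Ridge(**params)
    model.fit(dev_X, dev_y)
    val_pred = model.predict(val_X)
    test_pred = model.predict(test_X) if test_X is not None else None
    return val_pred, test_pred, None, model


train = pd.DataFrame(np.random.rand(100, 3), columns=['a', 'b', 'c'])
target = 2 * train['a'].values + np.random.rand(100)
test = pd.DataFrame(np.random.rand(20, 3), columns=['a', 'b', 'c'])

# verbose=False suppresses the per-fold print_step logging introduced in 0.5.
results = run_cv_model(train, test, target, run_ridge, {'alpha': 1.0},
                       rmse, n_folds=5, verbose=False)
print(results['cv'])
```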
mlgear/models.py CHANGED
@@ -8,17 +8,16 @@ from sklearn.preprocessing import StandardScaler
 from mlgear.utils import print_step
 
 
-def runLGB(train_X, train_y, test_X=None, test_y=None, test_X2=None, params={}, meta=None):
-    print('Prep LGB')
-    d_train = lgb.Dataset(train_X, label=train_y)
-    if test_X is not None:
-        d_valid = lgb.Dataset(test_X, label=test_y)
-        watchlist = [d_train, d_valid]
+def runLGB(train_X, train_y, test_X=None, test_y=None, test_X2=None, params={}, meta=None, verbose=True):
+    if verbose:
+        print_step('Prep LGB')
+
+    if params.get('group'):
+        group = params.pop('group')
     else:
-        watchlist = [d_train]
-    print('Train LGB')
+        group = None
+
     num_rounds = params.pop('num_rounds')
-    verbose_eval = params.pop('verbose_eval')
     early_stop = None
     if params.get('early_stop'):
         early_stop = params.pop('early_stop')
@@ -35,6 +34,31 @@ def runLGB(train_X, train_y, test_X=None, test_y=None, test_X2=None, params={},
     else:
         feval = None
 
+    if group is None:
+        d_train = lgb.Dataset(train_X, label=train_y)
+    else:
+        d_train = lgb.Dataset(train_X.drop(group, axis=1),
+                              label=train_y,
+                              group=train_X.groupby(group).size().to_numpy())
+
+    if test_X is not None:
+        if group is None:
+            d_valid = lgb.Dataset(test_X, label=test_y)
+        else:
+            d_valid = lgb.Dataset(test_X.drop(group, axis=1),
+                                  label=test_y,
+                                  group=test_X.groupby(group).size().to_numpy())
+            test_X = test_X.drop(group, axis=1)
+        watchlist = [d_train, d_valid]
+    else:
+        watchlist = [d_train]
+
+    if test_X2 is not None and group is not None:
+        test_X2 = test_X2.drop(group, axis=1)
+
+    if verbose:
+        print_step('Train LGB')
+
     preds_test_y = []
     preds_test_y2 = []
     for b in range(nbag):
@@ -43,16 +67,16 @@
                           train_set=d_train,
                           num_boost_round=num_rounds,
                           valid_sets=watchlist,
-                          verbose_eval=verbose_eval,
-                          early_stopping_rounds=early_stop,
-                          categorical_feature=cat_cols,
+                          callbacks=[lgb.early_stopping(stopping_rounds=early_stop)] if early_stop else [],
                           feval=feval)
         if test_X is not None:
-            print('Predict 1/2')
+            if verbose:
+                print_step('Predict 1/2')
            pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
            preds_test_y += [pred_test_y]
         if test_X2 is not None:
-            print('Predict 2/2')
+            if verbose:
+                print_step('Predict 2/2')
            pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)
            preds_test_y2 += [pred_test_y2]
 
@@ -71,21 +95,22 @@ def get_lgb_feature_importance(train, target, params):
     train_d = lgb.Dataset(train, label=target)
     lgb_params2 = params.copy()
     rounds = lgb_params2.pop('num_rounds', 400)
-    verbose_eval = lgb_params2.pop('verbose_eval', 100)
-    model = lgb.train(lgb_params2, train_d, rounds, valid_sets = [train_d], verbose_eval=verbose_eval)
+    model = lgb.train(lgb_params2, train_d, rounds, valid_sets = [train_d])
     feature_df = pd.DataFrame(sorted(zip(model.feature_importance(), train.columns)),
                               columns=['Value', 'Feature']).sort_values('Value', ascending=False)
     return feature_df
 
 
-def runMLP(train_X, train_y, test_X=None, test_y=None, test_X2=None, params={}, meta=None):
-    print('Define Model')
+def runMLP(train_X, train_y, test_X=None, test_y=None, test_X2=None, params={}, meta=None, verbose=True):
+    if verbose:
+        print_step('Define Model')
     model = params['model'](params['input_size'])
     es = params['early_stopper']()
     es.set_model(model)
     metric = params['metric']
     metric = metric(model, [es], [(train_X, train_y), (test_X, test_y)])
-    print('Fit MLP')
+    if verbose:
+        print_step('Fit MLP')
     model.fit(train_X, train_y,
               verbose=params.get('model_verbose', 0),
               callbacks=[metric] + params['lr_scheduler'](),
@@ -93,12 +118,14 @@ def runMLP(train_X, train_y, test_X=None, test_y=None, test_X2=None, params={},
               validation_data=(test_X, test_y),
               batch_size=params.get('batch_size', 128))
     if test_X is not None:
-        print('MLP Predict 1/2')
+        if verbose:
+            print_step('MLP Predict 1/2')
         pred_test_y = model.predict(test_X)
     else:
         pred_test_y = None
     if test_X2 is not None:
-        print('MLP Predict 2/2')
+        if verbose:
+            print_step('MLP Predict 2/2')
         pred_test_y2 = model.predict(test_X2)
     else:
         pred_test_y2 = None
@@ -106,10 +133,11 @@ def runMLP(train_X, train_y, test_X=None, test_y=None, test_X2=None, params={},
     return pred_test_y, pred_test_y2, None, model
 
 
-def runLR(train_X, train_y, test_X=None, test_y=None, test_X2=None, params={}, meta=None):
+def runLR(train_X, train_y, test_X=None, test_y=None, test_X2=None, params={}, meta=None, verbose=True):
     params['random_state'] = 42
     if params.get('scale'):
-        print_step('Scale')
+        if verbose:
+            print_step('Scale')
         params.pop('scale')
         scaler = StandardScaler()
         scaler.fit(train_X.values)
@@ -119,33 +147,39 @@ def runLR(train_X, train_y, test_X=None, test_y=None, test_X2=None, params={}, m
         if test_X2 is not None:
            test_X2 = scaler.transform(test_X2.values)
 
-    print_step('Train LR')
+    if verbose:
+        print_step('Train LR')
     model = LogisticRegression(**params)
     model.fit(train_X, train_y)
     if test_X is not None:
-        print_step('Predict 1/2')
+        if verbose:
+            print_step('Predict 1/2')
         pred_test_y = model.predict_proba(test_X)[:, 1]
     else:
         pred_test_y = None
     if test_X2 is not None:
-        print_step('Predict 2/2')
+        if verbose:
+            print_step('Predict 2/2')
         pred_test_y2 = model.predict_proba(test_X2)[:, 1]
     else:
         pred_test_y2 = None
     return pred_test_y, pred_test_y2, model.coef_, model
 
 
-def runRidge(train_X, train_y, test_X=None, test_y=None, test_X2=None, params={}, meta=None):
+def runRidge(train_X, train_y, test_X=None, test_y=None, test_X2=None, params={}, meta=None, verbose=True):
     model = Ridge(**params)
-    print_step('Fit Ridge')
+    if verbose:
+        print_step('Fit Ridge')
     model.fit(train_X, train_y)
     if test_X is not None:
-        print_step('Ridge Predict 1/2')
+        if verbose:
+            print_step('Ridge Predict 1/2')
         pred_test_y = model.predict(test_X)
     else:
         pred_test_y = None
     if test_X2 is not None:
-        print_step('Ridge Predict 2/2')
+        if verbose:
+            print_step('Ridge Predict 2/2')
         pred_test_y2 = model.predict(test_X2)
     else:
         pred_test_y2 = None
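The new `group` hook in `runLGB` targets LightGBM ranking objectives: when `params` contain a `'group'` key, the named column is used to compute per-query group sizes for `lgb.Dataset` and is dropped from the feature matrix before training and prediction. A rough usage sketch under those assumptions follows; the `query_id` column name and the lambdarank settings are illustrative only, and any other keys your version of `runLGB` pops from `params` (for example a bagging count `nbag`) would also need to be supplied.

```python
# Sketch of runLGB's 'group' option for ranking objectives (new in 0.5).
# Rows must be ordered so each query's rows are contiguous, because runLGB
# derives group sizes with train_X.groupby(group).size().
import numpy as np
import pandas as pd

from mlgear.models import runLGB

n_queries, per_query = 20, 10
train_X = pd.DataFrame({
    'query_id': np.repeat(np.arange(n_queries), per_query),  # hypothetical group column
    'f1': np.random.rand(n_queries * per_query),
    'f2': np.random.rand(n_queries * per_query),
})
train_y = np.random.randint(0, 3, size=len(train_X))  # graded relevance labels

valid_X = train_X.iloc[:50].copy()   # toy validation split made of whole queries
valid_y = train_y[:50]

lgb_params = {'objective': 'lambdarank',
              'metric': 'ndcg',
              'learning_rate': 0.1,
              'num_rounds': 50,
              'group': 'query_id'}   # consumed by runLGB, not passed to LightGBM

pred_valid, pred_test2, importances, model = runLGB(
    train_X, train_y, test_X=valid_X, test_y=valid_y,
    params=lgb_params, verbose=False)
```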
mlgear/utils.py CHANGED
@@ -14,7 +14,7 @@ def show(df, max_rows=10, max_cols=None, digits=6):
 
 
 def display_column(df, var):
-    if df[var].nunique() > 9 and (df[var].dtype == int or df[var].dtype == float):
+    if df[var].astype(str).nunique() > 9 and (df[var].dtype == int or df[var].dtype == float):
         print('Mean: {} Median: {} SD: {}'.format(df[var].mean(), df[var].median(), df[var].std()))
     else:
         print(df[var].value_counts(normalize=True) * 100)
@@ -36,3 +36,7 @@ def chunk(l, n):
     for i in range(0, len(l), n):
         out.append(l[i:i + n])
     return out
+
+
+def min_max(dat):
+    return (min(dat), max(dat))
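For reference, the new `min_max` helper simply wraps the built-ins; a one-line usage sketch:

```python
from mlgear.utils import min_max

print(min_max([3, 1, 4, 1, 5]))  # -> (1, 5)
```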
mlgear-0.3.dist-info/METADATA → mlgear-0.5.dist-info/METADATA CHANGED
@@ -1,23 +1,35 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.3
 Name: mlgear
-Version: 0.3
+Version: 0.5
 Summary: Utility scripts for machine learning
-Home-page: https://github.com/peterhurford/mlgear
+License: MIT
 Author: Peter Hurford
 Author-email: peter@peterhurford.com
-License: UNKNOWN
-Platform: UNKNOWN
+Requires-Python: >=3.7,<4.0
 Classifier: Development Status :: 3 - Alpha
-Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Requires-Dist: keras
+Requires-Dist: lightgbm
+Requires-Dist: numpy
+Requires-Dist: pandas
+Requires-Dist: scikit-learn
+Project-URL: Repository, https://github.com/peterhurford/mlgear
 Description-Content-Type: text/markdown
 
 ## MLGear
 
 Some utility functions to make ML with Python / Pandas / sklearn even easier
 
-#### Example Usage
+### Example Usage
 
 ```Python
 from mlgear.cv import run_cv_model
@@ -43,4 +55,25 @@ lgb_params = {'application': 'regression',
 results = run_cv_model(train, test, target, runLGB, lgb_params, rmse)
 ```
 
+### Installation
+
+```
+pip install mlgear
+```
+
+For development:
+
+```
+# Install poetry if you don't have it
+pip install poetry
+
+# Install dependencies
+poetry install
+
+# Build the package
+poetry build
+
+# Publish to PyPI
+poetry publish
+```
 
mlgear-0.5.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+mlgear/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mlgear/aggregators.py,sha256=k_GGL8zuBqpBKPYR_v14SqPcJyAUalHcQN2o7gjApiw,373
+mlgear/cv.py,sha256=Dk1ajLFITWgu5nQP-R1rpT-KQ9oYaQRhgSYqQi0IwX0,4009
+mlgear/encoders.py,sha256=_NRqTNSdCNhT04Odxa9xRQq7nrr9bTFRwskARx1rgnU,5044
+mlgear/lr_scheduler.py,sha256=S7DlTAWTzAUAQbmzm-yWIWI5r49Htz1jRBQ98IQHdgg,5272
+mlgear/metrics.py,sha256=_zQwjz4X3-vUQEqu2yIfq2w2XnuH8YUSD_M_u6szToo,1188
+mlgear/models.py,sha256=RtvmsjMFoecdce_ikj3fa9cukGdvQEj3Y72hz5Qw-fY,6249
+mlgear/tracker.py,sha256=U2OXm9tjAWSc5B5_-oTvj_YAJdpkU4nsmPE7tH8BSN4,447
+mlgear/utils.py,sha256=I72-qBgiisV1hcoUT5almb8GXwfmhTQgwvP6gl8kJEY,1096
+mlgear-0.5.dist-info/LICENSE.txt,sha256=qkKmWAzXQC3lYVyoucB3x4iW2xnGEmaORCB4ADTAik4,1081
+mlgear-0.5.dist-info/METADATA,sha256=sRhbP8yNadrRqDX3mY1wOKpjbWD2DPnLbwmibEFYvYc,1994
+mlgear-0.5.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+mlgear-0.5.dist-info/RECORD,,
mlgear-0.3.dist-info/WHEEL → mlgear-0.5.dist-info/WHEEL CHANGED
@@ -1,5 +1,4 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.34.2)
+Generator: poetry-core 2.1.2
 Root-Is-Purelib: true
 Tag: py3-none-any
-
mlgear-0.3.dist-info/RECORD REMOVED
@@ -1,14 +0,0 @@
-mlgear/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mlgear/aggregators.py,sha256=k_GGL8zuBqpBKPYR_v14SqPcJyAUalHcQN2o7gjApiw,373
-mlgear/cv.py,sha256=m3BmiJ2evbyj1iCaIp6a4neiyLsFQKGnH51J48yXfBI,3567
-mlgear/encoders.py,sha256=_NRqTNSdCNhT04Odxa9xRQq7nrr9bTFRwskARx1rgnU,5044
-mlgear/lr_scheduler.py,sha256=S7DlTAWTzAUAQbmzm-yWIWI5r49Htz1jRBQ98IQHdgg,5272
-mlgear/metrics.py,sha256=_zQwjz4X3-vUQEqu2yIfq2w2XnuH8YUSD_M_u6szToo,1188
-mlgear/models.py,sha256=8zM44ti8vUpyotXpWj2QyVhPWVJrUjbN5l2jq2cKCHw,5300
-mlgear/tracker.py,sha256=U2OXm9tjAWSc5B5_-oTvj_YAJdpkU4nsmPE7tH8BSN4,447
-mlgear/utils.py,sha256=E8lb0gsTf4tun7PHUQ5GFxwwxY3ZRxwIMzNCTZCb1rM,1032
-mlgear-0.3.dist-info/LICENSE.txt,sha256=qkKmWAzXQC3lYVyoucB3x4iW2xnGEmaORCB4ADTAik4,1081
-mlgear-0.3.dist-info/METADATA,sha256=FSIlAsK1Zl1Ad268a_Ryd4NBXTv5EZ0JnB5_AgOGjWU,1269
-mlgear-0.3.dist-info/WHEEL,sha256=g4nMs7d-Xl9-xC9XovUrsDHGXt-FT0E17Yqo92DEfvY,92
-mlgear-0.3.dist-info/top_level.txt,sha256=TM51_lbw1nIKS5TvY-qVQEBGw1tMrgBKGUcB5BISu-Y,7
-mlgear-0.3.dist-info/RECORD,,
mlgear-0.3.dist-info/top_level.txt REMOVED
@@ -1 +0,0 @@
-mlgear