mlba 1.9.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlba-1.9.10/LICENSE +21 -0
- mlba-1.9.10/PKG-INFO +35 -0
- mlba-1.9.10/README.md +20 -0
- mlba-1.9.10/setup.cfg +4 -0
- mlba-1.9.10/setup.py +47 -0
- mlba-1.9.10/src/mlba/__init__.py +20 -0
- mlba-1.9.10/src/mlba/csvFiles/Airfares.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Amtrak.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/ApplianceShipments.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/AustralianWines.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/AutoAndElectronics.zip +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Bankruptcy.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/BareggTunnel.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/BathSoapHousehold.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/BostonHousing.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/CanadianWorkHours.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/CatalogCrossSell.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Cereals.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/CharlesBookClub.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Cosmetics.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Coursetopics.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/DepartmentStoreSales.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/EastWestAirlinesCluster.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/EastWestAirlinesNN.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/EbayTreemap.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Faceplate.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/FlightDelays.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Fundraising.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/FutureFundraising.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/GermanCredit.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Hair-Care-Product.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/LaptopSales.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/LaptopSalesJanuary2008.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/NYPD_Motor_Vehicle_Collisions_1000.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/NaturalGasSales.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Pharmaceuticals.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/RidingMowers.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/SC-US-students-GPS-data-2016.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/SP500.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Sept11Travel.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/ShampooSales.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/SouvenirSales.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Spambase.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/SystemAdministrators.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Taxi-cancellation-case.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Tayko.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/TinyData.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/ToyotaCorolla.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/ToysRUsRevenues.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/UniversalBank.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Universities.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Utilities.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Veerhoven.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Voter-Persuasion.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/WalMartStock.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/WestRoxbury.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/Wine.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/accidents.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/accidentsFull.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/accidentsnn.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/banks.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/bicup2006.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/courserating.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/drug.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/eBayAuctions.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/eBayNetwork.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/farm-ads.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/gdp.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/liftExample.csv.gz +0 -0
- mlba-1.9.10/src/mlba/csvFiles/ownerExample.csv.gz +0 -0
- mlba-1.9.10/src/mlba/data.py +32 -0
- mlba-1.9.10/src/mlba/featureSelection.py +191 -0
- mlba-1.9.10/src/mlba/graphs.py +173 -0
- mlba-1.9.10/src/mlba/metric.py +134 -0
- mlba-1.9.10/src/mlba/tests/__init__.py +0 -0
- mlba-1.9.10/src/mlba/tests/test_data.py +47 -0
- mlba-1.9.10/src/mlba/tests/test_graphs.py +45 -0
- mlba-1.9.10/src/mlba/tests/test_metric.py +135 -0
- mlba-1.9.10/src/mlba/tests/test_textMining.py +34 -0
- mlba-1.9.10/src/mlba/textMining.py +19 -0
- mlba-1.9.10/src/mlba/version.py +7 -0
- mlba-1.9.10/src/mlba.egg-info/PKG-INFO +35 -0
- mlba-1.9.10/src/mlba.egg-info/SOURCES.txt +83 -0
- mlba-1.9.10/src/mlba.egg-info/dependency_links.txt +1 -0
- mlba-1.9.10/src/mlba.egg-info/top_level.txt +1 -0
mlba-1.9.10/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) Galit Shmueli, Peter C. Bruce, Peter Gedeck
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
mlba-1.9.10/PKG-INFO
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: mlba
|
|
3
|
+
Version: 1.9.10
|
|
4
|
+
Summary: Utility functions for 'Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python'
|
|
5
|
+
Home-page: https://github.com/gedeck/mlba-python
|
|
6
|
+
Author: Peter Gedeck
|
|
7
|
+
Author-email: mail@petergedeck.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
|
|
16
|
+
# Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python
|
|
17
|
+
|
|
18
|
+
Collection of utility functions used in the book
|
|
19
|
+
|
|
20
|
+
> _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition)
|
|
21
|
+
> Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel
|
|
22
|
+
> (c) 2019 John Wiley & Sons, Inc.
|
|
23
|
+
|
|
24
|
+
Code for the book and the used datasets are available on https://www.dataminingbook.com/book/python-edition.
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
## Installation
|
|
28
|
+
Use `pip` to install the `mlba` package from pypi (https://pypi.org/project/mlba/).
|
|
29
|
+
```
|
|
30
|
+
pip install mlba
|
|
31
|
+
```
|
|
32
|
+
Should this not work, for example when you are behind a firewall, download the package from pypi and install from file, e.g.
|
|
33
|
+
```
|
|
34
|
+
pip install mlba-1.9.10.tar.gz
|
|
35
|
+
```
|
mlba-1.9.10/README.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python
|
|
2
|
+
|
|
3
|
+
Collection of utility functions used in the book
|
|
4
|
+
|
|
5
|
+
> _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition)
|
|
6
|
+
> Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel
|
|
7
|
+
> (c) 2019 John Wiley & Sons, Inc.
|
|
8
|
+
|
|
9
|
+
Code for the book and the used datasets are available on https://www.dataminingbook.com/book/python-edition.
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
Use `pip` to install the `mlba` package from pypi (https://pypi.org/project/mlba/).
|
|
14
|
+
```
|
|
15
|
+
pip install mlba
|
|
16
|
+
```
|
|
17
|
+
Should this not work, for example when you are behind a firewall, download the package from pypi and install from file, e.g.
|
|
18
|
+
```
|
|
19
|
+
pip install mlba-1.9.10.tar.gz
|
|
20
|
+
```
|
mlba-1.9.10/setup.cfg
ADDED
mlba-1.9.10/setup.py
ADDED
|
@@ -0,0 +1,47 @@
'''
Utility functions for "Data Mining for Business Analytics: Concepts, Techniques, and
Applications in Python"

(c) 2019, 2024 Galit Shmueli, Peter C. Bruce, Peter Gedeck
'''
from pathlib import Path

import setuptools


def getVersion():
    """ Extract __version__ from src/mlba/version.py without importing the package """
    version_file = Path(__file__).parent / 'src' / 'mlba' / 'version.py'
    lines = version_file.read_text().split('\n')
    # take the first line that mentions __version__ and pull out the quoted value
    version_line = [line for line in lines if '__version__' in line][0]
    return version_line.split('=')[1].strip().strip("'")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# use the README as the long description shown on PyPI
with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="mlba",
    version=getVersion(),
    author="Peter Gedeck",
    author_email="mail@petergedeck.com",
    description="Utility functions for 'Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python'",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/gedeck/mlba-python",
    # src-layout: packages live under src/
    packages=setuptools.find_packages("src"),
    package_dir={'': 'src'},
    # ship the compressed sample data files with the package
    package_data={
        "mlba": ["csvFiles/*.csv.gz", "csvFiles/*.zip"],
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],

    # NOTE(review): nose is unmaintained and does not work on Python >= 3.9;
    # consider migrating the test runner to pytest
    test_suite='nose.collector',
    tests_require=['nose'],
)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
'''
Utility functions for "Data Mining for Business Analytics: Concepts, Techniques, and
Applications in Python"

(c) 2019, 2024 Galit Shmueli, Peter C. Bruce, Peter Gedeck
'''
import os
import matplotlib as mpl
from .version import __version__

# On headless non-Windows systems fall back to the non-interactive Agg backend.
# This must run BEFORE the submodule imports below, which pull in pyplot.
if os.environ.get('DISPLAY', '') == '' and os.name != 'nt':
    print('no display found. Using non-interactive Agg backend')
    mpl.use('Agg')

# re-export the public API at package level
from .featureSelection import exhaustive_search, forward_selection, backward_elimination, stepwise_selection
from .graphs import plotDecisionTree, liftChart, gainsChart, textDecisionTree
from .metric import regressionSummary, classificationSummary
from .metric import AIC_score, BIC_score, adjusted_r2_score
from .textMining import printTermDocumentMatrix
from .data import load_data, get_data_file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
'''
Created on Jun 12, 2020

@author: gedeck
'''
from pathlib import Path

import pandas as pd


# directory holding the gzip-compressed CSV files bundled with the package
DATA_DIR = Path(__file__).parent / 'csvFiles'


def load_data(name, **kwargs):
    """ Returns the data either as a Pandas data frame or series

    Input:
        name: name of the data set, with or without '.csv' / '.csv.gz' suffix
        kwargs: additional keyword arguments passed on to pandas.read_csv

    Raises:
        ValueError: if no bundled data file of that name exists
    """
    data_file = get_data_file(name)
    if not data_file.exists():
        # f-string prefix was missing in the original, so the message
        # contained the literal text '{name}' instead of the file name
        raise ValueError(f'Data file {name} not found')
    data = pd.read_csv(data_file, **kwargs)
    if data.shape[1] == 1:
        # single-column files are more convenient as a Series
        return data[data.columns[0]]
    return data


def get_data_file(name):
    """ Return the path to the bundled data file for the given name

    '.zip' files are returned unchanged; all other names are normalized
    to '<name>.csv.gz' inside the package's csvFiles directory.
    """
    if name.endswith('.zip'):
        return DATA_DIR / name
    if name.endswith('.gz'):
        name = name[:-3]
    if name.endswith('.csv'):
        name = name[:-4]
    return DATA_DIR / f'{name}.csv.gz'
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
'''
Utility functions for "Data Mining for Business Analytics: Concepts, Techniques, and
Applications in Python"

(c) 2019, 2024 Galit Shmueli, Peter C. Bruce, Peter Gedeck
'''
import itertools


def exhaustive_search(variables, train_model, score_model):
    """ Variable selection using exhaustive search over all variable subsets

    (docstring fixed: it previously claimed "backward elimination")

    Input:
        variables: complete list of variables to consider in model building
        train_model: function that returns a fitted model for a given set of variables
        score_model: function that returns the score of a model; better models have lower scores

    Returns:
        List of best subset models for increasing number of variables
    """
    # create models of increasing size and determine the best models in each case
    result = []
    for nvariables in range(1, len(variables) + 1):
        best_subset = None
        best_score = None
        best_model = None
        for subset in itertools.combinations(variables, nvariables):
            subset = list(subset)
            subset_model = train_model(subset)
            subset_score = score_model(subset_model, subset)
            # keep the first subset seen, or any later subset with a strictly lower score
            if best_subset is None or best_score > subset_score:
                best_subset = subset
                best_score = subset_score
                best_model = subset_model
        result.append({
            'n': nvariables,
            'variables': best_subset,
            'score': best_score,
            'model': best_model,
        })
    return result
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def backward_elimination(variables, train_model, score_model, verbose=False):
    """ Variable selection using backward elimination

    Input:
        variables: complete list of variables to consider in model building
        train_model: function that returns a fitted model for a given set of variables
        score_model: function that returns the score of a model; better models have lower scores

    Returns:
        (best_model, best_variables)
    """
    # start from the full model and drop one variable per round
    best_variables = list(variables)
    best_model = train_model(best_variables)
    best_score = score_model(best_model, best_variables)
    if verbose:
        print('Variables: ' + ', '.join(variables))
        print('Start: score={:.2f}'.format(best_score))

    while len(best_variables) > 1:
        # candidates are (score, removed variable, model); keeping the current
        # model corresponds to removing nothing (None)
        candidates = [(best_score, None, best_model)]
        for candidate in best_variables:
            reduced = list(best_variables)
            reduced.remove(candidate)
            candidate_model = train_model(reduced)
            candidates.append((score_model(candidate_model, reduced), candidate, candidate_model))

        # pick the lowest score; min() returns the first minimum, matching the
        # behavior of a stable sort followed by taking the first element
        best_score, removed, best_model = min(candidates, key=lambda entry: entry[0])
        if verbose:
            print('Step: score={:.2f}, remove {}'.format(
                best_score, removed))
        if removed is None:
            # stop here, as removing more variables is detrimental to performance
            break
        best_variables.remove(removed)
    return best_model, best_variables
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def forward_selection(variables, train_model, score_model, verbose=True):
    """ Variable selection using forward selection

    Input:
        variables: complete list of variables to consider in model building
        train_model: function that returns a fitted model for a given set of variables
        score_model: function that returns the score of a model; better models have lower scores

    Returns:
        (best_model, best_variables)
    """
    # start from the empty (constant-only) model and add one variable per round
    best_variables = []
    best_model = train_model(best_variables)
    best_score = score_model(best_model, best_variables)
    if verbose:
        print('Variables: ' + ', '.join(variables))
        print('Start: score={:.2f}, constant'.format(best_score))
    while True:
        # candidates are (score, added variable, model); keeping the current
        # model corresponds to adding nothing (None)
        candidates = [(best_score, None, best_model)]
        for candidate in variables:
            if candidate in best_variables:
                continue
            extended = list(best_variables)
            extended.append(candidate)
            candidate_model = train_model(extended)
            candidates.append((score_model(candidate_model, extended), candidate, candidate_model))

        # the first minimum is the candidate that improved most; ties resolve
        # the same way as a stable sort followed by taking the first element
        best_score, added, best_model = min(candidates, key=lambda entry: entry[0])
        if verbose:
            print('Step: score={:.2f}, add {}'.format(best_score, added))
        if added is None:
            # stop here, as adding more variables is detrimental to performance
            break
        best_variables.append(added)
    return best_model, best_variables
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def stepwise_selection(variables, train_model, score_model, direction='both', verbose=True):
    """ Variable selection using forward and/or backward selection

    Input:
        variables: complete list of variables to consider in model building
        train_model: function that returns a fitted model for a given set of variables
        score_model: function that returns the score of a model; better models have lower scores
        direction: use it to limit stepwise selection to either 'forward' or 'backward'
        verbose: print progress information for each step

    Returns:
        (best_model, best_variables)
    """
    FORWARD = 'forward'
    BACKWARD = 'backward'
    directions = [FORWARD, BACKWARD]
    if direction.lower() == FORWARD:
        directions = [FORWARD]
    if direction.lower() == BACKWARD:
        directions = [BACKWARD]

    # forward/stepwise starts from the empty model; pure backward starts from the full model
    # (consistency fix: use the FORWARD/BACKWARD constants instead of string literals)
    best_variables = [] if FORWARD in directions else list(variables)
    best_model = train_model(best_variables)
    best_score = score_model(best_model, best_variables)
    if verbose:
        print('Variables: ' + ', '.join(variables))
        print('Start: score={:.2f}, constant'.format(best_score))

    while True:
        # each candidate is (score, variable, model, action); keeping the
        # current model corresponds to the 'unchanged' action
        step = [(best_score, None, best_model, 'unchanged')]
        if FORWARD in directions:
            for variable in variables:
                if variable in best_variables:
                    continue
                step_var = list(best_variables)
                step_var.append(variable)
                step_model = train_model(step_var)
                step_score = score_model(step_model, step_var)
                step.append((step_score, variable, step_model, 'add'))

        if BACKWARD in directions:
            for variable in best_variables:
                step_var = list(best_variables)
                step_var.remove(variable)
                step_model = train_model(step_var)
                step_score = score_model(step_model, step_var)
                step.append((step_score, variable, step_model, 'remove'))

        # sort by ascending score
        step.sort(key=lambda x: x[0])

        # the first entry is the model with the lowest score; 'action' renamed
        # from 'direction' to avoid shadowing the parameter
        best_score, chosen_variable, best_model, action = step[0]
        if verbose:
            print('Step: score={:.2f}, {} {}'.format(
                best_score, action, chosen_variable))
        if chosen_variable is None:
            # stop here, as adding or removing more variables is detrimental to performance
            break
        if action == 'add':
            best_variables.append(chosen_variable)
        else:
            best_variables.remove(chosen_variable)
    return best_model, best_variables
|