mlba 1.9.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. mlba-1.9.10/LICENSE +21 -0
  2. mlba-1.9.10/PKG-INFO +35 -0
  3. mlba-1.9.10/README.md +20 -0
  4. mlba-1.9.10/setup.cfg +4 -0
  5. mlba-1.9.10/setup.py +47 -0
  6. mlba-1.9.10/src/mlba/__init__.py +20 -0
  7. mlba-1.9.10/src/mlba/csvFiles/Airfares.csv.gz +0 -0
  8. mlba-1.9.10/src/mlba/csvFiles/Amtrak.csv.gz +0 -0
  9. mlba-1.9.10/src/mlba/csvFiles/ApplianceShipments.csv.gz +0 -0
  10. mlba-1.9.10/src/mlba/csvFiles/AustralianWines.csv.gz +0 -0
  11. mlba-1.9.10/src/mlba/csvFiles/AutoAndElectronics.zip +0 -0
  12. mlba-1.9.10/src/mlba/csvFiles/Bankruptcy.csv.gz +0 -0
  13. mlba-1.9.10/src/mlba/csvFiles/BareggTunnel.csv.gz +0 -0
  14. mlba-1.9.10/src/mlba/csvFiles/BathSoapHousehold.csv.gz +0 -0
  15. mlba-1.9.10/src/mlba/csvFiles/BostonHousing.csv.gz +0 -0
  16. mlba-1.9.10/src/mlba/csvFiles/CanadianWorkHours.csv.gz +0 -0
  17. mlba-1.9.10/src/mlba/csvFiles/CatalogCrossSell.csv.gz +0 -0
  18. mlba-1.9.10/src/mlba/csvFiles/Cereals.csv.gz +0 -0
  19. mlba-1.9.10/src/mlba/csvFiles/CharlesBookClub.csv.gz +0 -0
  20. mlba-1.9.10/src/mlba/csvFiles/Cosmetics.csv.gz +0 -0
  21. mlba-1.9.10/src/mlba/csvFiles/Coursetopics.csv.gz +0 -0
  22. mlba-1.9.10/src/mlba/csvFiles/DepartmentStoreSales.csv.gz +0 -0
  23. mlba-1.9.10/src/mlba/csvFiles/EastWestAirlinesCluster.csv.gz +0 -0
  24. mlba-1.9.10/src/mlba/csvFiles/EastWestAirlinesNN.csv.gz +0 -0
  25. mlba-1.9.10/src/mlba/csvFiles/EbayTreemap.csv.gz +0 -0
  26. mlba-1.9.10/src/mlba/csvFiles/Faceplate.csv.gz +0 -0
  27. mlba-1.9.10/src/mlba/csvFiles/FlightDelays.csv.gz +0 -0
  28. mlba-1.9.10/src/mlba/csvFiles/Fundraising.csv.gz +0 -0
  29. mlba-1.9.10/src/mlba/csvFiles/FutureFundraising.csv.gz +0 -0
  30. mlba-1.9.10/src/mlba/csvFiles/GermanCredit.csv.gz +0 -0
  31. mlba-1.9.10/src/mlba/csvFiles/Hair-Care-Product.csv.gz +0 -0
  32. mlba-1.9.10/src/mlba/csvFiles/LaptopSales.csv.gz +0 -0
  33. mlba-1.9.10/src/mlba/csvFiles/LaptopSalesJanuary2008.csv.gz +0 -0
  34. mlba-1.9.10/src/mlba/csvFiles/NYPD_Motor_Vehicle_Collisions_1000.csv.gz +0 -0
  35. mlba-1.9.10/src/mlba/csvFiles/NaturalGasSales.csv.gz +0 -0
  36. mlba-1.9.10/src/mlba/csvFiles/Pharmaceuticals.csv.gz +0 -0
  37. mlba-1.9.10/src/mlba/csvFiles/RidingMowers.csv.gz +0 -0
  38. mlba-1.9.10/src/mlba/csvFiles/SC-US-students-GPS-data-2016.csv.gz +0 -0
  39. mlba-1.9.10/src/mlba/csvFiles/SP500.csv.gz +0 -0
  40. mlba-1.9.10/src/mlba/csvFiles/Sept11Travel.csv.gz +0 -0
  41. mlba-1.9.10/src/mlba/csvFiles/ShampooSales.csv.gz +0 -0
  42. mlba-1.9.10/src/mlba/csvFiles/SouvenirSales.csv.gz +0 -0
  43. mlba-1.9.10/src/mlba/csvFiles/Spambase.csv.gz +0 -0
  44. mlba-1.9.10/src/mlba/csvFiles/SystemAdministrators.csv.gz +0 -0
  45. mlba-1.9.10/src/mlba/csvFiles/Taxi-cancellation-case.csv.gz +0 -0
  46. mlba-1.9.10/src/mlba/csvFiles/Tayko.csv.gz +0 -0
  47. mlba-1.9.10/src/mlba/csvFiles/TinyData.csv.gz +0 -0
  48. mlba-1.9.10/src/mlba/csvFiles/ToyotaCorolla.csv.gz +0 -0
  49. mlba-1.9.10/src/mlba/csvFiles/ToysRUsRevenues.csv.gz +0 -0
  50. mlba-1.9.10/src/mlba/csvFiles/UniversalBank.csv.gz +0 -0
  51. mlba-1.9.10/src/mlba/csvFiles/Universities.csv.gz +0 -0
  52. mlba-1.9.10/src/mlba/csvFiles/Utilities.csv.gz +0 -0
  53. mlba-1.9.10/src/mlba/csvFiles/Veerhoven.csv.gz +0 -0
  54. mlba-1.9.10/src/mlba/csvFiles/Voter-Persuasion.csv.gz +0 -0
  55. mlba-1.9.10/src/mlba/csvFiles/WalMartStock.csv.gz +0 -0
  56. mlba-1.9.10/src/mlba/csvFiles/WestRoxbury.csv.gz +0 -0
  57. mlba-1.9.10/src/mlba/csvFiles/Wine.csv.gz +0 -0
  58. mlba-1.9.10/src/mlba/csvFiles/accidents.csv.gz +0 -0
  59. mlba-1.9.10/src/mlba/csvFiles/accidentsFull.csv.gz +0 -0
  60. mlba-1.9.10/src/mlba/csvFiles/accidentsnn.csv.gz +0 -0
  61. mlba-1.9.10/src/mlba/csvFiles/banks.csv.gz +0 -0
  62. mlba-1.9.10/src/mlba/csvFiles/bicup2006.csv.gz +0 -0
  63. mlba-1.9.10/src/mlba/csvFiles/courserating.csv.gz +0 -0
  64. mlba-1.9.10/src/mlba/csvFiles/drug.csv.gz +0 -0
  65. mlba-1.9.10/src/mlba/csvFiles/eBayAuctions.csv.gz +0 -0
  66. mlba-1.9.10/src/mlba/csvFiles/eBayNetwork.csv.gz +0 -0
  67. mlba-1.9.10/src/mlba/csvFiles/farm-ads.csv.gz +0 -0
  68. mlba-1.9.10/src/mlba/csvFiles/gdp.csv.gz +0 -0
  69. mlba-1.9.10/src/mlba/csvFiles/liftExample.csv.gz +0 -0
  70. mlba-1.9.10/src/mlba/csvFiles/ownerExample.csv.gz +0 -0
  71. mlba-1.9.10/src/mlba/data.py +32 -0
  72. mlba-1.9.10/src/mlba/featureSelection.py +191 -0
  73. mlba-1.9.10/src/mlba/graphs.py +173 -0
  74. mlba-1.9.10/src/mlba/metric.py +134 -0
  75. mlba-1.9.10/src/mlba/tests/__init__.py +0 -0
  76. mlba-1.9.10/src/mlba/tests/test_data.py +47 -0
  77. mlba-1.9.10/src/mlba/tests/test_graphs.py +45 -0
  78. mlba-1.9.10/src/mlba/tests/test_metric.py +135 -0
  79. mlba-1.9.10/src/mlba/tests/test_textMining.py +34 -0
  80. mlba-1.9.10/src/mlba/textMining.py +19 -0
  81. mlba-1.9.10/src/mlba/version.py +7 -0
  82. mlba-1.9.10/src/mlba.egg-info/PKG-INFO +35 -0
  83. mlba-1.9.10/src/mlba.egg-info/SOURCES.txt +83 -0
  84. mlba-1.9.10/src/mlba.egg-info/dependency_links.txt +1 -0
  85. mlba-1.9.10/src/mlba.egg-info/top_level.txt +1 -0
mlba-1.9.10/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) Galit Shmueli, Peter C. Bruce, Peter Gedeck
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
mlba-1.9.10/PKG-INFO ADDED
@@ -0,0 +1,35 @@
1
+ Metadata-Version: 2.1
2
+ Name: mlba
3
+ Version: 1.9.10
4
+ Summary: Utility functions for 'Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python'
5
+ Home-page: https://github.com/gedeck/mlba-python
6
+ Author: Peter Gedeck
7
+ Author-email: mail@petergedeck.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.6
10
+ Classifier: Programming Language :: Python :: 3.7
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+
16
+ # Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python
17
+
18
+ Collection of utility functions used in the book
19
+
20
+ > _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition)
21
+ > Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel
22
+ > (c) 2019 John Wiley & Sons, Inc.
23
+
24
+ Code for the book and the used datasets are available on https://www.dataminingbook.com/book/python-edition.
25
+
26
+
27
+ ## Installation
28
+ Use `pip` to install the `dmba` package from pypi (https://pypi.org/project/dmba/).
29
+ ```
30
+ pip install dmba
31
+ ```
32
+ Should this not work, for example when you are behind a firewall, download the package from pypi and install from file, e.g.
33
+ ```
34
+ pip install dmba-0.0.14.tar.gz
35
+ ```
mlba-1.9.10/README.md ADDED
@@ -0,0 +1,20 @@
1
+ # Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python
2
+
3
+ Collection of utility functions used in the book
4
+
5
+ > _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition)
6
+ > Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel
7
+ > (c) 2019 John Wiley & Sons, Inc.
8
+
9
+ Code for the book and the used datasets are available on https://www.dataminingbook.com/book/python-edition.
10
+
11
+
12
+ ## Installation
13
+ Use `pip` to install the `dmba` package from pypi (https://pypi.org/project/dmba/).
14
+ ```
15
+ pip install dmba
16
+ ```
17
+ Should this not work, for example when you are behind a firewall, download the package from pypi and install from file, e.g.
18
+ ```
19
+ pip install dmba-0.0.14.tar.gz
20
+ ```
mlba-1.9.10/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
mlba-1.9.10/setup.py ADDED
@@ -0,0 +1,47 @@
1
+ '''
2
+ Utility functions for "Data Mining for Business Analytics: Concepts, Techniques, and
3
+ Applications in Python"
4
+
5
+ (c) 2019, 2024 Galit Shmueli, Peter C. Bruce, Peter Gedeck
6
+ '''
7
+ from pathlib import Path
8
+
9
+ import setuptools
10
+
11
+
12
def getVersion():
    """Extract __version__ from src/mlba/version.py without importing the package.

    The version line is parsed textually so that building the sdist does not
    require the package's runtime dependencies to be installed.

    Raises IndexError (failing the build loudly) if no line containing
    '__version__' exists in version.py.
    """
    f = Path(__file__).parent / 'src' / 'mlba' / 'version.py'
    lines = f.read_text().split('\n')
    version = [s for s in lines if '__version__' in s][0]
    # strip both single and double quotes so either quoting style in
    # version.py works (the original only handled single quotes)
    version = version.split('=')[1].strip().strip('\'"')
    return version
18
+
19
+
20
# Reuse the README verbatim as the PyPI long description.
with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="mlba",
    version=getVersion(),
    author="Peter Gedeck",
    author_email="mail@petergedeck.com",
    description="Utility functions for 'Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python'",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/gedeck/mlba-python",
    # src-layout: code lives under src/, so tell setuptools where to look
    packages=setuptools.find_packages("src"),
    package_dir={'': 'src'},
    # bundle the compressed CSV datasets (and the one zip archive) with the package
    package_data={
        "mlba": ["csvFiles/*.csv.gz", "csvFiles/*.zip"],
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],

    # NOTE(review): nose is unmaintained and does not run on Python 3.10+;
    # consider migrating the test runner to pytest
    test_suite='nose.collector',
    tests_require=['nose'],
)
@@ -0,0 +1,20 @@
1
'''
Utility functions for "Data Mining for Business Analytics: Concepts, Techniques, and
Applications in Python"

(c) 2019, 2024 Galit Shmueli, Peter C. Bruce, Peter Gedeck
'''
import os
import matplotlib as mpl
from .version import __version__

# On headless non-Windows systems (no DISPLAY set or set to the empty string),
# fall back to the non-interactive Agg backend so importing the package does
# not fail where no GUI is available.
if os.environ.get('DISPLAY', '') == '' and os.name != 'nt':
    print('no display found. Using non-interactive Agg backend')
    mpl.use('Agg')

# The public API is imported only AFTER the backend is configured above —
# these submodules may import matplotlib.pyplot, which locks in the backend.
# Do not move these imports to the top of the file.
from .featureSelection import exhaustive_search, forward_selection, backward_elimination, stepwise_selection
from .graphs import plotDecisionTree, liftChart, gainsChart, textDecisionTree
from .metric import regressionSummary, classificationSummary
from .metric import AIC_score, BIC_score, adjusted_r2_score
from .textMining import printTermDocumentMatrix
from .data import load_data, get_data_file
Binary file
@@ -0,0 +1,32 @@
1
+ '''
2
+ Created on Jun 12, 2020
3
+
4
+ @author: gedeck
5
+ '''
6
+ from pathlib import Path
7
+
8
+ import pandas as pd
9
+
10
+
11
# Directory containing the gzip-compressed CSV files bundled with the package
DATA_DIR = Path(__file__).parent / 'csvFiles'


def load_data(name, **kwargs):
    """ Returns the data either as a Pandas data frame or series

    Input:
        name: dataset name, e.g. 'Amtrak' or 'Amtrak.csv.gz' (extensions are normalized)
        kwargs: passed through to pandas.read_csv

    Returns a Series when the file has a single column, a DataFrame otherwise.
    Raises ValueError if no matching data file is bundled with the package.
    """
    data_file = get_data_file(name)
    if not data_file.exists():
        # bug fix: the original lacked the f-prefix and raised the literal
        # string 'Data file {name} not found'
        raise ValueError(f'Data file {name} not found')
    data = pd.read_csv(data_file, **kwargs)
    if data.shape[1] == 1:
        # single-column datasets are more convenient as a Series
        return data[data.columns[0]]
    return data
23
+
24
+
25
def get_data_file(name):
    """Map a dataset name to the path of its bundled data file.

    Zip archives are returned under DATA_DIR unchanged; for everything else
    any trailing '.gz' and '.csv' extensions are stripped and the canonical
    '<name>.csv.gz' path is built.
    """
    if name.endswith('.zip'):
        return DATA_DIR / name
    basename = name
    # normalize: strip '.gz' first, then '.csv', so 'X', 'X.csv' and
    # 'X.csv.gz' all resolve to the same file
    for extension in ('.gz', '.csv'):
        if basename.endswith(extension):
            basename = basename[:-len(extension)]
    return DATA_DIR / f'{basename}.csv.gz'
@@ -0,0 +1,191 @@
1
+ '''
2
+ Utility functions for "Data Mining for Business Analytics: Concepts, Techniques, and
3
+ Applications in Python"
4
+
5
+ (c) 2019, 2024 Galit Shmueli, Peter C. Bruce, Peter Gedeck
6
+ '''
7
+ import itertools
8
+
9
+
10
def exhaustive_search(variables, train_model, score_model):
    """ Variable selection using exhaustive search over all variable subsets

    (The original docstring said "backward elimination" — a copy-paste error;
    this function enumerates every subset of each size.)

    Input:
        variables: complete list of variables to consider in model building
        train_model: function that returns a fitted model for a given set of variables
        score_model: function that returns the score of a model; better models have lower scores

    Returns:
        List of best subset models for increasing number of variables
    """
    # create models of increasing size and determine the best models in each case
    result = []
    for nvariables in range(1, len(variables) + 1):
        best_subset = None
        best_score = None
        best_model = None
        for subset in itertools.combinations(variables, nvariables):
            subset = list(subset)
            subset_model = train_model(subset)
            subset_score = score_model(subset_model, subset)
            # strict '>' means the first subset reaching the lowest score wins ties
            if best_subset is None or best_score > subset_score:
                best_subset = subset
                best_score = subset_score
                best_model = subset_model
        result.append({
            'n': nvariables,
            'variables': best_subset,
            'score': best_score,
            'model': best_model,
        })
    return result
42
+
43
+
44
def backward_elimination(variables, train_model, score_model, verbose=False):
    """ Variable selection using backward elimination

    Input:
        variables: complete list of variables to consider in model building
        train_model: function that returns a fitted model for a given set of variables
        score_model: function that returns the score of a model; better models have lower scores
        verbose: if True, print the score after each elimination step

    Returns:
        (best_model, best_variables)
    """
    # we start with a model that contains all variables
    best_variables = list(variables)
    best_model = train_model(best_variables)
    best_score = score_model(best_model, best_variables)
    if verbose:
        print('Variables: ' + ', '.join(variables))
        print('Start: score={:.2f}'.format(best_score))

    while len(best_variables) > 1:
        # candidate 0 is "remove nothing", keeping the current model in the running
        step = [(best_score, None, best_model)]
        for removeVar in best_variables:
            step_var = list(best_variables)
            step_var.remove(removeVar)
            step_model = train_model(step_var)
            step_score = score_model(step_model, step_var)
            step.append((step_score, removeVar, step_model))

        # sort by ascending score (key on score only — the other tuple
        # elements may not be comparable with each other)
        step.sort(key=lambda x: x[0])

        # the first entry is the model with the lowest score
        best_score, removed_step, best_model = step[0]
        if verbose:
            print('Step: score={:.2f}, remove {}'.format(
                best_score, removed_step))
        if removed_step is None:
            # stop here, as removing more variables is detrimental to performance
            break
        best_variables.remove(removed_step)
    return best_model, best_variables
85
+
86
+
87
def forward_selection(variables, train_model, score_model, verbose=True):
    """ Variable selection using forward selection

    Input:
        variables: complete list of variables to consider in model building
        train_model: function that returns a fitted model for a given set of variables
        score_model: function that returns the score of a model; better models have lower scores

    Returns:
        (best_model, best_variables)
    """
    # begin with the empty (constant-only) model
    selected = []
    current_model = train_model(selected)
    current_score = score_model(current_model, selected)
    if verbose:
        print('Variables: ' + ', '.join(variables))
        print('Start: score={:.2f}, constant'.format(current_score))
    while True:
        # the no-change candidate keeps the current model in the running
        candidates = [(current_score, None, current_model)]
        for candidate in variables:
            if candidate in selected:
                continue
            trial_vars = selected + [candidate]
            trial_model = train_model(trial_vars)
            trial_score = score_model(trial_model, trial_vars)
            candidates.append((trial_score, candidate, trial_model))
        # ascending by score; key on score only so tie-breaking stays stable
        candidates.sort(key=lambda entry: entry[0])

        # the first entry is now the candidate that improved the score most
        current_score, chosen, current_model = candidates[0]
        if verbose:
            print('Step: score={:.2f}, add {}'.format(current_score, chosen))
        if chosen is None:
            # no addition improves the score - stop
            break
        selected.append(chosen)
    return current_model, selected
126
+
127
+
128
def stepwise_selection(variables, train_model, score_model, direction='both', verbose=True):
    """ Variable selection using forward and/or backward selection

    Input:
        variables: complete list of variables to consider in model building
        train_model: function that returns a fitted model for a given set of variables
        score_model: function that returns the score of a model; better models have lower scores
        direction: use it to limit stepwise selection to either 'forward' or 'backward'
        verbose: if True, print the score and chosen operation after each step

    Returns:
        (best_model, best_variables)
    """
    FORWARD = 'forward'
    BACKWARD = 'backward'
    directions = [FORWARD, BACKWARD]
    if direction.lower() == FORWARD:
        directions = [FORWARD]
    if direction.lower() == BACKWARD:
        directions = [BACKWARD]

    # forward (or both): start from the empty model; backward only: start from the full model
    # (use the constants consistently — the original mixed FORWARD with the
    # literal strings 'forward'/'backward')
    best_variables = [] if FORWARD in directions else list(variables)
    best_model = train_model(best_variables)
    best_score = score_model(best_model, best_variables)
    if verbose:
        print('Variables: ' + ', '.join(variables))
        print('Start: score={:.2f}, constant'.format(best_score))

    while True:
        # candidate 0 is "change nothing", keeping the current model in the running
        step = [(best_score, None, best_model, 'unchanged')]
        if FORWARD in directions:
            for variable in variables:
                if variable in best_variables:
                    continue
                step_var = list(best_variables)
                step_var.append(variable)
                step_model = train_model(step_var)
                step_score = score_model(step_model, step_var)
                step.append((step_score, variable, step_model, 'add'))

        if BACKWARD in directions:
            for variable in best_variables:
                step_var = list(best_variables)
                step_var.remove(variable)
                step_model = train_model(step_var)
                step_score = score_model(step_model, step_var)
                step.append((step_score, variable, step_model, 'remove'))

        # sort by ascending score
        step.sort(key=lambda x: x[0])

        # the first entry is the model with the lowest score; unpack into
        # 'operation' instead of rebinding the 'direction' parameter
        best_score, chosen_variable, best_model, operation = step[0]
        if verbose:
            print('Step: score={:.2f}, {} {}'.format(
                best_score, operation, chosen_variable))
        if chosen_variable is None:
            # stop here, as adding or removing more variables is detrimental to performance
            break
        if operation == 'add':
            best_variables.append(chosen_variable)
        else:
            best_variables.remove(chosen_variable)
    return best_model, best_variables