scypyy 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scypyy-0.7.0/PKG-INFO +32 -0
- scypyy-0.7.0/README.md +18 -0
- scypyy-0.7.0/pyproject.toml +20 -0
- scypyy-0.7.0/scypyy/__init__.py +5 -0
- scypyy-0.7.0/scypyy/core.py +1109 -0
- scypyy-0.7.0/scypyy.egg-info/PKG-INFO +32 -0
- scypyy-0.7.0/scypyy.egg-info/SOURCES.txt +9 -0
- scypyy-0.7.0/scypyy.egg-info/dependency_links.txt +1 -0
- scypyy-0.7.0/scypyy.egg-info/top_level.txt +1 -0
- scypyy-0.7.0/setup.cfg +4 -0
- scypyy-0.7.0/setup.py +17 -0
scypyy-0.7.0/PKG-INFO
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scypyy
|
|
3
|
+
Version: 0.7.0
|
|
4
|
+
Summary: A curated collection.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Project-URL: Homepage, https://google.com
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Dynamic: requires-python
|
|
14
|
+
|
|
15
|
+
# scypyy
|
|
16
|
+
|
|
17
|
+
A curated collection.
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install scypyy
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Usage
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
import scypyy
|
|
28
|
+
|
|
29
|
+
print(scypyy.get())
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
This prints a help.
|
scypyy-0.7.0/README.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "scypyy"
|
|
7
|
+
version = "0.7.0"
|
|
8
|
+
description = "A curated collection."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.8"
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Programming Language :: Python :: 3",
|
|
14
|
+
"Operating System :: OS Independent",
|
|
15
|
+
"Intended Audience :: Science/Research",
|
|
16
|
+
"Topic :: Scientific/Engineering",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.urls]
|
|
20
|
+
Homepage = "https://google.com"
|
|
@@ -0,0 +1,1109 @@
|
|
|
1
|
+
"""Core module containing curated import strings and code templates by topic."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class _Get:
|
|
5
|
+
"""Namespace for categorized import and code helpers."""
|
|
6
|
+
|
|
7
|
+
def libs(self):
|
|
8
|
+
"""All commonly used imports for data science, ML, stats, and visualization."""
|
|
9
|
+
print("""\
|
|
10
|
+
import numpy as np
|
|
11
|
+
import pandas as pd
|
|
12
|
+
|
|
13
|
+
import matplotlib.pyplot as plt
|
|
14
|
+
import seaborn as sns
|
|
15
|
+
|
|
16
|
+
from scipy import stats
|
|
17
|
+
from scipy.stats import norm # Z-test (normal distribution)
|
|
18
|
+
from scipy.stats import t # t-test (t-distribution)
|
|
19
|
+
from scipy.stats import binom # binomial distribution
|
|
20
|
+
from scipy.stats import poisson # poisson distribution
|
|
21
|
+
from scipy.stats import bernoulli # bernoulli distribution
|
|
22
|
+
from scipy.optimize import minimize # optimization
|
|
23
|
+
from scipy.special import factorial # factorial function
|
|
24
|
+
from numpy import maximum # element-wise max
|
|
25
|
+
|
|
26
|
+
from statistics import stdev # sample standard deviation
|
|
27
|
+
|
|
28
|
+
import statsmodels.api as sm
|
|
29
|
+
import statsmodels.formula.api as smf # smf.ols(), smf.logit()
|
|
30
|
+
from statsmodels.formula.api import ols # for ANOVA
|
|
31
|
+
from statsmodels.stats.proportion import proportions_ztest # proportion z-test
|
|
32
|
+
from statsmodels.stats.outliers_influence import variance_inflation_factor # VIF
|
|
33
|
+
from statsmodels.tools.tools import add_constant
|
|
34
|
+
import statsmodels.stats.multicomp as mc # multiple comparison (Tukey)
|
|
35
|
+
from statsmodels.multivariate.manova import MANOVA # MANOVA
|
|
36
|
+
|
|
37
|
+
import sklearn.linear_model as lm
|
|
38
|
+
from sklearn.linear_model import LinearRegression
|
|
39
|
+
from sklearn.linear_model import Ridge, RidgeCV
|
|
40
|
+
from sklearn.linear_model import Lasso, LassoCV
|
|
41
|
+
from sklearn.linear_model import LogisticRegression
|
|
42
|
+
|
|
43
|
+
from sklearn import preprocessing
|
|
44
|
+
from sklearn.preprocessing import scale
|
|
45
|
+
from sklearn.preprocessing import StandardScaler
|
|
46
|
+
from sklearn.preprocessing import LabelEncoder
|
|
47
|
+
from sklearn.preprocessing import OneHotEncoder
|
|
48
|
+
|
|
49
|
+
from sklearn.model_selection import train_test_split
|
|
50
|
+
from sklearn.model_selection import KFold
|
|
51
|
+
from sklearn.model_selection import cross_val_score
|
|
52
|
+
|
|
53
|
+
from sklearn.metrics import mean_squared_error
|
|
54
|
+
from sklearn.metrics import confusion_matrix
|
|
55
|
+
from sklearn.metrics import accuracy_score
|
|
56
|
+
|
|
57
|
+
df = sns.load_dataset('dataset_Asked')""")
|
|
58
|
+
|
|
59
|
+
def hypothesis(self):
|
|
60
|
+
"""Hypothesis testing — all test types with examples."""
|
|
61
|
+
print("""\
|
|
62
|
+
## ============================================================
|
|
63
|
+
## ALL HYPOTHESIS TESTING CODES - LAB 4 TO LAB 7
|
|
64
|
+
## ============================================================
|
|
65
|
+
|
|
66
|
+
import numpy as np
|
|
67
|
+
from scipy import stats
|
|
68
|
+
from scipy.stats import norm
|
|
69
|
+
from scipy.stats import t
|
|
70
|
+
from statistics import stdev
|
|
71
|
+
from statsmodels.stats.proportion import proportions_ztest
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
## ============================================================
|
|
75
|
+
## 1. ONE-SAMPLE Z-TEST (TWO-SIDED)
|
|
76
|
+
## Population SD (sigma) KNOWN
|
|
77
|
+
## Keywords: "differ", "same", "equal", "changed"
|
|
78
|
+
## ============================================================
|
|
79
|
+
|
|
80
|
+
mu = 84
|
|
81
|
+
xbar = 81.5
|
|
82
|
+
sigma = 10
|
|
83
|
+
n = 75
|
|
84
|
+
alpha = 0.01
|
|
85
|
+
|
|
86
|
+
se = sigma / np.sqrt(n)
|
|
87
|
+
Z_cal = (xbar - mu) / se
|
|
88
|
+
|
|
89
|
+
Z_pos = norm.ppf(1 - alpha/2)
|
|
90
|
+
Z_neg = norm.ppf(alpha/2)
|
|
91
|
+
|
|
92
|
+
print("=== 1. One-Sample Z-Test (Two-Sided) ===")
|
|
93
|
+
print(f"Z_cal = {Z_cal}")
|
|
94
|
+
print(f"Z_table = [{Z_neg}, {Z_pos}]")
|
|
95
|
+
print(f"p-value = {norm.cdf(Z_cal)}")
|
|
96
|
+
print(f"CI = [{mu + Z_neg * se}, {mu + Z_pos * se}]")
|
|
97
|
+
# Accept H0 if |Z_cal| < |Z_table|
|
|
98
|
+
# Accept H0 if p-value > alpha
|
|
99
|
+
# Accept H0 if xbar inside CI
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
## ============================================================
|
|
103
|
+
## 2. ONE-SAMPLE Z-TEST (ONE-SIDED LEFT)
|
|
104
|
+
## Keywords: "less", "lower", "decreased"
|
|
105
|
+
## ============================================================
|
|
106
|
+
|
|
107
|
+
mu = 14500
|
|
108
|
+
xbar = 13000
|
|
109
|
+
sigma = 2100
|
|
110
|
+
n = 25
|
|
111
|
+
alpha = 0.01
|
|
112
|
+
|
|
113
|
+
se = sigma / np.sqrt(n)
|
|
114
|
+
Z_cal = (xbar - mu) / se
|
|
115
|
+
|
|
116
|
+
Z_neg = norm.ppf(alpha)
|
|
117
|
+
|
|
118
|
+
print("=== 2. One-Sample Z-Test (One-Sided Left) ===")
|
|
119
|
+
print(f"Z_cal = {Z_cal}")
|
|
120
|
+
print(f"Z_neg = {Z_neg}")
|
|
121
|
+
print(f"p-value = {norm.cdf(Z_cal)}")
|
|
122
|
+
print(f"Boundary = {mu + Z_neg * se}")
|
|
123
|
+
# Reject H0 if Z_cal < Z_neg
|
|
124
|
+
# Reject H0 if p-value < alpha
|
|
125
|
+
# Reject H0 if xbar < boundary
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
## ============================================================
|
|
129
|
+
## 3. ONE-SAMPLE Z-TEST (ONE-SIDED RIGHT)
|
|
130
|
+
## Keywords: "greater", "higher", "increased"
|
|
131
|
+
## ============================================================
|
|
132
|
+
|
|
133
|
+
mu = 50
|
|
134
|
+
xbar = 55
|
|
135
|
+
sigma = 10
|
|
136
|
+
n = 36
|
|
137
|
+
alpha = 0.05
|
|
138
|
+
|
|
139
|
+
se = sigma / np.sqrt(n)
|
|
140
|
+
Z_cal = (xbar - mu) / se
|
|
141
|
+
|
|
142
|
+
Z_pos = norm.ppf(1 - alpha)
|
|
143
|
+
|
|
144
|
+
print("=== 3. One-Sample Z-Test (One-Sided Right) ===")
|
|
145
|
+
print(f"Z_cal = {Z_cal}")
|
|
146
|
+
print(f"Z_pos = {Z_pos}")
|
|
147
|
+
print(f"p-value = {1 - norm.cdf(Z_cal)}")
|
|
148
|
+
print(f"Boundary = {mu + Z_pos * se}")
|
|
149
|
+
# Reject H0 if Z_cal > Z_pos
|
|
150
|
+
# Reject H0 if p-value < alpha
|
|
151
|
+
# Reject H0 if xbar > boundary
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
## ============================================================
|
|
155
|
+
## 4. ONE-SAMPLE t-TEST (TWO-SIDED)
|
|
156
|
+
## Population SD UNKNOWN, use sample SD
|
|
157
|
+
## Keywords: "differ", "same", "equal"
|
|
158
|
+
## ============================================================
|
|
159
|
+
|
|
160
|
+
mu = 100
|
|
161
|
+
xbar = 95.8
|
|
162
|
+
s = 17.5
|
|
163
|
+
n = 100
|
|
164
|
+
df = n - 1
|
|
165
|
+
alpha = 0.05
|
|
166
|
+
|
|
167
|
+
se = s / np.sqrt(n)
|
|
168
|
+
t_cal = (xbar - mu) / se
|
|
169
|
+
|
|
170
|
+
t_pos = t.ppf(1 - alpha/2, df)
|
|
171
|
+
t_neg = t.ppf(alpha/2, df)
|
|
172
|
+
|
|
173
|
+
print("=== 4. One-Sample t-Test (Two-Sided) ===")
|
|
174
|
+
print(f"t_cal = {t_cal}")
|
|
175
|
+
print(f"t_table = [{t_neg}, {t_pos}]")
|
|
176
|
+
print(f"p-value = {t.cdf(t_cal, df) * 2}")
|
|
177
|
+
print(f"CI = [{mu + t_neg * se}, {mu + t_pos * se}]")
|
|
178
|
+
# Reject H0 if |t_cal| > |t_table|
|
|
179
|
+
# Reject H0 if p-value < alpha
|
|
180
|
+
# Reject H0 if xbar outside CI
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
## ============================================================
|
|
184
|
+
## 5. ONE-SAMPLE t-TEST (ONE-SIDED LEFT)
|
|
185
|
+
## Keywords: "less", "lower", "decreased"
|
|
186
|
+
## ============================================================
|
|
187
|
+
|
|
188
|
+
mu = 100
|
|
189
|
+
xbar = 95.8
|
|
190
|
+
s = 17.5
|
|
191
|
+
n = 100
|
|
192
|
+
df = n - 1
|
|
193
|
+
alpha = 0.05
|
|
194
|
+
|
|
195
|
+
se = s / np.sqrt(n)
|
|
196
|
+
t_cal = (xbar - mu) / se
|
|
197
|
+
|
|
198
|
+
t_neg = t.ppf(alpha, df)
|
|
199
|
+
|
|
200
|
+
print("=== 5. One-Sample t-Test (One-Sided Left) ===")
|
|
201
|
+
print(f"t_cal = {t_cal}")
|
|
202
|
+
print(f"t_neg = {t_neg}")
|
|
203
|
+
print(f"p-value = {t.cdf(t_cal, df)}")
|
|
204
|
+
print(f"Boundary = {mu + t_neg * se}")
|
|
205
|
+
# Reject H0 if t_cal < t_neg
|
|
206
|
+
# Reject H0 if p-value < alpha
|
|
207
|
+
# Reject H0 if xbar < boundary
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
## ============================================================
|
|
211
|
+
## 6. ONE-SAMPLE t-TEST (ONE-SIDED RIGHT)
|
|
212
|
+
## Keywords: "greater", "higher", "increased"
|
|
213
|
+
## ============================================================
|
|
214
|
+
|
|
215
|
+
mu = 100
|
|
216
|
+
xbar = 105
|
|
217
|
+
s = 17.5
|
|
218
|
+
n = 100
|
|
219
|
+
df = n - 1
|
|
220
|
+
alpha = 0.05
|
|
221
|
+
|
|
222
|
+
se = s / np.sqrt(n)
|
|
223
|
+
t_cal = (xbar - mu) / se
|
|
224
|
+
|
|
225
|
+
t_pos = t.ppf(1 - alpha, df)
|
|
226
|
+
|
|
227
|
+
print("=== 6. One-Sample t-Test (One-Sided Right) ===")
|
|
228
|
+
print(f"t_cal = {t_cal}")
|
|
229
|
+
print(f"t_pos = {t_pos}")
|
|
230
|
+
print(f"p-value = {1 - t.cdf(t_cal, df)}")
|
|
231
|
+
print(f"Boundary = {mu + t_pos * se}")
|
|
232
|
+
# Reject H0 if t_cal > t_pos
|
|
233
|
+
# Reject H0 if p-value < alpha
|
|
234
|
+
# Reject H0 if xbar > boundary
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
## ============================================================
|
|
238
|
+
## 7. ONE-SAMPLE t-TEST USING BUILT-IN (ttest_1samp)
|
|
239
|
+
## When you have RAW DATA (not xbar and s)
|
|
240
|
+
## ============================================================
|
|
241
|
+
|
|
242
|
+
X = np.array([10, 12, 13, 14, 15, 2, 7, 8])
|
|
243
|
+
|
|
244
|
+
print("=== 7. ttest_1samp (Built-in) ===")
|
|
245
|
+
print("Two-sided:", stats.ttest_1samp(X, popmean=12, alternative='two-sided'))
|
|
246
|
+
print("Left: ", stats.ttest_1samp(X, popmean=12, alternative='less'))
|
|
247
|
+
print("Right: ", stats.ttest_1samp(X, popmean=12, alternative='greater'))
|
|
248
|
+
# Reject H0 if pvalue < alpha
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
## ============================================================
|
|
252
|
+
## 8. ONE-SAMPLE PROPORTION Z-TEST (TWO-SIDED)
|
|
253
|
+
## Keywords: "differ", "same", "equal"
|
|
254
|
+
## ============================================================
|
|
255
|
+
|
|
256
|
+
p = 0.15
|
|
257
|
+
pbar = 22 / 120
|
|
258
|
+
n = 120
|
|
259
|
+
alpha = 0.02
|
|
260
|
+
|
|
261
|
+
se = np.sqrt((p * (1 - p)) / n)
|
|
262
|
+
Z_cal = (pbar - p) / se
|
|
263
|
+
|
|
264
|
+
Z_pos = norm.ppf(1 - alpha/2)
|
|
265
|
+
Z_neg = norm.ppf(alpha/2)
|
|
266
|
+
|
|
267
|
+
print("=== 8. One-Sample Proportion Z-Test (Two-Sided) ===")
|
|
268
|
+
print(f"Z_cal = {Z_cal}")
|
|
269
|
+
print(f"Z_table = [{Z_neg}, {Z_pos}]")
|
|
270
|
+
print(f"p-value = {norm.cdf(Z_cal)}")
|
|
271
|
+
print(f"CI = [{p + Z_neg * se}, {p + Z_pos * se}]")
|
|
272
|
+
# Accept H0 if |Z_cal| < |Z_table|
|
|
273
|
+
# Accept H0 if p-value > alpha
|
|
274
|
+
# Accept H0 if pbar inside CI
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
## ============================================================
|
|
278
|
+
## 9. ONE-SAMPLE PROPORTION Z-TEST (ONE-SIDED RIGHT)
|
|
279
|
+
## Keywords: "increased", "greater", "more"
|
|
280
|
+
## ============================================================
|
|
281
|
+
|
|
282
|
+
p = 0.05
|
|
283
|
+
pbar = 335 / 6000
|
|
284
|
+
n = 6000
|
|
285
|
+
alpha = 0.02
|
|
286
|
+
|
|
287
|
+
se = np.sqrt((p * (1 - p)) / n)
|
|
288
|
+
Z_cal = (pbar - p) / se
|
|
289
|
+
|
|
290
|
+
Z_pos = norm.ppf(1 - alpha)
|
|
291
|
+
|
|
292
|
+
print("=== 9. One-Sample Proportion Z-Test (One-Sided Right) ===")
|
|
293
|
+
print(f"Z_cal = {Z_cal}")
|
|
294
|
+
print(f"Z_pos = {Z_pos}")
|
|
295
|
+
print(f"p-value = {1 - norm.cdf(Z_cal)}")
|
|
296
|
+
print(f"Boundary = {p + Z_pos * se}")
|
|
297
|
+
# Reject H0 if Z_cal > Z_pos
|
|
298
|
+
# Reject H0 if p-value < alpha
|
|
299
|
+
# Reject H0 if pbar > boundary
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
## ============================================================
|
|
303
|
+
## 10. ONE-SAMPLE PROPORTION USING BUILT-IN (proportions_ztest)
|
|
304
|
+
## ============================================================
|
|
305
|
+
|
|
306
|
+
print("=== 10. proportions_ztest (Built-in) ===")
|
|
307
|
+
print("Two-sided:", proportions_ztest(count=22, nobs=120, value=0.15, alternative='two-sided'))
|
|
308
|
+
print("Right: ", proportions_ztest(count=335, nobs=6000, value=0.05, alternative='larger'))
|
|
309
|
+
print("Left: ", proportions_ztest(count=22, nobs=120, value=0.15, alternative='smaller'))
|
|
310
|
+
# alternative: 'two-sided', 'smaller', 'larger'
|
|
311
|
+
# Reject H0 if pvalue < alpha
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
## ============================================================
|
|
315
|
+
## 11. TWO-SAMPLE Z-TEST FOR MEANS (TWO-SIDED)
|
|
316
|
+
## Both sigma known
|
|
317
|
+
## Keywords: "differ", "same", "equal"
|
|
318
|
+
## ============================================================
|
|
319
|
+
|
|
320
|
+
xbar1, xbar2 = 86, 82
|
|
321
|
+
sigma1, sigma2 = 6, 9
|
|
322
|
+
n1, n2 = 60, 75
|
|
323
|
+
alpha = 0.01
|
|
324
|
+
|
|
325
|
+
se = np.sqrt(sigma1**2/n1 + sigma2**2/n2)
|
|
326
|
+
Z_cal = ((xbar1 - xbar2) - 0) / se
|
|
327
|
+
|
|
328
|
+
Z_pos = norm.ppf(1 - alpha/2)
|
|
329
|
+
Z_neg = norm.ppf(alpha/2)
|
|
330
|
+
|
|
331
|
+
print("=== 11. Two-Sample Z-Test Means (Two-Sided) ===")
|
|
332
|
+
print(f"Z_cal = {Z_cal}")
|
|
333
|
+
print(f"Z_table = [{Z_neg}, {Z_pos}]")
|
|
334
|
+
print(f"p-value = {(1 - norm.cdf(Z_cal)) * 2}")
|
|
335
|
+
print(f"CI = [{0 + Z_neg * se}, {0 + Z_pos * se}]")
|
|
336
|
+
# Reject H0 if |Z_cal| > |Z_table|
|
|
337
|
+
# Reject H0 if p-value < alpha
|
|
338
|
+
# Reject H0 if (xbar1-xbar2) outside CI
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
## ============================================================
|
|
342
|
+
## 12. TWO-SAMPLE Z-TEST FOR MEANS (ONE-SIDED RIGHT)
|
|
343
|
+
## Keywords: "greater", "higher", "increased"
|
|
344
|
+
## ============================================================
|
|
345
|
+
|
|
346
|
+
xbar1, xbar2 = 13.8, 9.1
|
|
347
|
+
sigma1, sigma2 = 18.9, 8.7
|
|
348
|
+
n1, n2 = 41, 35
|
|
349
|
+
alpha = 0.10
|
|
350
|
+
|
|
351
|
+
se = np.sqrt(sigma1**2/n1 + sigma2**2/n2)
|
|
352
|
+
Z_cal = ((xbar1 - xbar2) - 0) / se
|
|
353
|
+
|
|
354
|
+
Z_pos = norm.ppf(1 - alpha)
|
|
355
|
+
|
|
356
|
+
print("=== 12. Two-Sample Z-Test Means (One-Sided Right) ===")
|
|
357
|
+
print(f"Z_cal = {Z_cal}")
|
|
358
|
+
print(f"Z_pos = {Z_pos}")
|
|
359
|
+
print(f"p-value = {1 - norm.cdf(Z_cal)}")
|
|
360
|
+
print(f"Boundary = {0 + Z_pos * se}")
|
|
361
|
+
# Reject H0 if Z_cal > Z_pos
|
|
362
|
+
# Reject H0 if p-value < alpha
|
|
363
|
+
# Reject H0 if (xbar1-xbar2) > boundary
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
## ============================================================
|
|
367
|
+
## 13. TWO-SAMPLE t-TEST EQUAL VARIANCE (TWO-SIDED)
|
|
368
|
+
## sigma unknown, use pooled SD
|
|
369
|
+
## Keywords: "differ", "same", "equal"
|
|
370
|
+
## ============================================================
|
|
371
|
+
|
|
372
|
+
X = np.array([10, 12, 13, 14, 15, 2, 7, 8])
|
|
373
|
+
# Using built-in
|
|
374
|
+
print("=== 13. Two-Sample t-Test Equal Var (Built-in) ===")
|
|
375
|
+
a = np.array([56, 128.6, 12, 123.8, 64.34, 78, 763.3])
|
|
376
|
+
b = np.array([1.1, 2.9, 4.2])
|
|
377
|
+
print("Equal var: ", stats.ttest_ind(a, b, equal_var=True, alternative='two-sided'))
|
|
378
|
+
print("Unequal var:", stats.ttest_ind(a, b, equal_var=False, alternative='two-sided'))
|
|
379
|
+
# Reject H0 if pvalue < alpha
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
## ============================================================
|
|
383
|
+
## 14. TWO-SAMPLE t-TEST EQUAL VARIANCE (MANUAL - ONE-SIDED)
|
|
384
|
+
## Pooled SD formula
|
|
385
|
+
## ============================================================
|
|
386
|
+
|
|
387
|
+
Bus = np.array([2.86, 2.77, 3.18, 2.80, 3.14, 2.87, 3.19, 3.24, 2.91, 3, 2.83])
|
|
388
|
+
As = np.array([3.35, 3.32, 3.36, 3.63, 3.41, 3.37, 3.45, 3.43, 3.44, 3.17, 3.26, 3.18, 3.41])
|
|
389
|
+
|
|
390
|
+
xbar1 = np.mean(Bus)
|
|
391
|
+
xbar2 = np.mean(As)
|
|
392
|
+
s1 = stdev(Bus)
|
|
393
|
+
s2 = stdev(As)
|
|
394
|
+
n1 = len(Bus)
|
|
395
|
+
n2 = len(As)
|
|
396
|
+
alpha = 0.02
|
|
397
|
+
hypothesized_diff = -0.25
|
|
398
|
+
|
|
399
|
+
# Pooled SD
|
|
400
|
+
sp = np.sqrt(((n1-1)*s1**2 + (n2-1)*s2**2) / (n1 + n2 - 2))
|
|
401
|
+
se = sp * np.sqrt(1/n1 + 1/n2)
|
|
402
|
+
df = n1 + n2 - 2
|
|
403
|
+
|
|
404
|
+
t_cal = ((xbar1 - xbar2) - hypothesized_diff) / se
|
|
405
|
+
t_neg = t.ppf(alpha, df)
|
|
406
|
+
|
|
407
|
+
print("=== 14. Two-Sample t-Test Pooled (One-Sided Left) ===")
|
|
408
|
+
print(f"Pooled SD = {sp}")
|
|
409
|
+
print(f"SE = {se}")
|
|
410
|
+
print(f"t_cal = {t_cal}")
|
|
411
|
+
print(f"t_neg = {t_neg}")
|
|
412
|
+
print(f"df = {df}")
|
|
413
|
+
print(f"p-value = {t.cdf(t_cal, df)}")
|
|
414
|
+
print(f"Boundary = {hypothesized_diff + t_neg * se}")
|
|
415
|
+
# Reject H0 if t_cal < t_neg
|
|
416
|
+
# Reject H0 if p-value < alpha
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
## ============================================================
|
|
420
|
+
## 15. TWO-SAMPLE t-TEST UNEQUAL VARIANCE (MANUAL - ONE-SIDED)
|
|
421
|
+
## ============================================================
|
|
422
|
+
|
|
423
|
+
xbar1, xbar2 = 27.2, 32.4
|
|
424
|
+
s1, s2 = 3.8, 4.3
|
|
425
|
+
n1, n2 = 12, 9
|
|
426
|
+
alpha = 0.01
|
|
427
|
+
|
|
428
|
+
sp = np.sqrt(((n1-1)*s1**2 + (n2-1)*s2**2) / (n1 + n2 - 2))
|
|
429
|
+
se = sp * np.sqrt(1/n1 + 1/n2)
|
|
430
|
+
df = n1 + n2 - 2
|
|
431
|
+
|
|
432
|
+
t_cal = ((xbar1 - xbar2) - 0) / se
|
|
433
|
+
t_neg = t.ppf(alpha, df)
|
|
434
|
+
|
|
435
|
+
print("=== 15. Two-Sample t-Test Unequal Var (One-Sided Left) ===")
|
|
436
|
+
print(f"t_cal = {t_cal}")
|
|
437
|
+
print(f"t_neg = {t_neg}")
|
|
438
|
+
print(f"p-value = {t.cdf(t_cal, df)}")
|
|
439
|
+
print(f"Boundary = {0 + t_neg * se}")
|
|
440
|
+
# Reject H0 if t_cal < t_neg
|
|
441
|
+
# Reject H0 if p-value < alpha
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
## ============================================================
|
|
445
|
+
## 16. TWO-SAMPLE t-TEST USING BUILT-IN (ttest_ind)
|
|
446
|
+
## ============================================================
|
|
447
|
+
|
|
448
|
+
a = np.array([56, 128.6, 12, 123.8, 64.34, 78, 763.3])
|
|
449
|
+
b = np.array([1.1, 2.9, 4.2])
|
|
450
|
+
|
|
451
|
+
print("=== 16. ttest_ind (Built-in) ===")
|
|
452
|
+
print("Two-sided equal: ", stats.ttest_ind(a, b, equal_var=True, alternative='two-sided'))
|
|
453
|
+
print("Two-sided unequal:", stats.ttest_ind(a, b, equal_var=False, alternative='two-sided'))
|
|
454
|
+
print("Left equal: ", stats.ttest_ind(a, b, equal_var=True, alternative='less'))
|
|
455
|
+
print("Right equal: ", stats.ttest_ind(a, b, equal_var=True, alternative='greater'))
|
|
456
|
+
# equal_var=True --> pooled (df = n1+n2-2)
|
|
457
|
+
# equal_var=False --> Welch's (fractional df)
|
|
458
|
+
# alternative: 'two-sided', 'less', 'greater'
|
|
459
|
+
# Reject H0 if pvalue < alpha
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
## ============================================================
|
|
463
|
+
## 17. TWO-SAMPLE PROPORTION Z-TEST (ONE-SIDED LEFT)
|
|
464
|
+
## Keywords: "increased" but p1(old) - p2(new) < 0
|
|
465
|
+
## ============================================================
|
|
466
|
+
|
|
467
|
+
n1, n2 = 400, 380
|
|
468
|
+
p1 = 166 / 400
|
|
469
|
+
p2 = 205 / 380
|
|
470
|
+
alpha = 0.01
|
|
471
|
+
|
|
472
|
+
phat = (n1*p1 + n2*p2) / (n1 + n2)
|
|
473
|
+
se = np.sqrt(phat * (1 - phat) * (1/n1 + 1/n2))
|
|
474
|
+
Z_cal = ((p1 - p2) - 0) / se
|
|
475
|
+
|
|
476
|
+
Z_neg = norm.ppf(alpha)
|
|
477
|
+
|
|
478
|
+
print("=== 17. Two-Sample Proportion Z-Test (One-Sided Left) ===")
|
|
479
|
+
print(f"Pooled proportion = {phat}")
|
|
480
|
+
print(f"Z_cal = {Z_cal}")
|
|
481
|
+
print(f"Z_neg = {Z_neg}")
|
|
482
|
+
print(f"p-value = {norm.cdf(Z_cal)}")
|
|
483
|
+
print(f"Boundary = {0 + Z_neg * se}")
|
|
484
|
+
# Reject H0 if Z_cal < Z_neg
|
|
485
|
+
# Reject H0 if p-value < alpha
|
|
486
|
+
# Reject H0 if (p1-p2) < boundary
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
## ============================================================
|
|
490
|
+
## 18. TWO-SAMPLE PROPORTION USING BUILT-IN (proportions_ztest)
|
|
491
|
+
## ============================================================
|
|
492
|
+
|
|
493
|
+
count = np.array([166, 205])
|
|
494
|
+
nobs = np.array([400, 380])
|
|
495
|
+
|
|
496
|
+
print("=== 18. Two-Sample proportions_ztest (Built-in) ===")
|
|
497
|
+
stat, pval = proportions_ztest(count, nobs)
|
|
498
|
+
print(f"Two-sided: stat={stat:.4f}, pval={pval:.4f}")
|
|
499
|
+
# Reject H0 if pval < alpha
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
## ============================================================
|
|
503
|
+
## 19. PAIRED t-TEST (DEPENDENT SAMPLES) - MANUAL
|
|
504
|
+
## Same subjects measured twice (before/after)
|
|
505
|
+
## ============================================================
|
|
506
|
+
|
|
507
|
+
a = np.array([56, 128, 12, 123, 64, 78, 763])
|
|
508
|
+
b = np.array([46, 100, 5, 121, 54, 80, 700])
|
|
509
|
+
|
|
510
|
+
c = a - b
|
|
511
|
+
xbar = np.mean(c)
|
|
512
|
+
sample_sd = stdev(c)
|
|
513
|
+
n = len(c)
|
|
514
|
+
df = n - 1
|
|
515
|
+
alpha = 0.05
|
|
516
|
+
|
|
517
|
+
se = sample_sd / np.sqrt(n)
|
|
518
|
+
t_cal = (xbar - 0) / se
|
|
519
|
+
|
|
520
|
+
t_pos = t.ppf(1 - alpha/2, df)
|
|
521
|
+
t_neg = t.ppf(alpha/2, df)
|
|
522
|
+
|
|
523
|
+
print("=== 19. Paired t-Test Manual (Two-Sided) ===")
|
|
524
|
+
print(f"Differences = {list(c)}")
|
|
525
|
+
print(f"Mean diff = {xbar}")
|
|
526
|
+
print(f"t_cal = {t_cal}")
|
|
527
|
+
print(f"t_table = [{t_neg}, {t_pos}]")
|
|
528
|
+
print(f"p-value = {(1 - t.cdf(t_cal, df)) * 2}")
|
|
529
|
+
# Accept H0 if |t_cal| < |t_table|
|
|
530
|
+
# Accept H0 if p-value > alpha
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
## ============================================================
|
|
534
|
+
## 20. PAIRED t-TEST USING BUILT-IN (ttest_rel)
|
|
535
|
+
## ============================================================
|
|
536
|
+
|
|
537
|
+
a = np.array([56, 128, 12, 123, 64, 78, 763])
|
|
538
|
+
b = np.array([46, 100, 5, 121, 54, 80, 700])
|
|
539
|
+
|
|
540
|
+
print("=== 20. ttest_rel (Built-in) ===")
|
|
541
|
+
print("Two-sided:", stats.ttest_rel(a, b, alternative='two-sided'))
|
|
542
|
+
print("Greater: ", stats.ttest_rel(a, b, alternative='greater'))
|
|
543
|
+
print("Less: ", stats.ttest_rel(a, b, alternative='less'))
|
|
544
|
+
# alternative: 'two-sided', 'less', 'greater'
|
|
545
|
+
# Reject H0 if pvalue < alpha""")
|
|
546
|
+
|
|
547
|
+
def mle(self):
|
|
548
|
+
"""Maximum likelihood estimation — Normal, Binomial, Poisson."""
|
|
549
|
+
print("""\
|
|
550
|
+
import numpy as np
|
|
551
|
+
from scipy.stats import norm, poisson
|
|
552
|
+
from scipy.optimize import minimize
|
|
553
|
+
from math import factorial
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
## ============================================================
|
|
557
|
+
## 1. MLE - NORMAL DISTRIBUTION
|
|
558
|
+
## ============================================================
|
|
559
|
+
|
|
560
|
+
def normal_dist(params,data):
|
|
561
|
+
mu, sd = params
|
|
562
|
+
log1 = norm.logpdf(data, mu, sd)
|
|
563
|
+
lll = -np.sum(log1)
|
|
564
|
+
return lll
|
|
565
|
+
|
|
566
|
+
initial_guess = [10, 3]
|
|
567
|
+
result = minimize(nll, initial_guess, args=(data,), bounds=((None, None), (1e-5, None)))
|
|
568
|
+
print("mean", result.x[0])
|
|
569
|
+
print("sd", result.x[1])
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
## ============================================================
|
|
573
|
+
## 2. MLE - BINOMIAL DISTRIBUTION
|
|
574
|
+
## ============================================================
|
|
575
|
+
|
|
576
|
+
hospital_data = [
|
|
577
|
+
(100, 85),
|
|
578
|
+
(150, 120),
|
|
579
|
+
(200, 160),
|
|
580
|
+
(250, 210),
|
|
581
|
+
(300, 260)
|
|
582
|
+
]
|
|
583
|
+
|
|
584
|
+
inta = [0.1]
|
|
585
|
+
def binomial_dist(p,n,k):
|
|
586
|
+
a1=factorial(n)/((factorial(n-k)*factorial(k)))
|
|
587
|
+
a2=(p**k)*((1-p)**(n-k))
|
|
588
|
+
ll = a1*a2
|
|
589
|
+
logll = np.log(ll)
|
|
590
|
+
return -logll
|
|
591
|
+
for n, k in hospital_data:
|
|
592
|
+
result = minimize(BLL, inta, args=(n,k), bounds=[(0.00005, 0.99995)])
|
|
593
|
+
print("prob", result.x[0])
|
|
594
|
+
|
|
595
|
+
|
|
596
|
+
## ============================================================
|
|
597
|
+
## 3. MLE - POISSON DISTRIBUTION
|
|
598
|
+
## ============================================================
|
|
599
|
+
|
|
600
|
+
def poisson_dist(lam, x):
|
|
601
|
+
log=[]
|
|
602
|
+
for i in x:
|
|
603
|
+
logs = np.log(poisson.pmf(x[i], lam))
|
|
604
|
+
log.append(logs)
|
|
605
|
+
return np.sum(log)
|
|
606
|
+
lams = np.arange(0,12,1)
|
|
607
|
+
ls = []
|
|
608
|
+
for lam in lams:
|
|
609
|
+
lsa = pd(lam, data)
|
|
610
|
+
ls.append(lsa)
|
|
611
|
+
bestll = lams[np.argmax(ls)]
|
|
612
|
+
print(bestll)""")
|
|
613
|
+
|
|
614
|
+
def nonpara(self):
|
|
615
|
+
"""Non-parametric tests — Mann-Whitney U, Kruskal-Wallis, Sign test."""
|
|
616
|
+
print("""\
|
|
617
|
+
import numpy as np
|
|
618
|
+
from scipy.stats import mannwhitneyu
|
|
619
|
+
from scipy import stats
|
|
620
|
+
|
|
621
|
+
|
|
622
|
+
## ============================================================
|
|
623
|
+
## 1. MANN-WHITNEY U TEST
|
|
624
|
+
## ============================================================
|
|
625
|
+
|
|
626
|
+
a=np.array([])
|
|
627
|
+
b=np.array([])
|
|
628
|
+
u_stat,p_value=mannwhitneyu(a,b,alternative="")
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
## ============================================================
|
|
632
|
+
## 2. KRUSKAL-WALLIS H TEST
|
|
633
|
+
## ============================================================
|
|
634
|
+
|
|
635
|
+
a=np.array([])
|
|
636
|
+
b=np.array([])
|
|
637
|
+
results=stats.kruskal(a,b)
|
|
638
|
+
|
|
639
|
+
|
|
640
|
+
## ============================================================
|
|
641
|
+
## 3. SIGN TEST
|
|
642
|
+
## ============================================================
|
|
643
|
+
|
|
644
|
+
from statsmodels.stats.descriptivestats import sign_test
|
|
645
|
+
a=np.array([])
|
|
646
|
+
b=np.array([])
|
|
647
|
+
diff=a-b
|
|
648
|
+
stat_val,p_val=sign_test(diff)""")
|
|
649
|
+
|
|
650
|
+
def linearregression(self):
|
|
651
|
+
"""Simple linear regression — sklearn, OLS summary, VIF, prediction, train-test."""
|
|
652
|
+
print("""\
|
|
653
|
+
import pandas as pd
|
|
654
|
+
import numpy as np
|
|
655
|
+
import seaborn as sns
|
|
656
|
+
import matplotlib.pyplot as plt
|
|
657
|
+
from sklearn.linear_model import LinearRegression
|
|
658
|
+
from sklearn.model_selection import (train_test_split,KFold,cross_val_score)
|
|
659
|
+
import statsmodels.formula.api as smf
|
|
660
|
+
from statsmodels.stats.outliers_influence import (variance_inflation_factor)
|
|
661
|
+
import statsmodels.api as sm
|
|
662
|
+
|
|
663
|
+
df = pd.read_csv('data.csv')
|
|
664
|
+
print(df.head())
|
|
665
|
+
print(df.info())
|
|
666
|
+
print(df.corr())
|
|
667
|
+
# Regplot
|
|
668
|
+
sns.regplot(x='FV1',y='targetvariable',data=df)
|
|
669
|
+
plt.show()
|
|
670
|
+
|
|
671
|
+
# SIMPLE LINEAR REGRESSION
|
|
672
|
+
lr = LinearRegression()
|
|
673
|
+
X = df['FV1'].values.reshape(-1,1)
|
|
674
|
+
Y = df['targetvariable']
|
|
675
|
+
|
|
676
|
+
#VIF
|
|
677
|
+
X_vif = pd.DataFrame(X,columns=['FV1'])
|
|
678
|
+
X_vif = sm.add_constant(X_vif)
|
|
679
|
+
vif = pd.DataFrame()
|
|
680
|
+
vif["Feature"] = X_vif.columns
|
|
681
|
+
vif["VIF"] = [variance_inflation_factor(X_vif.values,i)
|
|
682
|
+
for i in range(X_vif.shape[1])]
|
|
683
|
+
print(vif)
|
|
684
|
+
|
|
685
|
+
lr.fit(X,Y)
|
|
686
|
+
print("Intercept =", lr.intercept_)
|
|
687
|
+
print("Coefficient =", lr.coef_)
|
|
688
|
+
print(f"Equation: targetvariable = "f"{lr.intercept_:.4f} + "f"{lr.coef_[0]:.4f} * FV1")
|
|
689
|
+
|
|
690
|
+
# RSS
|
|
691
|
+
RSS = np.sum((lr.intercept_ +lr.coef_ * X -Y.values.reshape(-1,1))**2)
|
|
692
|
+
print("RSS =", RSS)
|
|
693
|
+
|
|
694
|
+
# SUMMARY
|
|
695
|
+
lm_fit = smf.ols('targetvariable ~ FV1',df).fit()
|
|
696
|
+
print(lm_fit.summary())
|
|
697
|
+
|
|
698
|
+
# PREDICTION
|
|
699
|
+
predictions = lm_fit.predict(pd.DataFrame({'FV1':[5,10,15]}))
|
|
700
|
+
print(predictions)
|
|
701
|
+
|
|
702
|
+
# TRAIN TEST SPLIT
|
|
703
|
+
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.20,random_state=123)
|
|
704
|
+
|
|
705
|
+
lr.fit(X_train,Y_train)
|
|
706
|
+
pred_test = lr.predict(X_test)
|
|
707
|
+
|
|
708
|
+
# TEST MSE
|
|
709
|
+
mse_test = np.mean((Y_test - pred_test)**2)
|
|
710
|
+
print("Test MSE =", mse_test)
|
|
711
|
+
print("R-Squared =", lm_fit.rsquared)""")
|
|
712
|
+
|
|
713
|
+
def multilr(self):
    """Print a multiple-linear-regression recipe — sklearn fit, OLS summary,
    interaction term, VIF, scaling, and K-Fold CV.

    Emits a ready-to-adapt code template to stdout and returns None; nothing
    in the template is executed here.

    Fix: the interaction model's score line now prints ``lm_fit2.rsquared``
    (the original template printed ``lm_fit.rsquared`` — the first model's
    R-squared — a second time).
    """
    print("""\
# MULTIPLE LINEAR REGRESSION
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import (train_test_split,KFold,cross_val_score)
from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import (variance_inflation_factor)
import statsmodels.api as sm

df = pd.read_csv('data.csv')
print(df.head())
print(df.info())
print(df.corr())

sns.regplot(x='FV1',y='targetvariable',data=df)
plt.show()
sns.regplot(x='FV2',y='targetvariable',data=df)
plt.show()
sns.regplot(x='FV3',y='targetvariable',data=df)
plt.show()

X = df[['FV1','FV2','FV3']]
Y = df['targetvariable']

X_vif = sm.add_constant(X)
vif = pd.DataFrame()
vif["Feature"] = X_vif.columns
vif["VIF"] = [variance_inflation_factor(X_vif.values,i)
              for i in range(X_vif.shape[1])]
print(vif)

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.20,random_state=123)
lr = LinearRegression()
lr.fit(X_train,Y_train)
print("Intercept =", lr.intercept_)
print("Coefficients =", lr.coef_)
pred_test = lr.predict(X_test)
print(pred_test)
mse = np.mean((Y_test - pred_test)**2)
print("MSE =", mse)
r2_score = lr.score(X_test,Y_test)
print("R-Squared =", r2_score)

#1
lm_fit = smf.ols('targetvariable ~ FV1 + FV2 + FV3',df).fit()
print(lm_fit.summary())
print("R-Squared =", lm_fit.rsquared)
#2
lm_fit2 = smf.ols('targetvariable ~ FV1 * FV2',df).fit()
print(lm_fit2.summary())
print("R-Squared =", lm_fit2.rsquared)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
model = LinearRegression()
kfold = KFold(10,random_state=0,shuffle=True)
mse_cv = cross_val_score(model,X,Y,cv=kfold,scoring='neg_mean_squared_error')
print("Average MSE =", np.mean(-mse_cv))
r2_cv = cross_val_score(model,X,Y,cv=kfold,scoring='r2')
print("R2 Score Per Fold =")
print(r2_cv)
print("Average Accuracy (R2 Score) =",np.mean(r2_cv))""")
|
|
781
|
+
|
|
782
|
+
def ridge(self):
    """Print a ridge-regression recipe: plain Ridge fit, test metrics, then
    RidgeCV alpha search and a refit at the best alpha.

    The template is a single literal written to stdout; returns None.
    """
    # Bind the template to a local first so the print call stays short.
    recipe = """\
import pandas as pd
import numpy as np
from sklearn.linear_model import (Ridge,RidgeCV)

from sklearn.model_selection import (train_test_split)

from sklearn.metrics import (mean_squared_error)

from sklearn.preprocessing import (StandardScaler)

df = pd.read_csv('data.csv')
print(df.head())
print(df.info())
print(df.corr())

X = df[['FV1','FV2','FV3','FV4']]
Y = df['targetvariable']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X_scaled,Y,test_size=0.20,random_state=123)
ridge = Ridge(alpha=10)
ridge.fit(X_train,Y_train)

print("Ridge Coefficients =")
print(pd.Series(ridge.coef_,index=X.columns))

pred = ridge.predict(X_test)
print(pred)

mse = mean_squared_error(Y_test,pred)
print("Test MSE =", mse)

r2 = ridge.score(X_test,Y_test)
print("R-Squared =", r2)

alphas = 10 ** np.linspace(10,-2,100) * 0.5

ridgecv = RidgeCV(alphas=alphas)

ridgecv.fit(X_scaled,Y)

print("Best Alpha =",ridgecv.alpha_)
ridge_best = Ridge(alpha=ridgecv.alpha_)

ridge_best.fit(X_train,Y_train)
pred_best = ridge_best.predict(X_test)

mse_best = mean_squared_error(Y_test,pred_best)

print("Best Model Test MSE =",mse_best)
r2_best = ridge_best.score(X_test,Y_test)

print("Best Model R-Squared =",r2_best)"""
    print(recipe)
|
|
839
|
+
|
|
840
|
+
def lasso(self):
    """Print a lasso-regression recipe: plain Lasso fit, test metrics, then
    a LassoCV alpha search and a refit at the best alpha.

    Writes the template to stdout; returns None.
    """
    # Template text kept in a local; the content is emitted verbatim.
    recipe = """\
import pandas as pd
import numpy as np

from sklearn.linear_model import (Lasso,LassoCV)
from sklearn.model_selection import (train_test_split)
from sklearn.metrics import (
mean_squared_error)
from sklearn.preprocessing import (StandardScaler)

df = pd.read_csv('data.csv')
print(df.head())
print(df.info())
print(df.corr())

X = df[['FV1','FV2','FV3','FV4']]
Y = df['targetvariable']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled,Y,test_size=0.20,random_state=123)
lasso = Lasso(alpha=10)
lasso.fit(X_train,Y_train)

print("Lasso Coefficients =")
print(pd.Series(lasso.coef_,index=X.columns))

print("Notice: Lasso may set some coefficients to EXACTLY 0")
pred_lasso = lasso.predict(X_test)
print(pred_lasso)
mse = mean_squared_error(Y_test,pred_lasso)
print("Test MSE =", mse)
r2 = lasso.score(X_test,Y_test)
print("R-Squared =", r2)

alphas = 10 ** np.linspace(10,-2,100) * 0.5
lassocv = LassoCV(alphas=alphas)
lassocv.fit(X_train,Y_train)
print("Best Alpha =",lassocv.alpha_)
lasso_best = Lasso(alpha=lassocv.alpha_)

lasso_best.fit(X_train,Y_train)
pred_lasso_best = lasso_best.predict(X_test)
print("Lasso Best Model Coefficients =")
print(pd.Series(lasso_best.coef_,index=X.columns))
mse_best = mean_squared_error(Y_test,pred_lasso_best)
print("Best Model Test MSE =",mse_best)
r2_best = lasso_best.score(X_test,Y_test)
print("Best Model R-Squared =",r2_best)"""
    print(recipe)
|
|
890
|
+
|
|
891
|
+
def ridgelasso(self):
    """Print a combined recipe: linear regression, OLS summary, Ridge,
    Lasso, VIF, scaling, and K-Fold cross-validation in one template.

    Writes the template to stdout; returns None.
    """
    # Emit the template verbatim; nothing here is executed.
    recipe = """\
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import (LinearRegression,Ridge,RidgeCV,Lasso,LassoCV)
from sklearn.model_selection import (train_test_split,KFold,cross_val_score)
from sklearn.metrics import (mean_squared_error)
from sklearn.preprocessing import (StandardScaler)
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import (variance_inflation_factor)
import statsmodels.api as sm
df = pd.read_csv('data.csv')
print(df.head())
print(df.info())
print(df.corr())

sns.regplot(x='FV1',y='targetvariable',data=df)
plt.show()
sns.regplot(x='FV2',y='targetvariable',data=df)
plt.show()
sns.regplot(x='FV3',y='targetvariable',data=df)
plt.show()

X = df[['FV1','FV2','FV3','FV4']]
Y = df['targetvariable']

X_vif = sm.add_constant(X)
vif = pd.DataFrame()
vif["Feature"] = X_vif.columns
vif["VIF"] = [variance_inflation_factor(X_vif.values,i)
              for i in range(X_vif.shape[1])]
print(vif)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled,Y,test_size=0.20,random_state=123)
lr = LinearRegression()
lr.fit(X_train,Y_train)
print("Intercept =", lr.intercept_)
print("Coefficients =")
print(pd.Series(lr.coef_,index=X.columns))
pred_lr = lr.predict(X_test)
mse_lr = mean_squared_error(Y_test,pred_lr)
print("Linear Regression MSE =", mse_lr)
r2_lr = lr.score(X_test,Y_test)
print("Linear Regression R-Squared =", r2_lr)

lm_fit = smf.ols('targetvariable ~ FV1 + FV2 + FV3 + FV4',df).fit()
print(lm_fit.summary())
print("R-Squared =", lm_fit.rsquared)
print("Adjusted R-Squared =",lm_fit.rsquared_adj)

ridge = Ridge(alpha=10)
ridge.fit(X_train,Y_train)
print("Ridge Coefficients =")

print(pd.Series(ridge.coef_,index=X.columns))
pred_ridge = ridge.predict(X_test)
mse_ridge = mean_squared_error(Y_test,pred_ridge)
print("Ridge Test MSE =", mse_ridge)
r2_ridge = ridge.score(X_test,Y_test)
print("Ridge R-Squared =", r2_ridge)
alphas = 10 ** np.linspace(10,-2,100) * 0.5
ridgecv = RidgeCV(alphas=alphas)
ridgecv.fit(X_scaled,Y)
print("Best Ridge Alpha =",ridgecv.alpha_)

lasso = Lasso(alpha=10)
lasso.fit(X_train,Y_train)
print("Lasso Coefficients =")
print(pd.Series(lasso.coef_,index=X.columns))
print("Notice: Lasso may set coefficients to EXACTLY 0")
pred_lasso = lasso.predict(X_test)
mse_lasso = mean_squared_error(Y_test,pred_lasso)
print("Lasso Test MSE =", mse_lasso)
r2_lasso = lasso.score(X_test,Y_test)
print("Lasso R-Squared =", r2_lasso)
model = LinearRegression()

kfold = KFold(10,random_state=0,shuffle=True)
r2_cv = cross_val_score(model,X_scaled,Y,cv=kfold,scoring='r2')
print("R2 Score Per Fold =")
print(r2_cv)
print("Average Accuracy (R2 Score) =",np.mean(r2_cv))"""
    print(recipe)
|
|
979
|
+
|
|
980
|
+
def logi(self):
    """Print a logistic-regression recipe: VIF, train/test split, confusion
    matrix, classification report, and K-Fold accuracy.

    Writes the template to stdout; returns None.
    """
    # The template is a plain literal; it is printed, never executed.
    recipe = """\
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (train_test_split,KFold,cross_val_score)
from sklearn.metrics import (confusion_matrix,accuracy_score,classification_report)
from statsmodels.stats.outliers_influence import (variance_inflation_factor)
import statsmodels.api as sm

df = pd.read_csv('data.csv')

print(df.head())
print(df.info())

X = df[['FV1','FV2','FV3','FV4']]
Y = df['targetvariable']

X_vif = sm.add_constant(X)
vif_data = pd.DataFrame()
vif_data["Feature"] = X_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif.values,i)
                   for i in range(X_vif.shape[1])]
print(vif_data)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.20,random_state=123)
print("Train Shape =", X_train.shape)
print("Test Shape =", X_test.shape)
logit = LogisticRegression()
logit.fit(X_train, Y_train)
print("Intercept =", logit.intercept_)
print("Coefficients =", logit.coef_)
Y_pred = logit.predict(X_test)
print("Predictions =")
print(Y_pred)

cm = confusion_matrix(Y_test,Y_pred)
print("Confusion Matrix =")
print(cm)

accuracy = accuracy_score(Y_test,Y_pred)
print("Accuracy =", accuracy)
print(classification_report(Y_test,Y_pred))
K = 5
kfold = KFold(K,random_state=0,shuffle=True)

accuracy_cv = cross_val_score(logit,X,Y,cv=kfold,scoring='accuracy')
print("Accuracy Per Fold =")
print(accuracy_cv)
print("Average Accuracy =",np.mean(accuracy_cv))"""
    print(recipe)
|
|
1030
|
+
|
|
1031
|
+
def annova(self):
    """Print an ANOVA recipe — one-way, two-way, and Tukey post-hoc.

    Writes the template to stdout; returns None.

    Fixes to the template:
    - the Tukey step referenced an undefined ``df1``; the one-way data
      frame is ``df`` (it carries the ``target``/``factor`` columns used
      by the one-way formula above), so the call now uses ``df``;
    - ``twoway_anova`` was computed but never shown — a print was added;
    - the final bare ``tukey._results_table`` expression prints nothing
      when run as a script; it is now ``print(tukey)``.
    """
    print("""\
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd


## ============================================================
## 1. ONE-WAY ANOVA
## ============================================================

df = pd.read_csv('data.csv')

df['column_name'] = df['column_name'].astype('category')

df.head()
df.info()

oneway_fit = ols('target ~ factor', data=df).fit()
oneway_anova = sm.stats.anova_lm(oneway_fit, typ=1)
print(oneway_anova)

# IF 2 Factor variables are given they are to be compared individually

# If the P value of anova is less than 0.05 then Use Tukey for the Factor Variable.


## ============================================================
## 2. TWO-WAY ANOVA
## ============================================================

df2 = pd.read_csv('cars.csv')

twoway_fit = ols('target ~ C(factor1)+C(factor2)+C(factor1):C(factor2)', data=df2).fit()
twoway_anova = sm.stats.anova_lm(twoway_fit, typ=2)
print(twoway_anova)


## ============================================================
## 3. TUKEY POST-HOC TEST
## ============================================================

tukey = pairwise_tukeyhsd(df["target"],groups=df["factor"])
print(tukey)""")
|
|
1076
|
+
|
|
1077
|
+
def manova(self):
    """Print a MANOVA recipe — one-way and two-way via statsmodels.

    Writes the template to stdout; returns None.

    Fix: the closing comment stated the decision rule backwards
    ("If p value > 0.05, we reject H0"); H0 is rejected when p < 0.05.
    """
    print("""\
import pandas as pd
from statsmodels.multivariate.manova import MANOVA


## ============================================================
## 1. ONE-WAY MANOVA
## ============================================================

url = 'https://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv'
df = pd.read_csv(url)
print(df.head())
df.columns = df.columns.str.replace(".", "_")
print(df.head())

maov = MANOVA.from_formula('target1 + target2 + target3 + ... ~ factor', data=df)
print(maov.mv_test())


## ============================================================
## 2. TWO-WAY MANOVA
## ============================================================

maov = MANOVA.from_formula('target1 + target2 + target3 + ... ~ factor1 + factor2', data=df)
print(maov.mv_test())


# If p value < 0.05, we reject H0""")
|
|
1107
|
+
|
|
1108
|
+
|
|
1109
|
+
# Module-level singleton exposing the snippet-printing methods as `scypyy.get`.
get = _Get()
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scypyy
|
|
3
|
+
Version: 0.7.0
|
|
4
|
+
Summary: A curated collection.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Project-URL: Homepage, https://google.com
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Dynamic: requires-python
|
|
14
|
+
|
|
15
|
+
# scypyy
|
|
16
|
+
|
|
17
|
+
A curated collection.
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install scypyy
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Usage
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
import scypyy
|
|
28
|
+
|
|
29
|
+
print(scypyy.get())
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
This prints a help message.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
scypyy
|
scypyy-0.7.0/setup.cfg
ADDED
scypyy-0.7.0/setup.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from pathlib import Path

from setuptools import setup, find_packages

setup(
    name="scypyy",
    version="0.7.0",
    packages=find_packages(),
    description="A curated collection.",
    # Read with an explicit encoding and without leaking a file handle
    # (the original used a bare open("README.md").read()).
    long_description=Path("README.md").read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    python_requires=">=3.8",
    classifiers=[
        "Programming Language :: Python :: 3",
        "Operating System :: OS Independent",
        "Intended Audience :: Science/Research",
        "Topic :: Scientific/Engineering",
    ],
)
|