data-blog-regression 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_blog_regression-0.1.0/PKG-INFO +54 -0
- data_blog_regression-0.1.0/README.md +44 -0
- data_blog_regression-0.1.0/data_blog/__init__.py +41 -0
- data_blog_regression-0.1.0/data_blog/animate/__init__.py +3 -0
- data_blog_regression-0.1.0/data_blog/animate/track.py +160 -0
- data_blog_regression-0.1.0/data_blog/data.py +222 -0
- data_blog_regression-0.1.0/data_blog/linear_regression.py +244 -0
- data_blog_regression-0.1.0/data_blog/utils.py +23 -0
- data_blog_regression-0.1.0/data_blog_regression.egg-info/PKG-INFO +54 -0
- data_blog_regression-0.1.0/data_blog_regression.egg-info/SOURCES.txt +14 -0
- data_blog_regression-0.1.0/data_blog_regression.egg-info/dependency_links.txt +1 -0
- data_blog_regression-0.1.0/data_blog_regression.egg-info/requires.txt +1 -0
- data_blog_regression-0.1.0/data_blog_regression.egg-info/top_level.txt +1 -0
- data_blog_regression-0.1.0/pyproject.toml +15 -0
- data_blog_regression-0.1.0/setup.cfg +4 -0
- data_blog_regression-0.1.0/tests/test_linear.py +195 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: data-blog-regression
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: From-scratch linear and multiple regression library for the Data Blog portfolio
|
|
5
|
+
Author: Your Name
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: numpy>=1.21
|
|
10
|
+
|
|
11
|
+
# Regression Library
|
|
12
|
+
|
|
13
|
+
A lightweight, from-scratch linear and multiple regression library implemented in Python with NumPy. Perfect for educational demonstration, portfolios, and quick testing of regression mechanics.
|
|
14
|
+
|
|
15
|
+
## Features
|
|
16
|
+
|
|
17
|
+
- **Simple Linear Regression**: Supports Batch Gradient Descent with complete loss and coefficient history, and closed-form OLS estimation.
|
|
18
|
+
- **Multiple Linear Regression**: Analytical Normal Equation solver for multidimensional datasets.
|
|
19
|
+
- **Visualization & Animation**: Real-time animation of gradient descent fitting and static regression line plotting with Matplotlib.
|
|
20
|
+
- **Dataset Loaders**: Pre-packaged synthetic generators and subsets of classic real datasets (Iris and Housing).
|
|
21
|
+
|
|
22
|
+
## Quick Start
|
|
23
|
+
|
|
24
|
+
### Installation
|
|
25
|
+
|
|
26
|
+
Clone the repository and install dependencies:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install -r requirements.txt
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### Running the Interactive Demo
|
|
33
|
+
|
|
34
|
+
To explore interactive fits on various datasets and watch the live training animation:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
python main.py
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Library Usage
|
|
41
|
+
|
|
42
|
+
You can easily import and train models in your own scripts:
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from data_blog import LinearRegression, x, y
|
|
46
|
+
|
|
47
|
+
# Instantiate and fit using Gradient Descent
|
|
48
|
+
model = LinearRegression(learning_rate=0.01, epochs=1000)
|
|
49
|
+
model.fit(x, y)
|
|
50
|
+
|
|
51
|
+
print(f"Intercept: {model.b_0:.4f}")
|
|
52
|
+
print(f"Slope: {model.b_1:.4f}")
|
|
53
|
+
print(f"R-squared Score: {model.score(x, y):.4f}")
|
|
54
|
+
```
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# Regression Library
|
|
2
|
+
|
|
3
|
+
A lightweight, from-scratch linear and multiple regression library implemented in Python with NumPy. Perfect for educational demonstration, portfolios, and quick testing of regression mechanics.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Simple Linear Regression**: Supports Batch Gradient Descent with complete loss and coefficient history, and closed-form OLS estimation.
|
|
8
|
+
- **Multiple Linear Regression**: Analytical Normal Equation solver for multidimensional datasets.
|
|
9
|
+
- **Visualization & Animation**: Real-time animation of gradient descent fitting and static regression line plotting with Matplotlib.
|
|
10
|
+
- **Dataset Loaders**: Pre-packaged synthetic generators and subsets of classic real datasets (Iris and Housing).
|
|
11
|
+
|
|
12
|
+
## Quick Start
|
|
13
|
+
|
|
14
|
+
### Installation
|
|
15
|
+
|
|
16
|
+
Clone the repository and install dependencies:
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install -r requirements.txt
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### Running the Interactive Demo
|
|
23
|
+
|
|
24
|
+
To explore interactive fits on various datasets and watch the live training animation:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
python main.py
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### Library Usage
|
|
31
|
+
|
|
32
|
+
You can easily import and train models in your own scripts:
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from data_blog import LinearRegression, x, y
|
|
36
|
+
|
|
37
|
+
# Instantiate and fit using Gradient Descent
|
|
38
|
+
model = LinearRegression(learning_rate=0.01, epochs=1000)
|
|
39
|
+
model.fit(x, y)
|
|
40
|
+
|
|
41
|
+
print(f"Intercept: {model.b_0:.4f}")
|
|
42
|
+
print(f"Slope: {model.b_1:.4f}")
|
|
43
|
+
print(f"R-squared Score: {model.score(x, y):.4f}")
|
|
44
|
+
```
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""
|
|
2
|
+
From-Scratch Linear Regression Library.
|
|
3
|
+
|
|
4
|
+
An educational, lightweight machine learning package implementing simple and multiple
|
|
5
|
+
linear regression algorithms from scratch using NumPy.
|
|
6
|
+
|
|
7
|
+
Features:
|
|
8
|
+
- `LinearRegression`: Supports gradient descent fitting and analytical closed-form (OLS) fitting.
|
|
9
|
+
- `MultipleLinearRegression`: Normal equation solver for multidimensional features.
|
|
10
|
+
- `data_blog.animate`: Visualization utilities to animate gradient descent fitting and plot regression lines.
|
|
11
|
+
- Synthetic and Classic dataset generators/loaders for testing and comparative benchmarking.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from .data import (
|
|
15
|
+
x,
|
|
16
|
+
x_test,
|
|
17
|
+
y,
|
|
18
|
+
y_test,
|
|
19
|
+
x_multi_test,
|
|
20
|
+
y_multi_test,
|
|
21
|
+
make_simple_regression,
|
|
22
|
+
make_multiple_regression,
|
|
23
|
+
load_iris_regression,
|
|
24
|
+
load_housing_regression,
|
|
25
|
+
)
|
|
26
|
+
from .linear_regression import LinearRegression, MultipleLinearRegression
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"LinearRegression",
|
|
30
|
+
"MultipleLinearRegression",
|
|
31
|
+
"x",
|
|
32
|
+
"y",
|
|
33
|
+
"x_test",
|
|
34
|
+
"y_test",
|
|
35
|
+
"x_multi_test",
|
|
36
|
+
"y_multi_test",
|
|
37
|
+
"make_simple_regression",
|
|
38
|
+
"make_multiple_regression",
|
|
39
|
+
"load_iris_regression",
|
|
40
|
+
"load_housing_regression",
|
|
41
|
+
]
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Visualization and animation utilities for monitoring regression fitting.
|
|
3
|
+
|
|
4
|
+
This module provides functions to plot fitted regression lines and create real-time
|
|
5
|
+
animations of the gradient descent optimization process, tracking how loss decreases.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
from typing import Union, List, Optional
|
|
10
|
+
import matplotlib
|
|
11
|
+
|
|
12
|
+
# Headless environment detection: fall back to the non-interactive Agg backend
# only where a missing display variable really means "no GUI available".
# DISPLAY / WAYLAND_DISPLAY are X11 / Wayland concepts; Windows and macOS
# desktops normally leave them unset, so testing DISPLAY alone would force Agg
# there and turn plt.show() into a no-op. An explicit MPLBACKEND always wins.
import sys

if (
    sys.platform not in ("win32", "cygwin", "darwin")
    and "DISPLAY" not in os.environ
    and "WAYLAND_DISPLAY" not in os.environ
    and not os.environ.get("MPLBACKEND")
):
    matplotlib.use("Agg")
|
|
15
|
+
|
|
16
|
+
import matplotlib.pyplot as plt
|
|
17
|
+
from matplotlib.animation import FuncAnimation
|
|
18
|
+
import numpy as np
|
|
19
|
+
from data_blog import LinearRegression, x, y
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def plot_regression_line(
    x: Union[List[float], np.ndarray],
    y: Union[List[float], np.ndarray],
    model: LinearRegression
) -> None:
    """
    Draw the observations as a scatter plot and overlay the model's fitted line.

    The line is evaluated as ``model.b_1 * x + model.b_0`` at 100 evenly spaced
    points between the smallest and largest value of `x`; the finished figure is
    handed to ``plt.show()``.

    Parameters
    ----------
    x : array-like of shape (n_samples,)
        Input feature values.
    y : array-like of shape (n_samples,)
        True target values.
    model : LinearRegression
        A fitted simple linear regression model exposing `b_0` and `b_1`.
    """
    features = np.asarray(x, dtype=float)
    targets = np.asarray(y, dtype=float)

    plt.figure()
    plt.scatter(features, targets, color="blue", label="Data Points")

    # Sample the fitted line across the observed x-range.
    grid = np.linspace(min(features), max(features), 100)
    fitted = model.b_1 * grid + model.b_0
    plt.plot(grid, fitted, color="red", label="Regression Line")

    plt.xlabel("x")
    plt.ylabel("y")
    plt.title("Linear Regression Fit")
    plt.legend()
    plt.show()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def animate_regression_fitting(
    x: Union[List[float], np.ndarray],
    y: Union[List[float], np.ndarray],
    model: LinearRegression,
    save_path: Optional[str] = None
) -> FuncAnimation:
    """
    Animate the regression line and the loss curve over the recorded epochs.

    The left panel shows the data with the line given by each recorded
    ``(b_0, b_1)`` pair in ``model.coeff_history``; the right panel grows the
    ``model.loss_history`` curve up to the same epoch. At most 100 evenly
    spaced epochs are rendered as frames.

    Parameters
    ----------
    x : array-like of shape (n_samples,)
        Input feature values.
    y : array-like of shape (n_samples,)
        True target values.
    model : LinearRegression
        A fitted simple linear regression model with training history.
    save_path : str or None, default=None
        Optional file path (e.g., 'fitting.gif'). When given, the animation is
        saved with the Pillow writer instead of being shown.

    Returns
    -------
    FuncAnimation
        The matplotlib FuncAnimation object.

    Raises
    ------
    ValueError
        If the model has no (or an empty) `coeff_history`.
    """
    coeff_history = getattr(model, "coeff_history", None)
    if not coeff_history:
        raise ValueError(
            "Model has not been fitted or does not have coefficient history."
        )

    x_arr = np.asarray(x, dtype=float)
    y_arr = np.asarray(y, dtype=float)

    # A missing loss history is treated as empty so the line can still animate.
    loss_history = getattr(model, "loss_history", None)
    losses = np.asarray([] if loss_history is None else loss_history, dtype=float)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Left subplot: Data Points and Regression Line
    ax1.scatter(x_arr, y_arr, color="blue", label="Data Points")
    (line,) = ax1.plot([], [], color="red", lw=2, label="Fitting Line")
    ax1.set_xlim(x_arr.min() - 1, x_arr.max() + 1)
    ax1.set_ylim(y_arr.min() - 2, y_arr.max() + 2)
    ax1.set_xlabel("x")
    ax1.set_ylabel("y")
    ax1.set_title("Linear Regression Fit")
    ax1.legend()

    # Right subplot: Loss Curve
    (loss_line,) = ax2.plot([], [], color="purple", lw=2, label="MSE Loss")
    # A diverged fit leaves inf/NaN in the loss history; axis limits must be
    # finite, so scale the y-axis from the finite losses only and fall back to
    # 1.0 when there is nothing positive to scale by.
    finite_losses = losses[np.isfinite(losses)]
    loss_top = float(finite_losses.max()) * 1.1 if finite_losses.size else 0.0
    ax2.set_xlim(0, max(len(losses), 1))
    ax2.set_ylim(0, loss_top if loss_top > 0.0 else 1.0)
    ax2.set_xlabel("Epoch")
    ax2.set_ylabel("Loss (MSE)")
    ax2.set_title("Loss Curve History")
    ax2.legend()

    # Apply the layout once, so shown and saved animations look the same.
    fig.tight_layout()

    def init():
        line.set_data([], [])
        loss_line.set_data([], [])
        return line, loss_line

    # Downsample history to keep the animation fast and smooth (max 100 frames)
    history_len = len(coeff_history)
    max_frames = 100
    if history_len > max_frames:
        indices = np.linspace(0, history_len - 1, max_frames, dtype=int).tolist()
    else:
        indices = list(range(history_len))

    def update(frame_idx):
        idx = indices[frame_idx]
        b_0, b_1 = coeff_history[idx]
        line.set_data(x_arr, b_1 * x_arr + b_0)
        # Clamp to the available losses so x and y always have equal length,
        # even if the two histories were recorded with different lengths.
        shown = min(idx + 1, len(losses))
        loss_line.set_data(np.arange(shown), losses[:shown])
        return line, loss_line

    anim = FuncAnimation(
        fig,
        update,
        frames=len(indices),
        init_func=init,
        interval=50,
        blit=True,
    )

    if save_path:
        anim.save(save_path, writer="pillow")
    else:
        plt.show()

    return anim
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
if __name__ == "__main__":
    # Smoke test when the module is executed directly: fit the bundled toy
    # data for a few epochs, then exercise both plotting helpers.
    print("Fitting model...")
    demo_model = LinearRegression(epochs=5, learning_rate=0.01)
    demo_model.fit(x, y)

    print("Generating static plot...")
    plot_regression_line(x, y, demo_model)

    print("Generating interactive animation...")
    animate_regression_fitting(x, y, demo_model)
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utility functions and datasets for training and testing regression models.
|
|
3
|
+
|
|
4
|
+
This module provides original toy datasets, synthetic data generators for simple
|
|
5
|
+
and multiple linear regression, and subsets of classic datasets (Iris and Housing).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
from typing import Tuple, List, Optional, Union
|
|
10
|
+
|
|
11
|
+
# Toy dataset for simple linear regression demonstration (8 data points)
|
|
12
|
+
x: List[float] = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
|
|
13
|
+
y: List[float] = [2.1, 3.9, 6.2, 7.8, 10.3, 11.7, 13.9, 16.2]
|
|
14
|
+
|
|
15
|
+
# Test dataset for simple linear regression validation
|
|
16
|
+
x_test: List[float] = [7.0, 8.0]
|
|
17
|
+
y_test: List[float] = [13.9, 16.2]
|
|
18
|
+
|
|
19
|
+
# Sample multiple features data (5 samples, 2 features)
|
|
20
|
+
x_multi_test: np.ndarray = np.array([[1.0, 4.0], [2.0, 1.0], [3.0, 0.0], [4.0, 2.0], [5.0, 5.0]])
|
|
21
|
+
y_multi_test: np.ndarray = np.array([0.0, 9.0, 14.0, 13.0, 10.0])
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def make_simple_regression(
    n_samples: int = 100,
    slope: float = 2.0,
    intercept: float = 1.0,
    noise: float = 1.0,
    random_seed: Optional[int] = 42
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Generate synthetic data for simple linear regression.

    Generates data according to: y = slope * x + intercept + gaussian_noise,
    with x drawn uniformly from [-10, 10).

    Parameters
    ----------
    n_samples : int, default=100
        Number of data samples to generate.
    slope : float, default=2.0
        The true coefficient (slope) of the data.
    intercept : float, default=1.0
        The true intercept (y-intercept) of the data.
    noise : float, default=1.0
        Standard deviation of the gaussian noise added to the target.
    random_seed : int or None, default=42
        Seed for reproducibility. With a seed, a private generator is used and
        NumPy's global random state is left untouched. With None, values are
        drawn from NumPy's global random state.

    Returns
    -------
    x_arr : np.ndarray of shape (n_samples,)
        Generated input features.
    y_arr : np.ndarray of shape (n_samples,)
        Generated target values.
    """
    # RandomState(seed) produces the same stream that np.random.seed(seed)
    # followed by np.random.* calls would, but without reseeding the caller's
    # global generator as a hidden side effect.
    rng = np.random if random_seed is None else np.random.RandomState(random_seed)

    x_arr = rng.uniform(-10.0, 10.0, n_samples)
    noise_arr = rng.normal(0.0, noise, n_samples)
    y_arr = slope * x_arr + intercept + noise_arr

    return x_arr, y_arr
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def make_multiple_regression(
    n_samples: int = 100,
    n_features: int = 3,
    weights: Optional[Union[List[float], np.ndarray]] = None,
    intercept: float = 1.0,
    noise: float = 1.0,
    random_seed: Optional[int] = 42
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Generate synthetic data for multiple linear regression.

    Generates data according to: y = X @ weights + intercept + gaussian_noise,
    with every feature drawn uniformly from [-5, 5).

    Parameters
    ----------
    n_samples : int, default=100
        Number of data samples to generate.
    n_features : int, default=3
        Number of features to generate. Ignored if `weights` is specified.
    weights : array-like of shape (n_features,) or None, default=None
        True coefficients for each feature. If None, weights are drawn
        uniformly from [1, 5).
    intercept : float, default=1.0
        The true intercept of the data.
    noise : float, default=1.0
        Standard deviation of the gaussian noise added to the target.
    random_seed : int or None, default=42
        Seed for reproducibility. With a seed, a private generator is used and
        NumPy's global random state is left untouched. With None, values are
        drawn from NumPy's global random state.

    Returns
    -------
    X : np.ndarray of shape (n_samples, n_features)
        Generated input feature matrix.
    y : np.ndarray of shape (n_samples,)
        Generated target values.

    Raises
    ------
    ValueError
        If `weights` is given but is not one-dimensional.
    """
    # RandomState(seed) reproduces the np.random.seed(seed) stream without
    # reseeding the caller's global generator.
    rng = np.random if random_seed is None else np.random.RandomState(random_seed)

    if weights is None:
        actual_weights = rng.uniform(1.0, 5.0, n_features)
    else:
        actual_weights = np.asarray(weights, dtype=float)
        if actual_weights.ndim != 1:
            # A column vector would make `X @ weights + noise` broadcast to an
            # (n_samples, n_samples) matrix instead of a target vector.
            raise ValueError(
                "weights must be one-dimensional, got shape "
                f"{actual_weights.shape}."
            )
        n_features = len(actual_weights)

    X = rng.uniform(-5.0, 5.0, size=(n_samples, n_features))
    noise_arr = rng.normal(0.0, noise, n_samples)
    y = X @ actual_weights + intercept + noise_arr

    return X, y
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def load_iris_regression() -> Tuple[np.ndarray, np.ndarray]:
    """
    Return a 40-sample slice of the Iris dataset for simple linear regression.

    Features (X) : Petal Width (cm)
    Target (y)   : Petal Length (cm)

    Returns
    -------
    X : np.ndarray of shape (40,)
        Petal Width features.
    y : np.ndarray of shape (40,)
        Petal Length target values.
    """
    # Each row is [petal width, petal length]; samples come from the Setosa,
    # Versicolor, and Virginica classes.
    width_length_rows = [
        [0.2, 1.4], [0.2, 1.4], [0.2, 1.3], [0.2, 1.5],
        [0.2, 1.4], [0.4, 1.7], [0.3, 1.4], [0.2, 1.5],
        [0.2, 1.4], [0.1, 1.5], [0.2, 1.5], [0.2, 1.6],
        [0.1, 1.4], [0.1, 1.1], [0.2, 1.2], [0.4, 1.5],
        [0.4, 1.3], [0.3, 1.4], [0.3, 1.7], [0.3, 1.5],
        [1.4, 4.7], [1.5, 4.5], [1.5, 4.9], [1.3, 4.0],
        [1.5, 4.6], [1.3, 4.5], [1.6, 4.7], [1.0, 3.3],
        [1.3, 4.6], [1.4, 3.9], [2.5, 6.0], [1.9, 5.1],
        [2.1, 5.9], [1.8, 5.6], [2.2, 5.8], [2.1, 6.6],
        [1.7, 4.5], [1.8, 4.8], [1.8, 5.4], [2.5, 5.7],
    ]
    table = np.array(width_length_rows)
    # Column 0 is the feature (width), column 1 the target (length).
    return table[:, 0], table[:, 1]
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def load_housing_regression() -> Tuple[np.ndarray, np.ndarray]:
    """
    Return a 20-row toy housing price dataset for multiple linear regression.

    Features (X) : Matrix of shape (20, 3) representing:
                   - Square Footage (in hundreds of sq ft)
                   - Number of Bedrooms
                   - Age of the property (in years)
    Target (y)   : Array of shape (20,) representing house price (in thousands of dollars)

    Returns
    -------
    X : np.ndarray of shape (20, 3)
        Feature matrix.
    y : np.ndarray of shape (20,)
        Target values (House Prices).
    """
    # Row layout: [square footage, bedrooms, age, price]
    records = [
        [15.0, 3, 10, 350.0], [20.0, 4, 5, 480.0],
        [12.0, 2, 15, 270.0], [18.0, 3, 8, 410.0],
        [25.0, 4, 2, 590.0], [14.0, 3, 20, 290.0],
        [22.0, 4, 12, 450.0], [16.0, 3, 14, 330.0],
        [30.0, 5, 1, 720.0], [10.0, 2, 30, 180.0],
        [19.0, 3, 6, 435.0], [21.0, 4, 4, 495.0],
        [13.0, 2, 18, 285.0], [17.0, 3, 11, 365.0],
        [24.0, 4, 3, 560.0], [11.0, 2, 25, 210.0],
        [28.0, 5, 2, 680.0], [15.5, 3, 9, 360.0],
        [20.5, 4, 7, 470.0], [23.0, 4, 5, 520.0],
    ]
    table = np.array(records)
    # The first three columns are the features; the fourth is the price.
    return table[:, :3], table[:, 3]
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
"""
|
|
2
|
+
From-scratch implementations of Linear Regression and Multiple Linear Regression.
|
|
3
|
+
|
|
4
|
+
These classes are built using Python and NumPy, demonstrating both gradient descent
|
|
5
|
+
and closed-form analytical solutions (Ordinary Least Squares/Normal Equation).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
from typing import Union, List, Tuple, Optional
|
|
10
|
+
from .utils import func
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class LinearRegression:
    """
    Simple Linear Regression model supporting both Gradient Descent and
    Ordinary Least Squares (OLS) closed-form estimation.

    Parameters
    ----------
    learning_rate : float, default=0.001
        The step size used for updating parameters during gradient descent.
    epochs : int, default=1000
        The number of training iterations for gradient descent.

    Attributes
    ----------
    b_0 : float
        The intercept of the regression line (bias). Default is 0.0.
    b_1 : float
        The slope of the regression line (weight). Default is 0.0.
    loss_history : list of float
        Mean Squared Error (MSE) recorded at each gradient descent epoch.
        Empty after `fit_ols`, which has no epochs.
    coeff_history : list of tuple of (float, float)
        A history of (intercept, slope) tuples recorded at each epoch or step.
    coeff : tuple of (float, float)
        The final trained coefficients (b_0, b_1).
    """

    def __init__(self, learning_rate: float = 0.001, epochs: int = 1000) -> None:
        self.learning_rate: float = learning_rate
        self.epochs: int = epochs
        self.b_0: float = 0.0
        self.b_1: float = 0.0
        self.loss_history: List[float] = []
        self.coeff_history: List[Tuple[float, float]] = []
        self.coeff: Tuple[float, float] = (0.0, 0.0)

    @staticmethod
    def _as_training_arrays(
        x: Union[List[float], np.ndarray],
        y: Union[List[float], np.ndarray],
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Convert x and y to float arrays, rejecting empty or mismatched input."""
        x_arr = np.asarray(x, dtype=float)
        y_arr = np.asarray(y, dtype=float)
        if x_arr.size == 0:
            raise ValueError("x and y must contain at least one sample.")
        if x_arr.shape != y_arr.shape:
            # Without this check NumPy may silently broadcast the two arrays.
            raise ValueError(
                f"x and y must have the same shape, got {x_arr.shape} and {y_arr.shape}."
            )
        return x_arr, y_arr

    def fit(self, x: Union[List[float], np.ndarray], y: Union[List[float], np.ndarray]) -> None:
        """
        Fit the model coefficients using Batch Gradient Descent.

        Descent starts from the current `b_0` / `b_1` (0.0 on a fresh model),
        so calling `fit` again continues from the previous coefficients. Both
        histories are cleared at the start of every call.

        Parameters
        ----------
        x : array-like of shape (n_samples,)
            Training input features.
        y : array-like of shape (n_samples,)
            Target values.

        Raises
        ------
        ValueError
            If the input is empty or `x` and `y` differ in shape.
        """
        x_arr, y_arr = self._as_training_arrays(x, y)
        n = len(x_arr)

        self.loss_history = []
        self.coeff_history = []

        for _ in range(self.epochs):
            # Vectorized prediction with the coefficients from the previous step
            y_pred = self.b_1 * x_arr + self.b_0
            errors = y_pred - y_arr

            # Gradients of the MSE with respect to intercept and slope
            grad_b0 = (2.0 / n) * np.sum(errors)
            grad_b1 = (2.0 / n) * np.sum(errors * x_arr)

            # Update weights
            self.b_0 -= self.learning_rate * grad_b0
            self.b_1 -= self.learning_rate * grad_b1

            # The recorded loss is the MSE *before* this epoch's update, while
            # the recorded coefficients are the values *after* it.
            self.loss_history.append(float(np.mean(errors ** 2)))
            self.coeff_history.append((self.b_0, self.b_1))

        self.coeff = (self.b_0, self.b_1)

    def fit_ols(self, x: Union[List[float], np.ndarray], y: Union[List[float], np.ndarray]) -> None:
        """
        Fit the model coefficients using Ordinary Least Squares (OLS) closed-form equations.

        If every `x` value is identical the slope is set to 0.0 and the
        intercept to the mean of `y`.

        Parameters
        ----------
        x : array-like of shape (n_samples,)
            Training input features.
        y : array-like of shape (n_samples,)
            Target values.

        Raises
        ------
        ValueError
            If the input is empty or `x` and `y` differ in shape.
        """
        x_arr, y_arr = self._as_training_arrays(x, y)

        mean_x, mean_y = np.mean(x_arr), np.mean(y_arr)
        numerator = np.sum((x_arr - mean_x) * (y_arr - mean_y))
        denominator = np.sum((x_arr - mean_x) ** 2)

        if denominator == 0.0:
            self.b_1 = 0.0
        else:
            self.b_1 = float(numerator / denominator)

        self.b_0 = float(mean_y - self.b_1 * mean_x)
        self.coeff = (self.b_0, self.b_1)
        self.coeff_history = [(self.b_0, self.b_1)]
        # Drop any per-epoch losses left over from an earlier gradient descent
        # fit; they do not describe the closed-form solution.
        self.loss_history = []

    def predict(self, x: Union[float, List[float], np.ndarray]) -> Union[float, List[float]]:
        """
        Predict target values using the linear model: y = b_1 * x + b_0.

        Parameters
        ----------
        x : float or array-like
            Input features to predict.

        Returns
        -------
        float or list of float
            A float for scalar input; a list of floats for list, tuple, or
            array input.
        """
        if isinstance(x, (list, tuple, np.ndarray)):
            # One vectorized expression instead of a Python-level loop.
            predictions = self.b_1 * np.asarray(x, dtype=float) + self.b_0
            return predictions.tolist()
        return float(self.b_1 * x + self.b_0)

    def score(self, x: Union[List[float], np.ndarray], y: Union[List[float], np.ndarray]) -> float:
        """
        Calculate the Coefficient of Determination (R^2 score) of the prediction.

        Parameters
        ----------
        x : array-like of shape (n_samples,)
            Test input features.
        y : array-like of shape (n_samples,)
            True target values.

        Returns
        -------
        float
            R^2 score. A value of 1.0 indicates perfect fit. Returns 0.0 when
            `y` has zero variance, where R^2 is undefined.
        """
        x_arr = np.asarray(x, dtype=float)
        y_arr = np.asarray(y, dtype=float)
        y_pred = np.array(self.predict(x_arr), dtype=float)

        ss_res = np.sum((y_arr - y_pred) ** 2)
        ss_tot = np.sum((y_arr - np.mean(y_arr)) ** 2)

        if ss_tot == 0.0:
            return 0.0
        return float(1.0 - (ss_res / ss_tot))
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class MultipleLinearRegression:
    """
    Multiple Linear Regression model fitted by closed-form least squares.

    Attributes
    ----------
    weights : np.ndarray or None
        Coefficients for the input features (excluding the intercept).
        None until `fit` has been called.
    intercept : float or None
        The intercept of the regression hyperplane (bias).
        None until `fit` has been called.
    """

    def __init__(self) -> None:
        self.weights: Optional[np.ndarray] = None
        self.intercept: Optional[float] = None

    def fit(self, X: Union[List[List[float]], np.ndarray], y: Union[List[float], np.ndarray]) -> None:
        """
        Fit the multiple linear model by solving the least-squares problem
        min ||X_b @ theta - y||, where X_b is X with a leading column of ones.

        For a full-rank design matrix this is the Normal Equation solution
        theta = (X^T * X)^(-1) * X^T * y. A least-squares solver is used
        instead of an explicit inverse so that collinear features or fewer
        samples than parameters give the minimum-norm solution rather than a
        singular-matrix error or a numerically unstable result.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training input features.
        y : array-like of shape (n_samples,)
            Target values.
        """
        X_arr = np.asarray(X, dtype=float)
        y_arr = np.asarray(y, dtype=float)

        # Add bias term (column of 1s) to feature matrix
        X_b = np.c_[np.ones(X_arr.shape[0]), X_arr]

        # lstsq also returns residuals, rank, and singular values; only the
        # coefficient vector is needed here.
        theta, *_ = np.linalg.lstsq(X_b, y_arr, rcond=None)
        self.intercept = float(theta[0])
        self.weights = theta[1:]

    def predict(self, X: Union[List[List[float]], np.ndarray]) -> np.ndarray:
        """
        Predict target values for multiple features.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input features to predict.

        Returns
        -------
        np.ndarray of shape (n_samples,)
            Predicted target values.

        Raises
        ------
        ValueError
            If the model has not been fitted prior to prediction.
        """
        if self.weights is None or self.intercept is None:
            raise ValueError("Model must be fitted before calling predict.")
        return np.asarray(X, dtype=float) @ self.weights + self.intercept

    def score(self, X: Union[List[List[float]], np.ndarray], y: Union[List[float], np.ndarray]) -> float:
        """
        Calculate the Coefficient of Determination (R^2 score) of the prediction.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test input features.
        y : array-like of shape (n_samples,)
            True target values.

        Returns
        -------
        float
            R^2 score. A value of 1.0 indicates perfect fit. Returns 0.0 when
            `y` has zero variance, where R^2 is undefined.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        X_arr = np.asarray(X, dtype=float)
        y_arr = np.asarray(y, dtype=float)
        preds = self.predict(X_arr)
        ss_res = np.sum((y_arr - preds) ** 2)
        ss_tot = np.sum((y_arr - np.mean(y_arr)) ** 2)

        if ss_tot == 0.0:
            return 0.0
        return float(1.0 - (ss_res / ss_tot))
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utility functions for linear regression operations.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
def func(x: float, m: float = 0.0, b: float = 0.0) -> float:
    """
    Evaluate the straight line y = m * x + b at a single point.

    Parameters
    ----------
    x : float
        The input feature value.
    m : float, default=0.0
        The slope of the line (weight).
    b : float, default=0.0
        The y-intercept (bias).

    Returns
    -------
    float
        The value of the line at `x`.
    """
    scaled = m * x
    return scaled + b
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: data-blog-regression
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: From-scratch linear and multiple regression library for the Data Blog portfolio
|
|
5
|
+
Author: Your Name
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: numpy>=1.21
|
|
10
|
+
|
|
11
|
+
# Regression Library
|
|
12
|
+
|
|
13
|
+
A lightweight, from-scratch linear and multiple regression library implemented in Python with NumPy. Perfect for educational demonstration, portfolios, and quick testing of regression mechanics.
|
|
14
|
+
|
|
15
|
+
## Features
|
|
16
|
+
|
|
17
|
+
- **Simple Linear Regression**: Supports Batch Gradient Descent with complete loss and coefficient history, and closed-form OLS estimation.
|
|
18
|
+
- **Multiple Linear Regression**: Analytical Normal Equation solver for multidimensional datasets.
|
|
19
|
+
- **Visualization & Animation**: Real-time animation of gradient descent fitting and static regression line plotting with Matplotlib.
|
|
20
|
+
- **Dataset Loaders**: Pre-packaged synthetic generators and subsets of classic real datasets (Iris and Housing).
|
|
21
|
+
|
|
22
|
+
## Quick Start
|
|
23
|
+
|
|
24
|
+
### Installation
|
|
25
|
+
|
|
26
|
+
Clone the repository and install dependencies:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install -r requirements.txt
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### Running the Interactive Demo
|
|
33
|
+
|
|
34
|
+
To explore interactive fits on various datasets and watch the live training animation:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
python main.py
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Library Usage
|
|
41
|
+
|
|
42
|
+
You can easily import and train models in your own scripts:
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from data_blog import LinearRegression, x, y
|
|
46
|
+
|
|
47
|
+
# Instantiate and fit using Gradient Descent
|
|
48
|
+
model = LinearRegression(learning_rate=0.01, epochs=1000)
|
|
49
|
+
model.fit(x, y)
|
|
50
|
+
|
|
51
|
+
print(f"Intercept: {model.b_0:.4f}")
|
|
52
|
+
print(f"Slope: {model.b_1:.4f}")
|
|
53
|
+
print(f"R-squared Score: {model.score(x, y):.4f}")
|
|
54
|
+
```
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
data_blog/__init__.py
|
|
4
|
+
data_blog/data.py
|
|
5
|
+
data_blog/linear_regression.py
|
|
6
|
+
data_blog/utils.py
|
|
7
|
+
data_blog/animate/__init__.py
|
|
8
|
+
data_blog/animate/track.py
|
|
9
|
+
data_blog_regression.egg-info/PKG-INFO
|
|
10
|
+
data_blog_regression.egg-info/SOURCES.txt
|
|
11
|
+
data_blog_regression.egg-info/dependency_links.txt
|
|
12
|
+
data_blog_regression.egg-info/requires.txt
|
|
13
|
+
data_blog_regression.egg-info/top_level.txt
|
|
14
|
+
tests/test_linear.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
numpy>=1.21
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
data_blog
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "data-blog-regression"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "From-scratch linear and multiple regression library for the Data Blog portfolio"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [ { name = "Your Name" } ]
|
|
13
|
+
dependencies = [
|
|
14
|
+
"numpy>=1.21"
|
|
15
|
+
]
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
import math
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pytest
|
|
5
|
+
from sklearn.linear_model import LinearRegression as SklearnLinearRegression
|
|
6
|
+
from sklearn.metrics import r2_score
|
|
7
|
+
|
|
8
|
+
from data_blog import (
|
|
9
|
+
LinearRegression,
|
|
10
|
+
MultipleLinearRegression,
|
|
11
|
+
x,
|
|
12
|
+
x_multi_test,
|
|
13
|
+
x_test,
|
|
14
|
+
y,
|
|
15
|
+
y_multi_test,
|
|
16
|
+
y_test,
|
|
17
|
+
make_simple_regression,
|
|
18
|
+
make_multiple_regression,
|
|
19
|
+
load_iris_regression,
|
|
20
|
+
load_housing_regression,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@pytest.fixture
def model():
    """Provide a ``(gd_model, ols_model)`` pair fitted on the packaged ``x``/``y`` data.

    The first model is built with ``learning_rate=0.01`` and ``epochs=1000``
    and trained through ``fit``; the second uses default settings and is
    trained through ``fit_ols``.
    """
    gd_model = LinearRegression(learning_rate=0.01, epochs=1000)
    ols_model = LinearRegression()
    ols_model.fit_ols(x, y)
    gd_model.fit(x, y)
    # No teardown is needed: the previous ``del`` after ``yield`` only unbound
    # a local name, so a plain return is equivalent and clearer.
    return gd_model, ols_model
|
|
32
|
+
|
|
33
|
+
def test_fit_changes_coefficients():
    """Ten epochs of ``fit`` must move at least one coefficient off its starting value."""
    regressor = LinearRegression(learning_rate=0.01, epochs=10)
    start_b0 = regressor.b_0
    start_b1 = regressor.b_1
    regressor.fit(x, y)
    # De Morgan form of "not (both unchanged)".
    assert not math.isclose(regressor.b_0, start_b0) or not math.isclose(regressor.b_1, start_b1)
|
|
40
|
+
|
|
41
|
+
def test_score_calculates_r_squared(model):
    """``score`` on the training data returns a ``float`` inside ``[0, 1]``."""
    gd_model, _ = model
    value = gd_model.score(x, y)
    assert isinstance(value, float)
    assert 0 <= value and value <= 1
|
|
46
|
+
|
|
47
|
+
def test_predict_returns_expected_values(model):
    """``predict`` yields one numeric value per training input."""
    gd_model, _ = model
    outputs = gd_model.predict(x)
    assert len(outputs) == len(x)
    numeric_flags = [isinstance(value, (int, float)) for value in outputs]
    assert all(numeric_flags)
|
|
52
|
+
|
|
53
|
+
def test_coefficients_close_to_expected(model):
    """The ``fit``-trained model lands within 0.1 of intercept 0.0 and slope 2.0."""
    gd_model, _ = model
    target_intercept, target_slope = 0.0, 2.0
    assert math.isclose(gd_model.b_0, target_intercept, abs_tol=0.1)
    assert math.isclose(gd_model.b_1, target_slope, abs_tol=0.1)
    # ``coeff`` must expose the same pair as the individual attributes.
    assert gd_model.coeff == (gd_model.b_0, gd_model.b_1)
|
|
60
|
+
|
|
61
|
+
def test_r_square_to_sklearn(model):
    """R^2 of the ``fit``-trained model agrees with scikit-learn's to within 0.01."""
    gd_model, _ = model
    reference = SklearnLinearRegression()
    reference.fit(np.array(x).reshape(-1, 1), y)
    reference_r2 = reference.score(np.array(x).reshape(-1, 1), y)
    own_r2 = gd_model.score(x, y)
    assert math.isclose(own_r2, reference_r2, abs_tol=0.01)
|
|
68
|
+
|
|
69
|
+
def test_predict_on_test_data(model):
    """Predictions of the ``fit``-trained model on ``x_test`` reach R^2 above 0.9."""
    gd_model, _ = model
    outputs = gd_model.predict(x_test)
    assert len(outputs) == len(x_test)
    numeric_flags = [isinstance(value, (int, float)) for value in outputs]
    assert all(numeric_flags)
    held_out_r2 = r2_score(y_test, outputs)
    assert held_out_r2 > 0.9
|
|
76
|
+
|
|
77
|
+
def test_ols_coefficients_close_to_expected(model):
    """The ``fit_ols`` model lands within 0.1 of intercept 0.0 and slope 2.0."""
    _, ols_model = model
    target_intercept, target_slope = 0.0, 2.0
    assert math.isclose(ols_model.b_0, target_intercept, abs_tol=0.1)
    assert math.isclose(ols_model.b_1, target_slope, abs_tol=0.1)
    # ``coeff`` must expose the same pair as the individual attributes.
    assert ols_model.coeff == (ols_model.b_0, ols_model.b_1)
|
|
84
|
+
|
|
85
|
+
def test_ols_r_square_to_sklearn(model):
    """R^2 of the ``fit_ols`` model agrees with scikit-learn's to within 0.01."""
    _, ols_model = model
    reference = SklearnLinearRegression()
    reference.fit(np.array(x).reshape(-1, 1), y)
    reference_r2 = reference.score(np.array(x).reshape(-1, 1), y)
    own_r2 = ols_model.score(x, y)
    assert math.isclose(own_r2, reference_r2, abs_tol=0.01)
|
|
92
|
+
|
|
93
|
+
def test_ols_predict_on_test_data(model):
    """Predictions of the ``fit_ols`` model on ``x_test`` reach R^2 above 0.9."""
    _, ols_model = model
    outputs = ols_model.predict(x_test)
    assert len(outputs) == len(x_test)
    numeric_flags = [isinstance(value, (int, float)) for value in outputs]
    assert all(numeric_flags)
    held_out_r2 = r2_score(y_test, outputs)
    assert held_out_r2 > 0.9
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def test_multiple_regression_vs_sklearn():
    """Intercept, weights, predictions and R^2 all match scikit-learn to 1e-5."""
    ours = MultipleLinearRegression()
    ours.fit(x_multi_test, y_multi_test)

    reference = SklearnLinearRegression()
    reference.fit(x_multi_test, y_multi_test)

    # Fitted parameters.
    assert math.isclose(ours.intercept, reference.intercept_, abs_tol=1e-5)
    assert np.allclose(ours.weights, reference.coef_, atol=1e-5)

    # Point predictions.
    our_preds = ours.predict(x_multi_test)
    reference_preds = reference.predict(x_multi_test)
    assert np.allclose(our_preds, reference_preds, atol=1e-5)

    # Goodness of fit.
    our_r2 = ours.score(x_multi_test, y_multi_test)
    reference_r2 = reference.score(x_multi_test, y_multi_test)
    assert math.isclose(our_r2, reference_r2, abs_tol=1e-5)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def test_plot_regression_line():
    """``plot_regression_line`` must call ``matplotlib.pyplot.show`` exactly once.

    ``show`` is mocked so no window is opened. Because of that, nothing closes
    whatever figure the plotting helper creates (presumably through pyplot -
    its body is not visible here), so every open figure is closed afterwards
    to keep state from leaking into later tests.
    """
    from unittest.mock import patch

    import matplotlib.pyplot as plt

    from data_blog.animate import plot_regression_line

    model = LinearRegression(learning_rate=0.01, epochs=10)
    model.fit(x, y)

    try:
        with patch("matplotlib.pyplot.show") as mock_show:
            plot_regression_line(x, y, model)
        mock_show.assert_called_once()
    finally:
        # Runs even when the assertion fails, so figures never pile up.
        plt.close("all")
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def test_animate_regression_fitting():
    """``animate_regression_fitting`` returns an object and shows the plot once.

    ``show`` is mocked so no window is opened. Because of that, nothing closes
    whatever figure the animation helper creates (presumably through pyplot -
    its body is not visible here), so every open figure is closed afterwards
    to keep state from leaking into later tests.
    """
    from unittest.mock import patch

    import matplotlib.pyplot as plt

    from data_blog.animate import animate_regression_fitting

    model = LinearRegression(learning_rate=0.01, epochs=10)
    model.fit(x, y)

    try:
        with patch("matplotlib.pyplot.show") as mock_show:
            anim = animate_regression_fitting(x, y, model)
            assert anim is not None
        mock_show.assert_called_once()
    finally:
        # Runs even when an assertion fails, so figures never pile up.
        plt.close("all")
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def test_make_simple_regression():
    """Generated data has the requested size and OLS recovers slope 3.0 / intercept -2.0."""
    features, targets = make_simple_regression(
        n_samples=50,
        slope=3.0,
        intercept=-2.0,
        noise=0.5,
        random_seed=123,
    )
    assert len(features) == 50
    assert len(targets) == 50

    # Closed-form fit on the synthetic sample.
    regressor = LinearRegression()
    regressor.fit_ols(features, targets)

    # Estimates must be within 0.2 of the generating parameters.
    assert math.isclose(regressor.b_1, 3.0, abs_tol=0.2)
    assert math.isclose(regressor.b_0, -2.0, abs_tol=0.2)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def test_make_multiple_regression():
    """Generated data has shape (100, 4) and the fit recovers the generating parameters."""
    features, targets = make_multiple_regression(
        n_samples=100,
        n_features=4,
        weights=[1.5, -2.0, 0.5, 3.0],
        intercept=10.0,
        noise=0.1,
        random_seed=42,
    )
    assert features.shape == (100, 4)
    assert len(targets) == 100

    regressor = MultipleLinearRegression()
    regressor.fit(features, targets)

    # Estimates must be within 0.1 of the generating parameters.
    assert math.isclose(regressor.intercept, 10.0, abs_tol=0.1)
    assert np.allclose(regressor.weights, [1.5, -2.0, 0.5, 3.0], atol=0.1)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def test_load_iris_regression():
    """The Iris subset holds 40 samples and an OLS fit scores above 0.90."""
    features, targets = load_iris_regression()
    assert len(features) == 40
    assert len(targets) == 40

    regressor = LinearRegression()
    regressor.fit_ols(features, targets)

    r_squared = regressor.score(features, targets)
    # Iris petal width vs length has a strong correlation.
    assert r_squared > 0.90
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def test_load_housing_regression():
    """The housing subset has shape (20, 3) and a multiple-regression fit scores above 0.95."""
    features, targets = load_housing_regression()
    assert features.shape == (20, 3)
    assert len(targets) == 20

    regressor = MultipleLinearRegression()
    regressor.fit(features, targets)

    r_squared = regressor.score(features, targets)
    # Our synthetic housing data fits very well.
    assert r_squared > 0.95
|
|
194
|
+
|
|
195
|
+
|