data-blog-regression 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,54 @@
1
+ Metadata-Version: 2.4
2
+ Name: data-blog-regression
3
+ Version: 0.1.0
4
+ Summary: From-scratch linear and multiple regression library for the Data Blog portfolio
5
+ Author: Your Name
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: numpy>=1.21
10
+
11
+ # Regression Library
12
+
13
+ A lightweight, from-scratch linear and multiple regression library implemented in Python with NumPy. Perfect for educational demonstration, portfolios, and quick testing of regression mechanics.
14
+
15
+ ## Features
16
+
17
+ - **Simple Linear Regression**: Supports Batch Gradient Descent with complete loss and coefficient history, and closed-form OLS estimation.
18
+ - **Multiple Linear Regression**: Analytical Normal Equation solver for multidimensional datasets.
19
+ - **Visualization & Animation**: Real-time animation of gradient descent fitting and static regression line plotting with Matplotlib.
20
+ - **Dataset Loaders**: Pre-packaged synthetic generators and subsets of classic real datasets (Iris and Housing).
21
+
22
+ ## Quick Start
23
+
24
+ ### Installation
25
+
26
+ Clone the repository and install dependencies:
27
+
28
+ ```bash
29
+ pip install -r requirements.txt
30
+ ```
31
+
32
+ ### Running the Interactive Demo
33
+
34
+ To explore interactive fits on various datasets and watch the live training animation:
35
+
36
+ ```bash
37
+ python main.py
38
+ ```
39
+
40
+ ### Library Usage
41
+
42
+ You can easily import and train models in your own scripts:
43
+
44
+ ```python
45
+ from data_blog import LinearRegression, x, y
46
+
47
+ # Instantiate and fit using Gradient Descent
48
+ model = LinearRegression(learning_rate=0.01, epochs=1000)
49
+ model.fit(x, y)
50
+
51
+ print(f"Intercept: {model.b_0:.4f}")
52
+ print(f"Slope: {model.b_1:.4f}")
53
+ print(f"R-squared Score: {model.score(x, y):.4f}")
54
+ ```
@@ -0,0 +1,44 @@
1
+ # Regression Library
2
+
3
+ A lightweight, from-scratch linear and multiple regression library implemented in Python with NumPy. Perfect for educational demonstration, portfolios, and quick testing of regression mechanics.
4
+
5
+ ## Features
6
+
7
+ - **Simple Linear Regression**: Supports Batch Gradient Descent with complete loss and coefficient history, and closed-form OLS estimation.
8
+ - **Multiple Linear Regression**: Analytical Normal Equation solver for multidimensional datasets.
9
+ - **Visualization & Animation**: Real-time animation of gradient descent fitting and static regression line plotting with Matplotlib.
10
+ - **Dataset Loaders**: Pre-packaged synthetic generators and subsets of classic real datasets (Iris and Housing).
11
+
12
+ ## Quick Start
13
+
14
+ ### Installation
15
+
16
+ Clone the repository and install dependencies:
17
+
18
+ ```bash
19
+ pip install -r requirements.txt
20
+ ```
21
+
22
+ ### Running the Interactive Demo
23
+
24
+ To explore interactive fits on various datasets and watch the live training animation:
25
+
26
+ ```bash
27
+ python main.py
28
+ ```
29
+
30
+ ### Library Usage
31
+
32
+ You can easily import and train models in your own scripts:
33
+
34
+ ```python
35
+ from data_blog import LinearRegression, x, y
36
+
37
+ # Instantiate and fit using Gradient Descent
38
+ model = LinearRegression(learning_rate=0.01, epochs=1000)
39
+ model.fit(x, y)
40
+
41
+ print(f"Intercept: {model.b_0:.4f}")
42
+ print(f"Slope: {model.b_1:.4f}")
43
+ print(f"R-squared Score: {model.score(x, y):.4f}")
44
+ ```
@@ -0,0 +1,41 @@
1
+ """
2
+ From-Scratch Linear Regression Library.
3
+
4
+ An educational, lightweight machine learning package implementing simple and multiple
5
+ linear regression algorithms from scratch using NumPy.
6
+
7
+ Features:
8
+ - `LinearRegression`: Supports gradient descent fitting and analytical closed-form (OLS) fitting.
9
+ - `MultipleLinearRegression`: Normal equation solver for multidimensional features.
10
+ - `data_blog.animate`: Visualization utilities to animate gradient descent fitting and plot regression lines.
11
+ - Synthetic and Classic dataset generators/loaders for testing and comparative benchmarking.
12
+ """
13
+
14
+ from .data import (
15
+ x,
16
+ x_test,
17
+ y,
18
+ y_test,
19
+ x_multi_test,
20
+ y_multi_test,
21
+ make_simple_regression,
22
+ make_multiple_regression,
23
+ load_iris_regression,
24
+ load_housing_regression,
25
+ )
26
+ from .linear_regression import LinearRegression, MultipleLinearRegression
27
+
28
+ __all__ = [
29
+ "LinearRegression",
30
+ "MultipleLinearRegression",
31
+ "x",
32
+ "y",
33
+ "x_test",
34
+ "y_test",
35
+ "x_multi_test",
36
+ "y_multi_test",
37
+ "make_simple_regression",
38
+ "make_multiple_regression",
39
+ "load_iris_regression",
40
+ "load_housing_regression",
41
+ ]
@@ -0,0 +1,3 @@
1
+ from .track import plot_regression_line, animate_regression_fitting
2
+
3
+ __all__ = ["plot_regression_line", "animate_regression_fitting"]
@@ -0,0 +1,160 @@
1
+ """
2
+ Visualization and animation utilities for monitoring regression fitting.
3
+
4
+ This module provides functions to plot fitted regression lines and create real-time
5
+ animations of the gradient descent optimization process, tracking how loss decreases.
6
+ """
7
+
8
+ import os
9
+ from typing import Union, List, Optional
10
+ import matplotlib
11
+
12
import sys

# Headless environment detection: fall back to the non-interactive Agg backend
# only where a missing display really means "no GUI available".
#
# DISPLAY / WAYLAND_DISPLAY are X11/Wayland concepts. Windows and macOS never
# define them, so checking DISPLAY unconditionally forced Agg on those
# platforms and turned ``plt.show()`` into a silent no-op on ordinary desktops.
_NO_UNIX_DISPLAY = (
    sys.platform not in ("win32", "cygwin", "darwin")
    and "DISPLAY" not in os.environ
    and "WAYLAND_DISPLAY" not in os.environ
)

# An explicit MPLBACKEND always wins: never override the user's own choice.
if _NO_UNIX_DISPLAY and not os.environ.get("MPLBACKEND"):
    matplotlib.use("Agg")
15
+
16
+ import matplotlib.pyplot as plt
17
+ from matplotlib.animation import FuncAnimation
18
+ import numpy as np
19
+ from data_blog import LinearRegression, x, y
20
+
21
+
22
def plot_regression_line(
    x: Union[List[float], np.ndarray],
    y: Union[List[float], np.ndarray],
    model: LinearRegression
) -> None:
    """
    Draw the data as a scatter plot and overlay the model's fitted line.

    A new figure is created, populated, and handed to ``plt.show()``.

    Parameters
    ----------
    x : array-like of shape (n_samples,)
        Input feature values.
    y : array-like of shape (n_samples,)
        True target values.
    model : LinearRegression
        Simple linear regression model; only its ``b_0`` (intercept) and
        ``b_1`` (slope) attributes are read.
    """
    features = np.asarray(x, dtype=float)
    targets = np.asarray(y, dtype=float)

    plt.figure()
    plt.scatter(features, targets, color="blue", label="Data Points")

    # 100 evenly spaced abscissae spanning the observed feature range
    lowest, highest = min(features), max(features)
    grid = np.linspace(lowest, highest, 100)
    fitted = model.b_1 * grid + model.b_0
    plt.plot(grid, fitted, color="red", label="Regression Line")

    plt.xlabel("x")
    plt.ylabel("y")
    plt.title("Linear Regression Fit")
    plt.legend()
    plt.show()
54
+
55
+
56
def animate_regression_fitting(
    x: Union[List[float], np.ndarray],
    y: Union[List[float], np.ndarray],
    model: LinearRegression,
    save_path: Optional[str] = None
) -> FuncAnimation:
    """
    Animate how the regression line and the loss curve evolve over training.

    The left panel shows the data with the line for each recorded coefficient
    pair; the right panel shows the loss values recorded up to that step.

    The model must expose a non-empty ``coeff_history``. ``loss_history`` may be
    shorter than ``coeff_history`` (or empty, as after ``fit_ols``); the loss
    curve then simply shows whatever values exist instead of failing at draw
    time with mismatched x/y lengths.

    Parameters
    ----------
    x : array-like of shape (n_samples,)
        Input feature values.
    y : array-like of shape (n_samples,)
        True target values.
    model : LinearRegression
        A fitted simple linear regression model with training history.
    save_path : str or None, default=None
        Optional file path (e.g., 'fitting.gif'). When given, the animation is
        written with the "pillow" writer and no window is shown.

    Returns
    -------
    FuncAnimation
        The matplotlib FuncAnimation object. Keep a reference to it for as long
        as the animation should keep running.

    Raises
    ------
    ValueError
        If the model has no (or an empty) ``coeff_history``.
    """
    coeff_history = getattr(model, "coeff_history", None)
    if not coeff_history:
        raise ValueError(
            "Model has not been fitted or does not have coefficient history."
        )

    # Snapshot both histories so a later refit of `model` cannot change the
    # frames of an animation that is still being rendered.
    coeff_history = list(coeff_history)
    loss_history = list(getattr(model, "loss_history", None) or [])

    x_arr = np.asarray(x, dtype=float)
    y_arr = np.asarray(y, dtype=float)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Left subplot: Data Points and Regression Line
    ax1.scatter(x_arr, y_arr, color="blue", label="Data Points")
    (line,) = ax1.plot([], [], color="red", lw=2, label="Fitting Line")
    ax1.set_xlim(np.min(x_arr) - 1, np.max(x_arr) + 1)
    ax1.set_ylim(np.min(y_arr) - 2, np.max(y_arr) + 2)
    ax1.set_xlabel("x")
    ax1.set_ylabel("y")
    ax1.set_title("Linear Regression Fit")
    ax1.legend()

    # Right subplot: Loss Curve
    (loss_line,) = ax2.plot([], [], color="purple", lw=2, label="MSE Loss")
    # Axis limits must be finite and non-degenerate: ignore inf/nan losses from
    # a diverged run and fall back to 1.0 when there is nothing positive to show.
    finite_losses = [value for value in loss_history if np.isfinite(value)]
    loss_top = max(finite_losses) * 1.1 if finite_losses else 0.0
    ax2.set_xlim(0, max(len(loss_history), 1))
    ax2.set_ylim(0, loss_top if loss_top > 0 else 1.0)
    ax2.set_xlabel("Epoch")
    ax2.set_ylabel("Loss (MSE)")
    ax2.set_title("Loss Curve History")
    ax2.legend()

    # Lay the panels out once, up front, so saved files get the same spacing
    # as the interactive window.
    fig.tight_layout()

    def init():
        line.set_data([], [])
        loss_line.set_data([], [])
        return line, loss_line

    # Downsample history to keep the animation fast and smooth (max 100 frames)
    history_len = len(coeff_history)
    max_frames = 100
    if history_len > max_frames:
        indices = np.linspace(0, history_len - 1, max_frames, dtype=int)
    else:
        indices = list(range(history_len))

    def update(frame_idx):
        idx = int(indices[frame_idx])
        b_0, b_1 = coeff_history[idx]
        line.set_data(x_arr, b_1 * x_arr + b_0)
        # Slice first, then size the x-range from the slice: the two sequences
        # always have equal length even when loss_history is short or empty.
        losses = loss_history[: idx + 1]
        loss_line.set_data(range(len(losses)), losses)
        return line, loss_line

    anim = FuncAnimation(
        fig,
        update,
        frames=len(indices),
        init_func=init,
        interval=50,
        blit=True,
    )

    if save_path:
        anim.save(save_path, writer="pillow")
    else:
        plt.show()

    return anim
150
+
151
+
152
if __name__ == "__main__":
    # Manual smoke test: fit the bundled toy data briefly, then show both the
    # static plot and the training animation.
    print("Fitting model...")
    demo_model = LinearRegression(learning_rate=0.01, epochs=5)
    demo_model.fit(x, y)

    print("Generating static plot...")
    plot_regression_line(x, y, demo_model)

    print("Generating interactive animation...")
    animate_regression_fitting(x, y, demo_model)
@@ -0,0 +1,222 @@
1
+ """
2
+ Utility functions and datasets for training and testing regression models.
3
+
4
+ This module provides original toy datasets, synthetic data generators for simple
5
+ and multiple linear regression, and subsets of classic datasets (Iris and Housing).
6
+ """
7
+
8
+ import numpy as np
9
+ from typing import Tuple, List, Optional, Union
10
+
11
# Toy training set for simple linear regression: x runs over 1.0 .. 8.0
x: List[float] = [float(step) for step in range(1, 9)]
y: List[float] = [2.1, 3.9, 6.2, 7.8, 10.3, 11.7, 13.9, 16.2]

# Validation pairs for simple linear regression. These are the last two
# training points, so they measure fit quality, not generalisation.
x_test: List[float] = x[-2:]
y_test: List[float] = y[-2:]

# Small multi-feature sample: 5 rows, 2 feature columns, one target per row
x_multi_test: np.ndarray = np.array(
    [
        [1.0, 4.0],
        [2.0, 1.0],
        [3.0, 0.0],
        [4.0, 2.0],
        [5.0, 5.0],
    ]
)
y_multi_test: np.ndarray = np.array([0.0, 9.0, 14.0, 13.0, 10.0])
22
+
23
+
24
def make_simple_regression(
    n_samples: int = 100,
    slope: float = 2.0,
    intercept: float = 1.0,
    noise: float = 1.0,
    random_seed: Optional[int] = 42
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Generate synthetic data for simple linear regression.

    Generates data according to: y = slope * x + intercept + gaussian_noise,
    with x drawn uniformly from [-10, 10).

    Parameters
    ----------
    n_samples : int, default=100
        Number of data samples to generate.
    slope : float, default=2.0
        The true coefficient (slope) of the data.
    intercept : float, default=1.0
        The true intercept (y-intercept) of the data.
    noise : float, default=1.0
        Standard deviation of the gaussian noise added to the target.
    random_seed : int or None, default=42
        Seed for a generator private to this call. With None, values are drawn
        from NumPy's process-wide legacy generator and are not reproducible.

    Returns
    -------
    x_arr : np.ndarray of shape (n_samples,)
        Generated input features.
    y_arr : np.ndarray of shape (n_samples,)
        Generated target values.
    """
    # A private RandomState yields exactly the stream that
    # ``np.random.seed(random_seed)`` followed by ``np.random.<dist>`` would,
    # but without reseeding the global generator behind the caller's back.
    rng = np.random if random_seed is None else np.random.RandomState(random_seed)

    # Draw order (features, then noise) is part of the reproducible output.
    x_arr = rng.uniform(-10.0, 10.0, n_samples)
    noise_arr = rng.normal(0.0, noise, n_samples)
    y_arr = slope * x_arr + intercept + noise_arr

    return x_arr, y_arr
64
+
65
+
66
def make_multiple_regression(
    n_samples: int = 100,
    n_features: int = 3,
    weights: Optional[Union[List[float], np.ndarray]] = None,
    intercept: float = 1.0,
    noise: float = 1.0,
    random_seed: Optional[int] = 42
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Generate synthetic data for multiple linear regression.

    Generates data according to: y = X @ weights + intercept + gaussian_noise,
    with every feature drawn uniformly from [-5, 5).

    Parameters
    ----------
    n_samples : int, default=100
        Number of data samples to generate.
    n_features : int, default=3
        Number of features to generate. Ignored if `weights` is specified.
    weights : array-like of shape (n_features,) or None, default=None
        True coefficients for each feature. If None, weights are drawn
        uniformly from [1, 5). A single scalar is treated as one feature.
    intercept : float, default=1.0
        The true intercept of the data.
    noise : float, default=1.0
        Standard deviation of the gaussian noise added to the target.
    random_seed : int or None, default=42
        Seed for a generator private to this call. With None, values are drawn
        from NumPy's process-wide legacy generator and are not reproducible.

    Returns
    -------
    X : np.ndarray of shape (n_samples, n_features)
        Generated input feature matrix.
    y : np.ndarray of shape (n_samples,)
        Generated target values.
    """
    # A private RandomState reproduces the legacy seeded stream exactly while
    # leaving the global NumPy generator untouched for the caller.
    rng = np.random if random_seed is None else np.random.RandomState(random_seed)

    # Draw order (weights, features, noise) is part of the reproducible output.
    if weights is None:
        actual_weights = rng.uniform(1.0, 5.0, n_features)
    else:
        # atleast_1d lets a bare scalar act as a one-feature weight vector
        # instead of failing on len() of a 0-d array.
        actual_weights = np.atleast_1d(np.asarray(weights, dtype=float))
        n_features = len(actual_weights)

    X = rng.uniform(-5.0, 5.0, size=(n_samples, n_features))
    noise_arr = rng.normal(0.0, noise, n_samples)
    y = X @ actual_weights + intercept + noise_arr

    return X, y
115
+
116
+
117
+ def load_iris_regression() -> Tuple[np.ndarray, np.ndarray]:
118
+ """
119
+ Load a small slice of the classic Iris dataset for simple linear regression.
120
+
121
+ Features (X) : Petal Width (cm)
122
+ Target (y) : Petal Length (cm)
123
+
124
+ Returns
125
+ -------
126
+ X : np.ndarray of shape (40,)
127
+ Petal Width features.
128
+ y : np.ndarray of shape (40,)
129
+ Petal Length target values.
130
+ """
131
+ # 40 real samples from Setosa, Versicolor, and Virginica classes
132
+ iris_raw = [
133
+ [0.2, 1.4],
134
+ [0.2, 1.4],
135
+ [0.2, 1.3],
136
+ [0.2, 1.5],
137
+ [0.2, 1.4],
138
+ [0.4, 1.7],
139
+ [0.3, 1.4],
140
+ [0.2, 1.5],
141
+ [0.2, 1.4],
142
+ [0.1, 1.5],
143
+ [0.2, 1.5],
144
+ [0.2, 1.6],
145
+ [0.1, 1.4],
146
+ [0.1, 1.1],
147
+ [0.2, 1.2],
148
+ [0.4, 1.5],
149
+ [0.4, 1.3],
150
+ [0.3, 1.4],
151
+ [0.3, 1.7],
152
+ [0.3, 1.5],
153
+ [1.4, 4.7],
154
+ [1.5, 4.5],
155
+ [1.5, 4.9],
156
+ [1.3, 4.0],
157
+ [1.5, 4.6],
158
+ [1.3, 4.5],
159
+ [1.6, 4.7],
160
+ [1.0, 3.3],
161
+ [1.3, 4.6],
162
+ [1.4, 3.9],
163
+ [2.5, 6.0],
164
+ [1.9, 5.1],
165
+ [2.1, 5.9],
166
+ [1.8, 5.6],
167
+ [2.2, 5.8],
168
+ [2.1, 6.6],
169
+ [1.7, 4.5],
170
+ [1.8, 4.8],
171
+ [1.8, 5.4],
172
+ [2.5, 5.7],
173
+ ]
174
+ data = np.array(iris_raw)
175
+ X = data[:, 0] # Petal Width
176
+ y = data[:, 1] # Petal Length
177
+ return X, y
178
+
179
+
180
def load_housing_regression() -> Tuple[np.ndarray, np.ndarray]:
    """
    Return an embedded 20-row toy housing table for multiple linear regression.

    Each hard-coded row holds three features followed by the price:
        - Square Footage (in hundreds of sq ft)
        - Number of Bedrooms
        - Age of the property (in years)
        - House price (in thousands of dollars)

    Returns
    -------
    X : np.ndarray of shape (20, 3)
        Feature matrix (the first three columns).
    y : np.ndarray of shape (20,)
        Target values (House Prices, the last column).
    """
    # (square footage, bedrooms, age, price)
    listings = (
        (15.0, 3, 10, 350.0),
        (20.0, 4, 5, 480.0),
        (12.0, 2, 15, 270.0),
        (18.0, 3, 8, 410.0),
        (25.0, 4, 2, 590.0),
        (14.0, 3, 20, 290.0),
        (22.0, 4, 12, 450.0),
        (16.0, 3, 14, 330.0),
        (30.0, 5, 1, 720.0),
        (10.0, 2, 30, 180.0),
        (19.0, 3, 6, 435.0),
        (21.0, 4, 4, 495.0),
        (13.0, 2, 18, 285.0),
        (17.0, 3, 11, 365.0),
        (24.0, 4, 3, 560.0),
        (11.0, 2, 25, 210.0),
        (28.0, 5, 2, 680.0),
        (15.5, 3, 9, 360.0),
        (20.5, 4, 7, 470.0),
        (23.0, 4, 5, 520.0),
    )
    table = np.array(listings)
    features = table[:, :-1]  # everything except the trailing price column
    prices = table[:, -1]
    return features, prices
@@ -0,0 +1,244 @@
1
+ """
2
+ From-scratch implementations of Linear Regression and Multiple Linear Regression.
3
+
4
+ These classes are built using Python and NumPy, demonstrating both gradient descent
5
+ and closed-form analytical solutions (Ordinary Least Squares/Normal Equation).
6
+ """
7
+
8
+ import numpy as np
9
+ from typing import Union, List, Tuple, Optional
10
+ from .utils import func
11
+
12
+
13
class LinearRegression:
    """
    Simple Linear Regression model supporting both Gradient Descent and
    Ordinary Least Squares (OLS) closed-form estimation.

    Parameters
    ----------
    learning_rate : float, default=0.001
        The step size used for updating parameters during gradient descent.
    epochs : int, default=1000
        The number of training iterations for gradient descent.

    Attributes
    ----------
    b_0 : float
        The intercept of the regression line (bias). Default is 0.0.
    b_1 : float
        The slope of the regression line (weight). Default is 0.0.
    loss_history : list of float
        Mean Squared Error recorded once per gradient-descent epoch. Entry i is
        the loss measured *before* the i-th parameter update. Empty after
        `fit_ols`, which has no epochs.
    coeff_history : list of tuple of (float, float)
        (intercept, slope) tuples recorded *after* each gradient-descent
        update, or the single closed-form solution after `fit_ols`.
    coeff : tuple of (float, float)
        The final trained coefficients (b_0, b_1).
    """

    def __init__(self, learning_rate: float = 0.001, epochs: int = 1000) -> None:
        self.learning_rate: float = learning_rate
        self.epochs: int = epochs
        self.b_0: float = 0.0
        self.b_1: float = 0.0
        self.loss_history: List[float] = []
        self.coeff_history: List[Tuple[float, float]] = []
        self.coeff: Tuple[float, float] = (0.0, 0.0)

    @staticmethod
    def _as_training_arrays(
        x: Union[List[float], np.ndarray],
        y: Union[List[float], np.ndarray],
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Return x and y as flat float arrays, rejecting empty or unequal-length input."""
        # Flattening makes an (n, 1) column behave like (n,) instead of
        # silently broadcasting against y into an (n, n) error matrix.
        x_arr = np.asarray(x, dtype=float).ravel()
        y_arr = np.asarray(y, dtype=float).ravel()
        if x_arr.size == 0:
            raise ValueError("Cannot fit a regression model on empty data.")
        if x_arr.size != y_arr.size:
            raise ValueError(
                f"x and y must contain the same number of samples "
                f"(got {x_arr.size} and {y_arr.size})."
            )
        return x_arr, y_arr

    def fit(self, x: Union[List[float], np.ndarray], y: Union[List[float], np.ndarray]) -> None:
        """
        Fit the model coefficients using Batch Gradient Descent.

        Both histories are cleared first. The coefficients are NOT reset, so a
        second call continues from the current `b_0` / `b_1`.

        Parameters
        ----------
        x : array-like of shape (n_samples,)
            Training input features.
        y : array-like of shape (n_samples,)
            Target values.

        Raises
        ------
        ValueError
            If the data is empty or x and y differ in length.
        """
        x_arr, y_arr = self._as_training_arrays(x, y)
        n = x_arr.size

        self.loss_history = []
        self.coeff_history = []

        for _ in range(self.epochs):
            # Residuals of the current line, computed for all samples at once
            errors = self.b_1 * x_arr + self.b_0 - y_arr

            # Gradients of the MSE with respect to intercept and slope
            grad_b0 = (2.0 / n) * np.sum(errors)
            grad_b1 = (2.0 / n) * np.sum(errors * x_arr)

            # float() keeps the attributes plain Python floats, matching
            # fit_ols and the declared attribute types.
            self.b_0 = float(self.b_0 - self.learning_rate * grad_b0)
            self.b_1 = float(self.b_1 - self.learning_rate * grad_b1)

            self.loss_history.append(float(np.mean(errors ** 2)))
            self.coeff_history.append((self.b_0, self.b_1))

        self.coeff = (self.b_0, self.b_1)

    def fit_ols(self, x: Union[List[float], np.ndarray], y: Union[List[float], np.ndarray]) -> None:
        """
        Fit the model coefficients using Ordinary Least Squares (OLS) closed-form equations.

        When all x values are identical the slope is set to 0.0 and the
        intercept becomes the mean of y.

        Parameters
        ----------
        x : array-like of shape (n_samples,)
            Training input features.
        y : array-like of shape (n_samples,)
            Target values.

        Raises
        ------
        ValueError
            If the data is empty or x and y differ in length.
        """
        x_arr, y_arr = self._as_training_arrays(x, y)

        mean_x, mean_y = np.mean(x_arr), np.mean(y_arr)
        numerator = np.sum((x_arr - mean_x) * (y_arr - mean_y))
        denominator = np.sum((x_arr - mean_x) ** 2)

        if denominator == 0.0:
            self.b_1 = 0.0
        else:
            self.b_1 = float(numerator / denominator)

        self.b_0 = float(mean_y - self.b_1 * mean_x)
        self.coeff = (self.b_0, self.b_1)
        self.coeff_history = [(self.b_0, self.b_1)]
        # Drop any loss curve left by an earlier gradient-descent fit: it
        # would describe a different training run than coeff_history.
        self.loss_history = []

    def predict(self, x: Union[float, List[float], np.ndarray]) -> Union[float, List[float]]:
        """
        Predict target values using the linear model: y = b_1 * x + b_0.

        Parameters
        ----------
        x : float or array-like
            Input features to predict.

        Returns
        -------
        float or list of float
            A single float for scalar input; a list of floats for list, tuple
            or array input.
        """
        if isinstance(x, (list, tuple, np.ndarray)):
            # One vectorized expression in place of a per-element Python loop;
            # tolist() converts the result back to plain Python floats.
            return (self.b_1 * np.asarray(x, dtype=float) + self.b_0).tolist()
        return float(self.b_1 * x + self.b_0)

    def score(self, x: Union[List[float], np.ndarray], y: Union[List[float], np.ndarray]) -> float:
        """
        Calculate the Coefficient of Determination (R^2 score) of the prediction.

        Parameters
        ----------
        x : array-like of shape (n_samples,)
            Test input features.
        y : array-like of shape (n_samples,)
            True target values.

        Returns
        -------
        float
            R^2 score. A value of 1.0 indicates perfect fit. Returns 0.0 when
            all y values are identical (the score is undefined there).
        """
        x_arr = np.asarray(x, dtype=float).ravel()
        y_arr = np.asarray(y, dtype=float).ravel()
        y_pred = np.asarray(self.predict(x_arr), dtype=float)

        ss_res = np.sum((y_arr - y_pred) ** 2)
        ss_tot = np.sum((y_arr - np.mean(y_arr)) ** 2)

        if ss_tot == 0.0:
            return 0.0
        return float(1.0 - (ss_res / ss_tot))
155
+
156
+
157
class MultipleLinearRegression:
    """
    Multiple Linear Regression model solved in closed form by least squares.

    Attributes
    ----------
    weights : np.ndarray or None
        Coefficients for the input features (excluding the intercept).
        None until `fit` has been called.
    intercept : float or None
        The intercept of the regression hyperplane (bias).
        None until `fit` has been called.
    """

    def __init__(self) -> None:
        self.weights: Optional[np.ndarray] = None
        self.intercept: Optional[float] = None

    def fit(self, X: Union[List[List[float]], np.ndarray], y: Union[List[float], np.ndarray]) -> None:
        """
        Fit the multiple linear model by minimising ||X_b @ theta - y||^2.

        For a full-rank design matrix this is the Normal Equation solution
        theta = (X^T * X)^(-1) * X^T * y. It is computed with a least-squares
        solver instead of an explicit inverse, so collinear or redundant
        features yield the minimum-norm solution rather than a singular-matrix
        error, and ill-conditioned data loses less precision.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training input features. A 1-D array is treated as a single
            feature column.
        y : array-like of shape (n_samples,)
            Target values.

        Raises
        ------
        ValueError
            If X contains no samples.
        """
        X_arr = np.asarray(X, dtype=float)
        y_arr = np.asarray(y, dtype=float)

        if X_arr.ndim == 1:
            X_arr = X_arr.reshape(-1, 1)
        if X_arr.shape[0] == 0:
            raise ValueError("Cannot fit a regression model on empty data.")

        # Add bias term (column of 1s) to feature matrix
        X_b = np.column_stack([np.ones(X_arr.shape[0]), X_arr])

        # rcond=None selects NumPy's machine-precision based rank cutoff
        theta = np.linalg.lstsq(X_b, y_arr, rcond=None)[0]
        self.intercept = float(theta[0])
        self.weights = np.array(theta[1:], dtype=float)

    def predict(self, X: Union[List[List[float]], np.ndarray]) -> np.ndarray:
        """
        Predict target values for multiple features.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input features to predict.

        Returns
        -------
        np.ndarray of shape (n_samples,)
            Predicted target values.

        Raises
        ------
        ValueError
            If the model has not been fitted prior to prediction.
        """
        X_arr = np.asarray(X, dtype=float)
        if self.weights is None or self.intercept is None:
            raise ValueError("Model must be fitted before calling predict.")
        return X_arr @ self.weights + self.intercept

    def score(self, X: Union[List[List[float]], np.ndarray], y: Union[List[float], np.ndarray]) -> float:
        """
        Calculate the Coefficient of Determination (R^2 score) of the prediction.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test input features.
        y : array-like of shape (n_samples,)
            True target values.

        Returns
        -------
        float
            R^2 score. A value of 1.0 indicates perfect fit. Returns 0.0 when
            all y values are identical (the score is undefined there).

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        X_arr = np.asarray(X, dtype=float)
        y_arr = np.asarray(y, dtype=float)
        preds = self.predict(X_arr)
        ss_res = np.sum((y_arr - preds) ** 2)
        ss_tot = np.sum((y_arr - np.mean(y_arr)) ** 2)

        if ss_tot == 0.0:
            return 0.0
        return float(1.0 - (ss_res / ss_tot))
@@ -0,0 +1,23 @@
1
+ """
2
+ Utility functions for linear regression operations.
3
+ """
4
+
5
def func(x: float, m: float = 0.0, b: float = 0.0) -> float:
    """
    Evaluate the straight line y = m * x + b at a single point.

    Parameters
    ----------
    x : float
        The input feature value.
    m : float, default=0.0
        The slope of the line (weight).
    b : float, default=0.0
        The y-intercept (bias).

    Returns
    -------
    float
        The value of the line at `x`.
    """
    scaled = m * x
    return scaled + b
@@ -0,0 +1,54 @@
1
+ Metadata-Version: 2.4
2
+ Name: data-blog-regression
3
+ Version: 0.1.0
4
+ Summary: From-scratch linear and multiple regression library for the Data Blog portfolio
5
+ Author: Your Name
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: numpy>=1.21
10
+
11
+ # Regression Library
12
+
13
+ A lightweight, from-scratch linear and multiple regression library implemented in Python with NumPy. Perfect for educational demonstration, portfolios, and quick testing of regression mechanics.
14
+
15
+ ## Features
16
+
17
+ - **Simple Linear Regression**: Supports Batch Gradient Descent with complete loss and coefficient history, and closed-form OLS estimation.
18
+ - **Multiple Linear Regression**: Analytical Normal Equation solver for multidimensional datasets.
19
+ - **Visualization & Animation**: Real-time animation of gradient descent fitting and static regression line plotting with Matplotlib.
20
+ - **Dataset Loaders**: Pre-packaged synthetic generators and subsets of classic real datasets (Iris and Housing).
21
+
22
+ ## Quick Start
23
+
24
+ ### Installation
25
+
26
+ Clone the repository and install dependencies:
27
+
28
+ ```bash
29
+ pip install -r requirements.txt
30
+ ```
31
+
32
+ ### Running the Interactive Demo
33
+
34
+ To explore interactive fits on various datasets and watch the live training animation:
35
+
36
+ ```bash
37
+ python main.py
38
+ ```
39
+
40
+ ### Library Usage
41
+
42
+ You can easily import and train models in your own scripts:
43
+
44
+ ```python
45
+ from data_blog import LinearRegression, x, y
46
+
47
+ # Instantiate and fit using Gradient Descent
48
+ model = LinearRegression(learning_rate=0.01, epochs=1000)
49
+ model.fit(x, y)
50
+
51
+ print(f"Intercept: {model.b_0:.4f}")
52
+ print(f"Slope: {model.b_1:.4f}")
53
+ print(f"R-squared Score: {model.score(x, y):.4f}")
54
+ ```
@@ -0,0 +1,14 @@
1
+ README.md
2
+ pyproject.toml
3
+ data_blog/__init__.py
4
+ data_blog/data.py
5
+ data_blog/linear_regression.py
6
+ data_blog/utils.py
7
+ data_blog/animate/__init__.py
8
+ data_blog/animate/track.py
9
+ data_blog_regression.egg-info/PKG-INFO
10
+ data_blog_regression.egg-info/SOURCES.txt
11
+ data_blog_regression.egg-info/dependency_links.txt
12
+ data_blog_regression.egg-info/requires.txt
13
+ data_blog_regression.egg-info/top_level.txt
14
+ tests/test_linear.py
@@ -0,0 +1,15 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "data-blog-regression"
7
+ version = "0.1.0"
8
+ description = "From-scratch linear and multiple regression library for the Data Blog portfolio"
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = { text = "MIT" }
12
+ authors = [ { name = "Your Name" } ]
13
+ dependencies = [
14
+ "numpy>=1.21"
15
+ ]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,195 @@
1
+ import math
2
+
3
+ import numpy as np
4
+ import pytest
5
+ from sklearn.linear_model import LinearRegression as SklearnLinearRegression
6
+ from sklearn.metrics import r2_score
7
+
8
+ from data_blog import (
9
+ LinearRegression,
10
+ MultipleLinearRegression,
11
+ x,
12
+ x_multi_test,
13
+ x_test,
14
+ y,
15
+ y_multi_test,
16
+ y_test,
17
+ make_simple_regression,
18
+ make_multiple_regression,
19
+ load_iris_regression,
20
+ load_housing_regression,
21
+ )
22
+
23
+
24
@pytest.fixture
def model():
    """Provide a (gradient-descent model, OLS model) pair, both fitted on the toy data."""
    gd_model = LinearRegression(learning_rate=0.01, epochs=1000)
    ols_model = LinearRegression()
    ols_model.fit_ols(x, y)
    gd_model.fit(x, y)
    # No teardown is needed; the pair is handed straight to the test.
    return gd_model, ols_model
32
+
33
def test_fit_changes_coefficients():
    """Ten epochs of gradient descent must move at least one coefficient."""
    regressor = LinearRegression(learning_rate=0.01, epochs=10)
    start_b0 = regressor.b_0
    start_b1 = regressor.b_1

    regressor.fit(x, y)

    intercept_unchanged = math.isclose(regressor.b_0, start_b0)
    slope_unchanged = math.isclose(regressor.b_1, start_b1)
    assert not (intercept_unchanged and slope_unchanged)
40
+
41
def test_score_calculates_r_squared(model):
    """score() on the training data returns a float within [0, 1]."""
    gd_model, _ = model
    r_squared = gd_model.score(x, y)
    assert isinstance(r_squared, float)
    assert 0 <= r_squared <= 1
46
+
47
def test_predict_returns_expected_values(model):
    """predict() yields one numeric value per training input."""
    gd_model, _ = model
    outputs = gd_model.predict(x)
    assert len(outputs) == len(x)
    assert all(isinstance(value, (int, float)) for value in outputs)
52
+
53
def test_coefficients_close_to_expected(model):
    """Gradient descent lands near intercept 0 and slope 2 on the toy data."""
    gd_model, _ = model
    target_intercept, target_slope = 0.0, 2.0
    assert math.isclose(gd_model.b_0, target_intercept, abs_tol=0.1)
    assert math.isclose(gd_model.b_1, target_slope, abs_tol=0.1)
    assert gd_model.coeff == (gd_model.b_0, gd_model.b_1)
60
+
61
def test_r_square_to_sklearn(model):
    """The gradient-descent R^2 agrees with scikit-learn's to within 0.01."""
    gd_model, _ = model
    # scikit-learn expects a 2-D feature matrix, hence the single column
    design = np.array(x).reshape(-1, 1)
    reference = SklearnLinearRegression()
    reference.fit(design, y)
    reference_r2 = reference.score(design, y)
    own_r2 = gd_model.score(x, y)
    assert math.isclose(own_r2, reference_r2, abs_tol=0.01)
68
+
69
def test_predict_on_test_data(model):
    """Gradient-descent predictions on the test pairs are numeric and score R^2 > 0.9."""
    gd_model, _ = model
    outputs = gd_model.predict(x_test)
    assert len(outputs) == len(x_test)
    assert all(isinstance(value, (int, float)) for value in outputs)
    assert r2_score(y_test, outputs) > 0.9
76
+
77
def test_ols_coefficients_close_to_expected(model):
    """The closed-form fit lands near intercept 0 and slope 2 on the toy data."""
    _, ols_model = model
    target_intercept, target_slope = 0.0, 2.0
    assert math.isclose(ols_model.b_0, target_intercept, abs_tol=0.1)
    assert math.isclose(ols_model.b_1, target_slope, abs_tol=0.1)
    assert ols_model.coeff == (ols_model.b_0, ols_model.b_1)
84
+
85
def test_ols_r_square_to_sklearn(model):
    """The OLS R^2 agrees with scikit-learn's to within 0.01."""
    _, ols_model = model
    # scikit-learn expects a 2-D feature matrix, hence the single column
    design = np.array(x).reshape(-1, 1)
    reference = SklearnLinearRegression()
    reference.fit(design, y)
    reference_r2 = reference.score(design, y)
    own_r2 = ols_model.score(x, y)
    assert math.isclose(own_r2, reference_r2, abs_tol=0.01)
92
+
93
def test_ols_predict_on_test_data(model):
    """OLS predictions on the test pairs are numeric and score R^2 > 0.9."""
    _, ols_model = model
    outputs = ols_model.predict(x_test)
    assert len(outputs) == len(x_test)
    assert all(isinstance(value, (int, float)) for value in outputs)
    assert r2_score(y_test, outputs) > 0.9
100
+
101
+
102
def test_multiple_regression_vs_sklearn():
    """Intercept, weights, predictions and R^2 all match scikit-learn to 1e-5."""
    ours = MultipleLinearRegression()
    ours.fit(x_multi_test, y_multi_test)

    reference = SklearnLinearRegression()
    reference.fit(x_multi_test, y_multi_test)

    # Fitted parameters
    assert math.isclose(ours.intercept, reference.intercept_, abs_tol=1e-5)
    assert np.allclose(ours.weights, reference.coef_, atol=1e-5)

    # Predictions on the training matrix
    our_preds = ours.predict(x_multi_test)
    reference_preds = reference.predict(x_multi_test)
    assert np.allclose(our_preds, reference_preds, atol=1e-5)

    # Goodness of fit
    our_r2 = ours.score(x_multi_test, y_multi_test)
    reference_r2 = reference.score(x_multi_test, y_multi_test)
    assert math.isclose(our_r2, reference_r2, abs_tol=1e-5)
119
+
120
+
121
def test_plot_regression_line():
    """plot_regression_line() shows its figure exactly once."""
    from unittest.mock import patch
    from data_blog.animate import plot_regression_line

    fitted = LinearRegression(learning_rate=0.01, epochs=10)
    fitted.fit(x, y)

    # show() is patched out so no window opens during the test run
    with patch("matplotlib.pyplot.show") as fake_show:
        plot_regression_line(x, y, fitted)
        fake_show.assert_called_once()
131
+
132
+
133
def test_animate_regression_fitting():
    """animate_regression_fitting() returns an animation and shows it exactly once."""
    from unittest.mock import patch
    from data_blog.animate import animate_regression_fitting

    fitted = LinearRegression(learning_rate=0.01, epochs=10)
    fitted.fit(x, y)

    # show() is patched out so no window opens during the test run
    with patch("matplotlib.pyplot.show") as fake_show:
        animation = animate_regression_fitting(x, y, fitted)
        assert animation is not None
        fake_show.assert_called_once()
144
+
145
+
146
def test_make_simple_regression():
    """Generated simple-regression data has the requested size and recoverable parameters."""
    features, targets = make_simple_regression(
        n_samples=50, slope=3.0, intercept=-2.0, noise=0.5, random_seed=123
    )
    assert len(features) == 50
    assert len(targets) == 50

    # The closed-form fit should recover the generating line
    recovered = LinearRegression()
    recovered.fit_ols(features, targets)

    # Slope near 3.0 and intercept near -2.0
    assert math.isclose(recovered.b_1, 3.0, abs_tol=0.2)
    assert math.isclose(recovered.b_0, -2.0, abs_tol=0.2)
158
+
159
+
160
def test_make_multiple_regression():
    """Generated multi-feature data has the requested shape and recoverable parameters."""
    true_weights = [1.5, -2.0, 0.5, 3.0]
    features, targets = make_multiple_regression(
        n_samples=100,
        n_features=4,
        weights=true_weights,
        intercept=10.0,
        noise=0.1,
        random_seed=42,
    )
    assert features.shape == (100, 4)
    assert len(targets) == 100

    recovered = MultipleLinearRegression()
    recovered.fit(features, targets)

    assert math.isclose(recovered.intercept, 10.0, abs_tol=0.1)
    assert np.allclose(recovered.weights, true_weights, atol=0.1)
170
+
171
+
172
def test_load_iris_regression():
    """The iris loader returns 40 pairs that an OLS fit explains with R^2 > 0.90."""
    widths, lengths = load_iris_regression()
    assert len(widths) == 40
    assert len(lengths) == 40

    regressor = LinearRegression()
    regressor.fit_ols(widths, lengths)

    # Petal width and petal length are expected to correlate strongly
    r_squared = regressor.score(widths, lengths)
    assert r_squared > 0.90
182
+
183
+
184
def test_load_housing_regression():
    """The housing loader returns a (20, 3) matrix that a linear fit explains with R^2 > 0.95."""
    features, prices = load_housing_regression()
    assert features.shape == (20, 3)
    assert len(prices) == 20

    regressor = MultipleLinearRegression()
    regressor.fit(features, prices)

    # The toy housing table is expected to be almost perfectly linear
    r_squared = regressor.score(features, prices)
    assert r_squared > 0.95
194
+
195
+