ml_ai 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rubocop.yml +8 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +437 -0
- data/Rakefile +12 -0
- data/benchmarks/car_price_prediction_benchmark.py +38 -0
- data/benchmarks/employee_salary_prediction_benchmark.py +38 -0
- data/benchmarks/energy_consumption_prediction_benchmark.py +37 -0
- data/benchmarks/evaluation_metrics.py +32 -0
- data/benchmarks/house_price_prediction_benchmark.py +38 -0
- data/benchmarks/multiple_linear_regression.py +78 -0
- data/benchmarks/simple_linear_regression_benchmark.py +55 -0
- data/data/advertising_revenue.csv +6 -0
- data/data/car_prices.csv +6 -0
- data/data/employee_salaries.csv +6 -0
- data/data/energy_consumption.csv +11 -0
- data/data/house_prices.csv +6 -0
- data/data/multiple_linear_regression_data.csv +6 -0
- data/data/simple_linear_regression_data.csv +6 -0
- data/examples/car_price_prediction.rb +30 -0
- data/examples/employee_salary_prediction.rb +34 -0
- data/examples/energy_consumption_prediction.rb +39 -0
- data/examples/house_price_prediction.rb +30 -0
- data/lib/ml_ai/dataset.rb +25 -0
- data/lib/ml_ai/multiple_linear_regression.rb +113 -0
- data/lib/ml_ai/simple_linear_regression.rb +87 -0
- data/lib/ml_ai/version.rb +5 -0
- data/lib/ml_ai.rb +11 -0
- data/sig/ml_ai.rbs +4 -0
- metadata +77 -0
@@ -0,0 +1,38 @@
|
|
1
|
+
import numpy as np
|
2
|
+
import pandas as pd
|
3
|
+
from sklearn.linear_model import Ridge
|
4
|
+
from sklearn.metrics import mean_squared_error, r2_score
|
5
|
+
|
6
|
+
# Read the employee salary table and split it into a feature matrix and a
# target vector.
feature_columns = ['Experience', 'Education', 'Skills']
frame = pd.read_csv("data/employee_salaries.csv")
features = frame[feature_columns].values
target = frame['Salary'].values

# Ridge regression with alpha=0.1; fit() returns the fitted estimator.
model = Ridge(alpha=0.1).fit(features, target)

# In-sample predictions, rounded to two decimals before they are scored.
predictions = np.round(model.predict(features), 2)

# Both metrics are computed against the rounded predictions.
mse = mean_squared_error(target, predictions)
r2 = r2_score(target, predictions)

# Report the fitted parameters and the in-sample fit quality.
print(f"Coefficients: {np.round(model.coef_, 2)}")
print(f"Intercept: {round(model.intercept_, 2)}")
print(f"Predictions: {predictions}")
print(f"MSE: {round(mse, 2)}")
print(f"R-squared: {round(r2, 2)}")

# Score one unseen employee (Experience=6, Education=3, Skills=6), again
# rounded to two decimals.
new_prediction = np.round(model.predict(np.array([[6, 3, 6]])), 2)

print(f"Predicted Salary for the new employee: ${new_prediction[0]}")
|
@@ -0,0 +1,37 @@
|
|
1
|
+
import pandas as pd
|
2
|
+
from sklearn.linear_model import LinearRegression
|
3
|
+
from sklearn.metrics import mean_squared_error, r2_score
|
4
|
+
import numpy as np
|
5
|
+
|
6
|
+
# Read the energy consumption table; features stay a DataFrame so the model
# is fitted with column names.
feature_columns = ['Size', 'Occupants', 'Computers']
df = pd.read_csv('data/energy_consumption.csv')
X = df[feature_columns]
y = df['EnergyConsumption']

# Plain least-squares fit; fit() returns the fitted estimator.
model = LinearRegression().fit(X, y)

# In-sample predictions, rounded to two decimals before they are scored.
predictions = np.round(model.predict(X), 2)

# Both metrics are computed against the rounded predictions.
mse = mean_squared_error(y, predictions)
r2 = r2_score(y, predictions)

# Report the fitted parameters and the in-sample fit quality.
print(f"Coefficients: {np.round(model.coef_, 2)}")
print(f"Intercept: {round(model.intercept_, 2)}")
print(f"Predictions: {predictions}")
print(f"MSE: {round(mse, 2)}")
print(f"R-squared: {round(r2, 2)}")

# Score one unseen building (Size=3500, Occupants=60, Computers=70), passed
# as a DataFrame with the same column names used for fitting.
new_building = pd.DataFrame([[3500, 60, 70]], columns=['Size', 'Occupants', 'Computers'])
new_prediction = round(model.predict(new_building)[0], 2)

print(f"Predicted energy consumption for the new building: {new_prediction} kWh")
|
@@ -0,0 +1,32 @@
|
|
1
|
+
from sklearn.linear_model import LinearRegression
|
2
|
+
from sklearn.metrics import mean_squared_error, r2_score
|
3
|
+
import numpy as np
|
4
|
+
|
5
|
+
# Five (x, y) samples; x is a single-feature column, as fit() expects 2-D input.
x_values = np.array([[1], [2], [3], [4], [5]])
y_values = np.array([2, 4, 5, 4, 5])

# Plain least-squares fit; fit() returns the fitted estimator.
model = LinearRegression().fit(x_values, y_values)

# In-sample predictions, rounded to two decimals before they are scored.
predictions = np.round(model.predict(x_values), 2)

# Metrics are computed against the rounded predictions and then rounded to
# two decimals themselves.
mse = round(mean_squared_error(y_values, predictions), 2)
r2 = round(r2_score(y_values, predictions), 2)

# Report the fitted line and the in-sample fit quality.
print(f"Slope: {round(model.coef_[0], 2)}")
print(f"Intercept: {round(model.intercept_, 2)}")
print(f"Predictions: {predictions}")
print(f"MSE: {mse}")
print(f"R-squared: {r2}")
|
@@ -0,0 +1,38 @@
|
|
1
|
+
import numpy as np
|
2
|
+
import pandas as pd
|
3
|
+
from sklearn.linear_model import LinearRegression
|
4
|
+
from sklearn.metrics import mean_squared_error, r2_score
|
5
|
+
|
6
|
+
# Read the house price table: one feature column (Size) and the target (Price).
frame = pd.read_csv("data/house_prices.csv")
sizes = frame[['Size']].values
prices = frame['Price'].values

# Plain least-squares fit; fit() returns the fitted estimator.
model = LinearRegression().fit(sizes, prices)

# In-sample predictions, rounded to two decimals before they are scored.
predictions = np.round(model.predict(sizes), 2)

# Both metrics are computed against the rounded predictions.
mse = mean_squared_error(prices, predictions)
r2 = r2_score(prices, predictions)

# Report the fitted parameters and the in-sample fit quality.
print(f"Coefficients: {np.round(model.coef_, 2)}")
print(f"Intercept: {round(model.intercept_, 2)}")
print(f"Predictions: {predictions}")
print(f"MSE: {round(mse, 2)}")
print(f"R-squared: {round(r2, 2)}")

# Score one unseen house with Size=1600, rounded to two decimals.
new_prediction = np.round(model.predict(np.array([[1600]])), 2)

print(f"Predicted Price for a 1600 sq ft house: {new_prediction[0]} thousand dollars")
|
@@ -0,0 +1,78 @@
|
|
1
|
+
import numpy as np
|
2
|
+
import pandas as pd
|
3
|
+
from sklearn.linear_model import LinearRegression
|
4
|
+
from sklearn.metrics import mean_squared_error, r2_score
|
5
|
+
|
6
|
+
# ---- Part 1: model trained on the CSV data ----

# Read the table and split it into a two-feature matrix and a target vector.
frame = pd.read_csv("data/multiple_linear_regression_data.csv")
x_csv = frame[['Feature1', 'Feature2']].values
y_csv = frame['Target'].values

# Plain least-squares fit; fit() returns the fitted estimator.
model_csv = LinearRegression().fit(x_csv, y_csv)

# In-sample predictions, rounded to two decimals before they are scored.
predictions_csv = np.round(model_csv.predict(x_csv), 2)
mse_csv = mean_squared_error(y_csv, predictions_csv)
r2_csv = r2_score(y_csv, predictions_csv)

print(f"CSV Coefficients: {np.round(model_csv.coef_, 2)}")
print(f"CSV Intercept: {round(model_csv.intercept_, 2)}")
print(f"CSV Predictions: {predictions_csv}")
print(f"CSV MSE: {round(mse_csv, 2)}")
print(f"CSV R-squared: {round(r2_csv, 2)}")

# Two unseen samples; the same array is scored again by the second model below.
new_data = np.array([[6, 7], [7, 8]])
new_predictions_csv = np.round(model_csv.predict(new_data), 2)

print(f"New Predictions from CSV: {new_predictions_csv}")

# ---- Part 2: the same workflow on an inline dataset, for comparison ----

x_values = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
y_values = np.array([5, 7, 9, 11, 13])

model = LinearRegression().fit(x_values, y_values)

# In-sample predictions, rounded to two decimals before they are scored.
predictions = np.round(model.predict(x_values), 2)
mse = mean_squared_error(y_values, predictions)
r2 = r2_score(y_values, predictions)

print(f"Coefficients: {np.round(model.coef_, 2)}")
print(f"Intercept: {round(model.intercept_, 2)}")
print(f"Predictions: {predictions}")
print(f"MSE: {round(mse, 2)}")
print(f"R-squared: {round(r2, 2)}")

# Score the unseen samples defined in part 1 with the inline-data model.
new_predictions = np.round(model.predict(new_data), 2)

print(f"New Predictions: {new_predictions}")
|
@@ -0,0 +1,55 @@
|
|
1
|
+
import numpy as np
|
2
|
+
import pandas as pd
|
3
|
+
from sklearn.linear_model import LinearRegression
|
4
|
+
|
5
|
+
# ---- CSV dataset ----

# Read the table: one feature column and the target.
frame = pd.read_csv("data/simple_linear_regression_data.csv")
x_csv = frame[['Feature']].values
y_csv = frame['Target'].values

model_csv = LinearRegression().fit(x_csv, y_csv)

# In-sample predictions (not rounded in this script).
predictions_csv = model_csv.predict(x_csv)
print(f"CSV Dataset Predictions: {predictions_csv}")  # NOTE(review): expected values depend on the CSV contents

# Score a single unseen feature value of 6.
new_data_prediction = model_csv.predict([[6]])
print(f"New Data Prediction from CSV: {new_data_prediction}")  # NOTE(review): expected value depends on the CSV contents

# ---- Example 1: basic usage, y = 2x ----
x_basic = np.array([[1], [2], [3]])
y_basic = np.array([2, 4, 6])
model_basic = LinearRegression().fit(x_basic, y_basic)

prediction_basic = model_basic.predict([[4]])
print(f"Basic Example Prediction: {prediction_basic}")  # 2 * 4 = 8

# ---- Example 2: larger dataset, y = 3x + 5 for x in 1..100 ----
x_large = np.arange(1, 101).reshape(-1, 1)
y_large = 3 * np.arange(1, 101) + 5
model_large = LinearRegression().fit(x_large, y_large)

prediction_large = model_large.predict([[150]])
print(f"Larger Dataset Prediction: {prediction_large}")  # 3 * 150 + 5 = 455

# ---- Example 3: negative and positive inputs, y = 2x - 3 ----
mixed_inputs = [-10, -5, 0, 5, 10]
x_mixed = np.array(mixed_inputs).reshape(-1, 1)
y_mixed = 2 * np.array(mixed_inputs) - 3
model_mixed = LinearRegression().fit(x_mixed, y_mixed)

prediction_mixed = model_mixed.predict([[15]])
print(f"Negative and Positive Values Prediction: {prediction_mixed}")  # 2 * 15 - 3 = 27
|
data/data/car_prices.csv
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Example: Predicting Car Prices Based on Multiple Features
|
4
|
+
|
5
|
+
require_relative '../lib/ml_ai'
|
6
|
+
|
7
|
+
# Load the car price table and train a multiple linear regression on it.
feature_names = %w[Age Mileage Horsepower]
dataset = MLAI::Dataset.new('data/car_prices.csv')
model = MLAI::MultipleLinearRegression.new
model.fit(dataset: dataset, feature_columns: feature_names, target_column: 'Price')

# Price one unseen car: 4 years old, 55,000 miles, 140 horsepower.
new_car_features = [[4, 55000, 140]]
predicted_price = model.predict(new_car_features).first.round(2)
puts "Predicted Price for the car: $#{predicted_price}"

# Score the fit on the training rows. The positional slices below rely on the
# CSV column order being Age, Mileage, Horsepower, Price.
rows = dataset.data
original_features = rows.map { |row| row.first(3) }
original_prices = rows.map { |row| row[3] }

# Predictions are rounded to two decimals before the metrics are computed.
predictions = model.predict(original_features).map { |value| value.round(2) }

mse = model.mean_squared_error(original_prices, predictions).round(2)
r2 = model.r_squared(original_prices, predictions).round(2)

puts "Mean Squared Error: #{mse}"
puts "R-squared: #{r2}"
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Example: Predicting Employee Salaries Based on Experience, Education, and Skills
|
4
|
+
|
5
|
+
require_relative '../lib/ml_ai'
|
6
|
+
|
7
|
+
# Load the salary table and train a ridge-regularized model on it
# (regularization strength 0.00083).
feature_names = %w[Experience Education Skills]
dataset = MLAI::Dataset.new('data/employee_salaries.csv')
model = MLAI::MultipleLinearRegression.new(regularization: 0.00083)
model.fit(dataset: dataset, feature_columns: feature_names, target_column: 'Salary')

# Show the fitted parameters so they can be compared with other tools.
puts "Coefficients: #{model.coefficients.map { |coef| coef.round(2) }}"
puts "Intercept: #{model.intercept.round(2)}"

# Score one unseen employee: Experience 6, Education 3, Skills 6.
new_employee_features = [[6, 3, 6]]
predicted_salary = model.predict(new_employee_features).first.round(2)
puts "Predicted Salary for the new employee: $#{predicted_salary}"

# Score the fit on the training rows. The positional slices below rely on the
# CSV column order being Experience, Education, Skills, Salary.
rows = dataset.data
original_features = rows.map { |row| row.first(3) }
original_salaries = rows.map { |row| row[3] }

# Predictions are rounded to two decimals before the metrics are computed.
predictions = model.predict(original_features).map { |value| value.round(2) }

mse = model.mean_squared_error(original_salaries, predictions).round(2)
r2 = model.r_squared(original_salaries, predictions).round(2)

puts "Mean Squared Error: #{mse}"
puts "R-squared: #{r2}"
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../lib/ml_ai'
|
4
|
+
|
5
|
+
# Load the energy consumption table.
dataset = MLAI::Dataset.new('data/energy_consumption.csv')
feature_names = %w[Size Occupants Computers]

# A tiny regularization term keeps the normal-equation matrix invertible.
model = MLAI::MultipleLinearRegression.new(regularization: 1e-9)

# 5-fold cross-validation; the averaged MSE is computed but not printed here.
average_mse = model.cross_validate(dataset: dataset, feature_columns: feature_names,
                                   target_column: "EnergyConsumption", k: 5)

# Refit on every row, since cross-validation leaves the model trained on a
# partial split.
model.fit(dataset: dataset, feature_columns: feature_names, target_column: "EnergyConsumption")

# Fitted parameters, rounded for display.
coefficients = model.coefficients.map { |coef| coef.round(2) }
intercept = model.intercept.round(2)

# In-sample evaluation. The positional slices rely on the CSV column order
# being Size, Occupants, Computers, EnergyConsumption.
rows = dataset.data
actual = rows.map { |row| row[3] }
predictions = model.predict(rows.map { |row| row.first(3) }).map { |pred| pred.round(2) }

# Metrics are computed against the rounded predictions.
mse = model.mean_squared_error(actual, predictions)
r_squared = model.r_squared(actual, predictions)

puts "Coefficients: #{coefficients.inspect}"
puts "Intercept: #{intercept}"
puts "Predictions: #{predictions.inspect}"
puts "MSE: #{mse.round(2)}"
puts "R-squared: #{r_squared.round(2)}"

# Score one unseen building: 3500 sq ft, 60 occupants, 70 computers.
new_building = [[3500, 60, 70]]
predicted_energy_consumption = model.predict(new_building).first
puts "Predicted energy consumption for the new building: #{predicted_energy_consumption.round(2)} kWh"
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Example: Predicting House Prices Based on Size
|
4
|
+
|
5
|
+
require_relative '../lib/ml_ai'
|
6
|
+
|
7
|
+
# Load the house price table and train a single-feature regression on it.
dataset = MLAI::Dataset.new('data/house_prices.csv')
model = MLAI::SimpleLinearRegression.new
model.fit(dataset: dataset, feature_column: 'Size', target_column: 'Price')

# Price one unseen house of 1600 square feet.
new_house_size = [1600]
predicted_price = model.predict(new_house_size).first.round(2)
puts "Predicted Price for a 1600 sq ft house: #{predicted_price} thousand dollars"

# Score the fit on the training rows. The positional lookups below rely on
# the CSV column order being Size, Price.
rows = dataset.data
original_sizes = rows.map { |row| row[0] }
original_prices = rows.map { |row| row[1] }

# Predictions are rounded to two decimals before the metrics are computed.
predictions = model.predict(original_sizes).map { |value| value.round(2) }

mse = model.mean_squared_error(original_prices, predictions).round(2)
r2 = model.r_squared(original_prices, predictions).round(2)

puts "Mean Squared Error: #{mse}"
puts "R-squared: #{r2}"
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'csv'
|
4
|
+
|
5
|
+
module MLAI
  # In-memory numeric table read from a CSV file with a header row.
  #
  # headers - the column names from the first CSV line.
  # data    - one array per CSV row, every field converted with #to_f.
  class Dataset
    attr_reader :data, :headers

    # filename - path of the CSV file; it is read immediately.
    def initialize(filename)
      @filename = filename
      @data = []
      @headers = []

      load_csv
    end

    private

    # Parses the file and fills @headers and @data.
    def load_csv
      table = CSV.read(@filename, headers: true)
      @headers = table.headers
      # #to_f never raises on strings or nil: non-numeric text and empty
      # cells both become 0.0.
      @data = table.map do |row|
        row.fields.map { |field| field.to_f }
      end
    end
  end
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'matrix'
|
4
|
+
require_relative 'dataset'
|
5
|
+
|
6
|
+
module MLAI
  # Linear regression over several features, solved in closed form with the
  # normal equation. A non-zero +regularization+ turns the solve into ridge
  # regression.
  class MultipleLinearRegression
    attr_reader :coefficients, :intercept, :regularization

    # alpha          - stored for backward compatibility; no computation in
    #                  this class reads it.
    # regularization - value added to every diagonal entry of X^T * X before
    #                  inversion. The intercept's entry is included, so the
    #                  intercept is penalized together with the coefficients.
    def initialize(alpha = 1e-8, regularization: 0.0)
      @coefficients = nil
      @intercept = nil
      @alpha = alpha
      @regularization = regularization
    end

    # Fits the model either from explicit arrays or from a dataset.
    #
    # x_values        - array of feature rows (each row an array of numbers).
    # y_values        - array of targets, one per feature row.
    # dataset         - object responding to #headers and #data; when given,
    #                   it replaces x_values / y_values.
    # feature_columns - header names of the feature columns (with dataset).
    # target_column   - header name of the target column (with dataset).
    #
    # Returns the fitted coefficient array.
    # Raises ArgumentError when inputs are missing or empty, or when a column
    # name is not in the dataset headers.
    # Raises RuntimeError when the array lengths differ or the matrix cannot
    # be inverted.
    def fit(x_values: nil, y_values: nil, dataset: nil, feature_columns: nil, target_column: nil)
      x_values, y_values = extract_columns(dataset, feature_columns, target_column) if dataset
      validate_samples!(x_values, y_values)

      # Prepend a column of ones so the first solved parameter is the intercept.
      x_matrix = Matrix[*x_values.map { |x| [1] + x }]
      y_vector = Vector.elements(y_values)

      # Normal equation with regularization: (X^T * X + lambda * I)^-1 * X^T * Y
      x_transpose = x_matrix.transpose
      regularization_matrix = Matrix.build(x_matrix.column_count) { |i, j| i == j ? @regularization : 0 }

      xtx = x_transpose * x_matrix + regularization_matrix

      begin
        theta = xtx.inverse * x_transpose * y_vector
      rescue ExceptionForMatrix::ErrNotRegular
        raise "Matrix is singular or nearly singular, consider increasing regularization"
      end

      @intercept = theta[0]
      @coefficients = theta.to_a[1..-1]
    end

    # Predicts one value per feature row.
    #
    # x_values - array of feature rows, each at least as long as the fitted
    #            coefficient array.
    #
    # Raises RuntimeError when the model has not been fitted.
    def predict(x_values)
      raise "Model has not been fitted yet" if @coefficients.nil? || @intercept.nil?

      x_values.map do |x|
        @coefficients.each_with_index.reduce(@intercept) do |sum, (coef, i)|
          sum + coef * x[i]
        end
      end
    end

    # Mean of the squared differences between y_true and y_pred.
    #
    # Raises RuntimeError when the arrays differ in length.
    def mean_squared_error(y_true, y_pred)
      raise "Input arrays must have the same length" unless y_true.length == y_pred.length

      n = y_true.length
      sum_squared_errors = y_true.each_with_index.map { |y, i| (y - y_pred[i]) ** 2 }.sum
      sum_squared_errors / n.to_f
    end

    # Coefficient of determination: 1 - SS_residual / SS_total.
    #
    # Raises RuntimeError when the arrays differ in length.
    def r_squared(y_true, y_pred)
      raise "Input arrays must have the same length" unless y_true.length == y_pred.length

      mean_y = y_true.sum / y_true.length.to_f
      ss_total = y_true.map { |y| (y - mean_y) ** 2 }.sum
      ss_residual = y_true.each_with_index.map { |y, i| (y - y_pred[i]) ** 2 }.sum

      1 - (ss_residual / ss_total.to_f)
    end

    # k-fold cross-validation over contiguous, unshuffled folds.
    #
    # Accepts the same data arguments as #fit, plus:
    # k - number of folds; must be an Integer between 2 and the sample count.
    #
    # Every sample is used as test data exactly once: when the sample count is
    # not divisible by k, the first (count % k) folds hold one extra sample.
    #
    # Side effect: the model is refitted for every fold and is left fitted on
    # the last training split, so call #fit afterwards to train on all data.
    #
    # Returns the mean of the per-fold mean squared errors.
    # Raises ArgumentError for missing/empty inputs, unknown columns or an
    # invalid k; RuntimeError when the array lengths differ.
    def cross_validate(x_values: nil, y_values: nil, dataset: nil, feature_columns: nil, target_column: nil, k: 5)
      x_values, y_values = extract_columns(dataset, feature_columns, target_column) if dataset
      validate_samples!(x_values, y_values)

      sample_count = x_values.length
      unless k.is_a?(Integer) && k >= 2 && k <= sample_count
        raise ArgumentError,
              "k must be an Integer between 2 and the number of samples (#{sample_count}), got #{k.inspect}"
      end

      # Spread the remainder over the first folds so no sample is left out.
      base_size, extra = sample_count.divmod(k)
      errors = []
      test_start = 0

      k.times do |i|
        test_end = test_start + base_size + (i < extra ? 1 : 0)

        x_train = x_values[0...test_start] + x_values[test_end..-1]
        y_train = y_values[0...test_start] + y_values[test_end..-1]

        x_test = x_values[test_start...test_end]
        y_test = y_values[test_start...test_end]

        fit(x_values: x_train, y_values: y_train)
        errors << mean_squared_error(y_test, predict(x_test))

        test_start = test_end
      end

      errors.sum / errors.size.to_f
    end

    private

    # Pulls the feature rows and target values out of a dataset by header name.
    def extract_columns(dataset, feature_columns, target_column)
      if feature_columns.nil? || target_column.nil?
        raise ArgumentError, "feature_columns and target_column are required when a dataset is given"
      end

      headers = dataset.headers
      feature_indices = feature_columns.map { |col| column_index(headers, col) }
      target_index = column_index(headers, target_column)

      x_values = dataset.data.map { |row| row.values_at(*feature_indices) }
      y_values = dataset.data.map { |row| row[target_index] }
      [x_values, y_values]
    end

    # Resolves a header name to its position, failing loudly on unknown names.
    def column_index(headers, name)
      headers.index(name) || raise(ArgumentError, "Unknown column: #{name.inspect}")
    end

    # Rejects missing, empty or mismatched training arrays.
    def validate_samples!(x_values, y_values)
      if x_values.nil? || y_values.nil?
        raise ArgumentError, "Provide x_values and y_values, or a dataset with feature_columns and target_column"
      end
      raise "Input arrays must have the same length" unless x_values.length == y_values.length
      raise ArgumentError, "Training data must not be empty" if x_values.empty?
    end
  end
end
|