ml_ai 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.linear_model import Ridge
4
+ from sklearn.metrics import mean_squared_error, r2_score
5
+
6
# Training table: three numeric feature columns plus the salary target.
csv_file_path = "data/employee_salaries.csv"
data = pd.read_csv(csv_file_path)
x = data[['Experience', 'Education', 'Skills']].values
y = data['Salary'].values

# Ridge regression (alpha=0.1) fitted on every row of the table.
model = Ridge(alpha=0.1).fit(x, y)

# In-sample predictions are rounded to two decimals *before* scoring, so the
# metrics below describe exactly the values that get printed.
predictions = np.round(model.predict(x), 2)
mse = mean_squared_error(y, predictions)
r2 = r2_score(y, predictions)

# Report the fitted parameters and the in-sample metrics.
print(
    f"Coefficients: {np.round(model.coef_, 2)}",
    f"Intercept: {round(model.intercept_, 2)}",
    f"Predictions: {predictions}",
    f"MSE: {round(mse, 2)}",
    f"R-squared: {round(r2, 2)}",
    sep="\n",
)

# Score one unseen employee (feature order: Experience, Education, Skills).
new_data = np.array([[6, 3, 6]])
new_prediction = np.round(model.predict(new_data), 2)

print(f"Predicted Salary for the new employee: ${new_prediction[0]}")
@@ -0,0 +1,37 @@
1
+ import pandas as pd
2
+ from sklearn.linear_model import LinearRegression
3
+ from sklearn.metrics import mean_squared_error, r2_score
4
+ import numpy as np
5
+
6
# Load the dataset; the feature frame keeps its column names so the model
# can be queried later with an identically labelled DataFrame.
df = pd.read_csv('data/energy_consumption.csv')
X = df[['Size', 'Occupants', 'Computers']]
y = df['EnergyConsumption']

# Ordinary least squares on the full table.
model = LinearRegression()
model.fit(X, y)

# In-sample predictions, rounded to two decimals before they are scored.
predictions = np.round(model.predict(X), 2)
mse = mean_squared_error(y, predictions)
r2 = r2_score(y, predictions)

# Report the fitted parameters and the in-sample metrics.
for line in (
    f"Coefficients: {np.round(model.coef_, 2)}",
    f"Intercept: {round(model.intercept_, 2)}",
    f"Predictions: {predictions}",
    f"MSE: {round(mse, 2)}",
    f"R-squared: {round(r2, 2)}",
):
    print(line)

# Predict the energy consumption for a new building, using the same column
# labels the model was trained with.
new_building = pd.DataFrame(
    [[3500, 60, 70]],
    columns=['Size', 'Occupants', 'Computers'],
)
new_prediction = round(model.predict(new_building)[0], 2)

print(f"Predicted energy consumption for the new building: {new_prediction} kWh")
@@ -0,0 +1,32 @@
1
+ from sklearn.linear_model import LinearRegression
2
+ from sklearn.metrics import mean_squared_error, r2_score
3
+ import numpy as np
4
+
5
# Five-point toy dataset: one feature column (shape 5x1) and its targets.
x_values = np.array([[1], [2], [3], [4], [5]])
y_values = np.array([2, 4, 5, 4, 5])

# Ordinary least squares fit.
model = LinearRegression()
model.fit(x_values, y_values)

# In-sample predictions, limited to two decimal digits before scoring.
predictions = np.round(model.predict(x_values), 2)

# Evaluation metrics, themselves limited to two decimal digits.
mse = round(mean_squared_error(y_values, predictions), 2)
r2 = round(r2_score(y_values, predictions), 2)

# Print results
print(
    f"Slope: {round(model.coef_[0], 2)}",
    f"Intercept: {round(model.intercept_, 2)}",
    f"Predictions: {predictions}",
    f"MSE: {mse}",
    f"R-squared: {r2}",
    sep="\n",
)
@@ -0,0 +1,38 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.linear_model import LinearRegression
4
+ from sklearn.metrics import mean_squared_error, r2_score
5
+
6
# Training table: house size as the single feature, price as the target.
csv_file_path = "data/house_prices.csv"
data = pd.read_csv(csv_file_path)
x = data[['Size']].values  # double brackets keep the feature matrix 2-D
y = data['Price'].values

# Ordinary least squares on the full table.
model = LinearRegression().fit(x, y)

# In-sample predictions, rounded to two decimals before they are scored.
predictions = np.round(model.predict(x), 2)
mse = mean_squared_error(y, predictions)
r2 = r2_score(y, predictions)

# Report the fitted parameters and the in-sample metrics.
for line in (
    f"Coefficients: {np.round(model.coef_, 2)}",
    f"Intercept: {round(model.intercept_, 2)}",
    f"Predictions: {predictions}",
    f"MSE: {round(mse, 2)}",
    f"R-squared: {round(r2, 2)}",
):
    print(line)

# Predict the price for one unseen house size.
new_data = np.array([[1600]])
new_prediction = np.round(model.predict(new_data), 2)

print(f"Predicted Price for a 1600 sq ft house: {new_prediction[0]} thousand dollars")
@@ -0,0 +1,78 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.linear_model import LinearRegression
4
+ from sklearn.metrics import mean_squared_error, r2_score
5
+
6
# ---------------------------------------------------------------------------
# Part 1: model trained on the CSV file
# ---------------------------------------------------------------------------
csv_file_path = "data/multiple_linear_regression_data.csv"
data = pd.read_csv(csv_file_path)
x_csv = data[['Feature1', 'Feature2']].values
y_csv = data['Target'].values

model_csv = LinearRegression().fit(x_csv, y_csv)

# In-sample predictions, rounded to two decimals before they are scored.
predictions_csv = np.round(model_csv.predict(x_csv), 2)
mse_csv = mean_squared_error(y_csv, predictions_csv)
r2_csv = r2_score(y_csv, predictions_csv)

print(
    f"CSV Coefficients: {np.round(model_csv.coef_, 2)}",
    f"CSV Intercept: {round(model_csv.intercept_, 2)}",
    f"CSV Predictions: {predictions_csv}",
    f"CSV MSE: {round(mse_csv, 2)}",
    f"CSV R-squared: {round(r2_csv, 2)}",
    sep="\n",
)

# Two unseen rows; the same array is scored again by the second model below.
new_data = np.array([[6, 7], [7, 8]])
new_predictions_csv = np.round(model_csv.predict(new_data), 2)

print(f"New Predictions from CSV: {new_predictions_csv}")

# ---------------------------------------------------------------------------
# Part 2: original in-memory example, kept for comparison
# ---------------------------------------------------------------------------
x_values = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
y_values = np.array([5, 7, 9, 11, 13])

model = LinearRegression().fit(x_values, y_values)

# In-sample predictions, rounded to two decimals before they are scored.
predictions = np.round(model.predict(x_values), 2)
mse = mean_squared_error(y_values, predictions)
r2 = r2_score(y_values, predictions)

print(
    f"Coefficients: {np.round(model.coef_, 2)}",
    f"Intercept: {round(model.intercept_, 2)}",
    f"Predictions: {predictions}",
    f"MSE: {round(mse, 2)}",
    f"R-squared: {round(r2, 2)}",
    sep="\n",
)

# Score the same unseen rows with the in-memory model.
new_predictions = np.round(model.predict(new_data), 2)

print(f"New Predictions: {new_predictions}")
@@ -0,0 +1,55 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.linear_model import LinearRegression
4
+
5
# --- CSV-backed example -----------------------------------------------------
csv_file_path = "data/simple_linear_regression_data.csv"
data = pd.read_csv(csv_file_path)
x_csv = data[['Feature']].values  # double brackets keep the matrix 2-D
y_csv = data['Target'].values

model_csv = LinearRegression()
model_csv.fit(x_csv, y_csv)

# In-sample predictions; the shipped CSV is exactly linear (Target = 2*Feature + 1),
# so these should reproduce the Target column up to floating-point error.
predictions_csv = model_csv.predict(x_csv)
print(f"CSV Dataset Predictions: {predictions_csv}")

# A single unseen point (expected to be about 13 for the shipped CSV).
new_data_prediction = model_csv.predict([[6]])
print(f"New Data Prediction from CSV: {new_data_prediction}")

# --- Example 1: Basic Usage (y = 2x) -----------------------------------------
x_basic = np.array([[1], [2], [3]])
y_basic = np.array([2, 4, 6])

model_basic = LinearRegression()
model_basic.fit(x_basic, y_basic)

prediction_basic = model_basic.predict([[4]])  # expected to be about 8
print(f"Basic Example Prediction: {prediction_basic}")

# --- Example 2: Larger Dataset (y = 3x + 5 for x in 1..100) ------------------
x_large = np.arange(1, 101).reshape(-1, 1)
y_large = 3 * x_large.ravel() + 5

model_large = LinearRegression()
model_large.fit(x_large, y_large)

prediction_large = model_large.predict([[150]])  # expected to be about 455
print(f"Larger Dataset Prediction: {prediction_large}")

# --- Example 3: Handling Negative and Positive Values (y = 2x - 3) -----------
x_mixed = np.array([[-10], [-5], [0], [5], [10]])
y_mixed = 2 * x_mixed.ravel() - 3

model_mixed = LinearRegression()
model_mixed.fit(x_mixed, y_mixed)

prediction_mixed = model_mixed.predict([[15]])  # expected to be about 27
print(f"Negative and Positive Values Prediction: {prediction_mixed}")
@@ -0,0 +1,6 @@
1
+ TV,Radio,Newspaper,Revenue
2
+ 150,23,50,12.5
3
+ 120,10,45,10.2
4
+ 200,30,60,14.8
5
+ 180,28,52,13.6
6
+ 240,40,70,16.3
@@ -0,0 +1,6 @@
1
+ Age,Mileage,Horsepower,Price
2
+ 3,50000,150,20000
3
+ 5,80000,120,15000
4
+ 2,30000,200,25000
5
+ 7,100000,100,12000
6
+ 4,60000,160,18000
@@ -0,0 +1,6 @@
1
+ Experience,Education,Skills,Salary
2
+ 5,3,5,70000
3
+ 7,2,7,85000
4
+ 10,1,8,90000
5
+ 3,4,4,60000
6
+ 8,2,6,80000
@@ -0,0 +1,11 @@
1
+ Size,Occupants,Computers,EnergyConsumption
2
+ 1000,10,20,3000
3
+ 1200,15,25,3500
4
+ 1500,20,30,4000
5
+ 1700,25,35,4500
6
+ 2000,30,40,5000
7
+ 2200,35,45,5500
8
+ 2500,40,50,6000
9
+ 2700,45,55,6500
10
+ 3000,50,60,7000
11
+ 3200,55,65,7500
@@ -0,0 +1,6 @@
1
+ Size,Price
2
+ 800,150
3
+ 1000,200
4
+ 1200,240
5
+ 1500,300
6
+ 1800,360
@@ -0,0 +1,6 @@
1
+ Feature1,Feature2,Target
2
+ 1,2,5
3
+ 2,3,7
4
+ 3,4,9
5
+ 4,5,11
6
+ 5,6,13
@@ -0,0 +1,6 @@
1
+ Feature,Target
2
+ 1,3
3
+ 2,5
4
+ 3,7
5
+ 4,9
6
+ 5,11
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Example: Predicting Car Prices Based on Multiple Features
4
+
5
+ require_relative '../lib/ml_ai'
6
+
7
# Column names are declared once and reused for both fitting and evaluation.
feature_columns = %w[Age Mileage Horsepower]
target_column = 'Price'

# Initialize the model (no regularization by default)
model = MLAI::MultipleLinearRegression.new

# Create a Dataset from the CSV file
dataset = MLAI::Dataset.new('data/car_prices.csv')

# Fit the model using the Dataset
model.fit(dataset: dataset, feature_columns: feature_columns, target_column: target_column)

# Predict the price of a new car
new_car_features = [[4, 55000, 140]] # Age: 4 years, Mileage: 55,000 miles, Horsepower: 140
predicted_price = model.predict(new_car_features).first.round(2)
puts "Predicted Price for the car: $#{predicted_price}"

# Evaluate the model using the original dataset.
# Columns are located through the CSV headers instead of fixed positions, so
# the evaluation always reads the same columns the model was fitted on.
feature_indices = feature_columns.map { |name| dataset.headers.index(name) }
target_index = dataset.headers.index(target_column)

original_features = dataset.data.map { |row| row.values_at(*feature_indices) }
original_prices = dataset.data.map { |row| row[target_index] }
predictions = model.predict(original_features).map { |p| p.round(2) }

mse = model.mean_squared_error(original_prices, predictions).round(2)
r2 = model.r_squared(original_prices, predictions).round(2)

puts "Mean Squared Error: #{mse}"
puts "R-squared: #{r2}"
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Example: Predicting Employee Salaries Based on Experience, Education, and Skills
4
+
5
+ require_relative '../lib/ml_ai'
6
+
7
# Column names are declared once and reused for both fitting and evaluation.
feature_columns = %w[Experience Education Skills]
target_column = 'Salary'

# Initialize the model with a small regularization parameter
model = MLAI::MultipleLinearRegression.new(regularization: 0.00083)

# Create a Dataset from the CSV file
dataset = MLAI::Dataset.new('data/employee_salaries.csv')

# Fit the model using the Dataset
model.fit(dataset: dataset, feature_columns: feature_columns, target_column: target_column)

# Print coefficients and intercept for comparison
puts "Coefficients: #{model.coefficients.map { |coef| coef.round(2) }}"
puts "Intercept: #{model.intercept.round(2)}"

# Predict the salary of a new employee
new_employee_features = [[6, 3, 6]] # Experience: 6 years, Education: Master's, Skills: 6
predicted_salary = model.predict(new_employee_features).first.round(2)
puts "Predicted Salary for the new employee: $#{predicted_salary}"

# Evaluate the model using the original dataset.
# Columns are located through the CSV headers instead of fixed positions, so
# the evaluation always reads the same columns the model was fitted on.
feature_indices = feature_columns.map { |name| dataset.headers.index(name) }
target_index = dataset.headers.index(target_column)

original_features = dataset.data.map { |row| row.values_at(*feature_indices) }
original_salaries = dataset.data.map { |row| row[target_index] }
predictions = model.predict(original_features).map { |p| p.round(2) }

mse = model.mean_squared_error(original_salaries, predictions).round(2)
r2 = model.r_squared(original_salaries, predictions).round(2)

puts "Mean Squared Error: #{mse}"
puts "R-squared: #{r2}"
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../lib/ml_ai'
4
+
5
# Load the dataset
dataset = MLAI::Dataset.new('data/energy_consumption.csv')

# Column names are declared once and reused for cross-validation, fitting and
# evaluation, so all three steps are guaranteed to read the same columns.
feature_columns = %w[Size Occupants Computers]
target_column = 'EnergyConsumption'

# Initialize the MultipleLinearRegression model with slight regularization to avoid singular matrix
model = MLAI::MultipleLinearRegression.new(regularization: 1e-9)

# Perform 5-fold cross-validation
# NOTE(review): the cross-validation score is computed but never displayed below.
average_mse = model.cross_validate(dataset: dataset, feature_columns: feature_columns, target_column: target_column, k: 5)

# Fit the model on the entire dataset
model.fit(dataset: dataset, feature_columns: feature_columns, target_column: target_column)

# Get the coefficients and intercept
coefficients = model.coefficients.map { |coef| coef.round(2) }
intercept = model.intercept.round(2)

# Locate the columns through the CSV headers instead of fixed positions.
feature_indices = feature_columns.map { |name| dataset.headers.index(name) }
target_index = dataset.headers.index(target_column)
features = dataset.data.map { |row| row.values_at(*feature_indices) }
targets = dataset.data.map { |row| row[target_index] }

# Perform predictions on the same dataset to evaluate the model
predictions = model.predict(features).map { |pred| pred.round(2) } # Round predictions to 2 decimal places

# Calculate evaluation metrics
mse = model.mean_squared_error(targets, predictions)
r_squared = model.r_squared(targets, predictions)

# Print the results in the desired format
puts "Coefficients: #{coefficients.inspect}"
puts "Intercept: #{intercept}"
puts "Predictions: #{predictions.inspect}"
puts "MSE: #{mse.round(2)}"
puts "R-squared: #{r_squared.round(2)}"

# Predict the energy consumption for a new building
new_building = [[3500, 60, 70]] # A building with 3500 sq ft, 60 occupants, and 70 computers
predicted_energy_consumption = model.predict(new_building).first
puts "Predicted energy consumption for the new building: #{predicted_energy_consumption.round(2)} kWh"
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Example: Predicting House Prices Based on Size
4
+
5
+ require_relative '../lib/ml_ai'
6
+
7
# Column names are declared once and reused for both fitting and evaluation.
feature_column = 'Size'
target_column = 'Price'

# Initialize the model
model = MLAI::SimpleLinearRegression.new

# Create a Dataset from the CSV file
dataset = MLAI::Dataset.new('data/house_prices.csv')

# Fit the model using the Dataset
model.fit(dataset: dataset, feature_column: feature_column, target_column: target_column)

# Predict the price of a new house
new_house_size = [1600] # Size of the new house in square feet
predicted_price = model.predict(new_house_size).first.round(2)
puts "Predicted Price for a 1600 sq ft house: #{predicted_price} thousand dollars"

# Evaluate the model using the original dataset.
# Columns are located through the CSV headers instead of fixed positions, so
# the evaluation always reads the same columns the model was fitted on.
size_index = dataset.headers.index(feature_column)
price_index = dataset.headers.index(target_column)

original_sizes = dataset.data.map { |row| row[size_index] }
original_prices = dataset.data.map { |row| row[price_index] }
predictions = model.predict(original_sizes).map { |p| p.round(2) }

mse = model.mean_squared_error(original_prices, predictions).round(2)
r2 = model.r_squared(original_prices, predictions).round(2)

puts "Mean Squared Error: #{mse}"
puts "R-squared: #{r2}"
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+
5
module MLAI
  # In-memory view of a CSV file whose first line is a header row.
  #
  # After construction, +headers+ holds the column names as read from the
  # file and +data+ holds one array per data row, with every cell converted
  # through +to_f+ (so empty or non-numeric cells become 0.0).
  class Dataset
    attr_reader :data, :headers

    # filename - path of the CSV file; it is read immediately.
    def initialize(filename)
      @filename = filename
      @headers = []
      @data = []

      load_csv
    end

    private

    # Parses the file once and fills @headers and @data.
    def load_csv
      table = CSV.read(@filename, headers: true)
      @headers = table.headers
      @data = table.map do |record|
        record.fields.map { |cell| cell.to_f } # every field becomes a Float
      end
    end
  end
end
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'matrix'
4
+ require_relative 'dataset'
5
+
6
module MLAI
  # Multiple linear regression solved in closed form with the normal equation,
  # with an optional ridge-style penalty added to the diagonal of X^T * X.
  #
  # Training data can be passed either as raw arrays (+x_values+ is an array of
  # feature rows, +y_values+ an array of targets) or as a dataset object that
  # responds to +headers+ and +data+, together with the column names to use.
  class MultipleLinearRegression
    attr_reader :coefficients, :intercept, :regularization

    # alpha          - stored in @alpha but not read anywhere in this class;
    #                  kept so existing positional callers continue to work.
    # regularization - value added to every diagonal entry of X^T * X in #fit.
    def initialize(alpha = 1e-8, regularization: 0.0)
      @coefficients = nil
      @intercept = nil
      @alpha = alpha
      @regularization = regularization # Regularization strength for Ridge Regression
    end

    # Fits the model and stores the result in +intercept+ and +coefficients+.
    #
    # Accepts either x_values and y_values, or a dataset with feature_columns
    # and target_column. When a dataset is given it takes precedence.
    #
    # Raises ArgumentError when no training data is supplied or a requested
    # column is not among the dataset headers, and RuntimeError when the two
    # arrays differ in length or the system matrix cannot be inverted.
    def fit(x_values: nil, y_values: nil, dataset: nil, feature_columns: nil, target_column: nil)
      if dataset
        x_values, y_values = extract_columns(dataset, feature_columns, target_column)
      end
      validate_samples!(x_values, y_values)

      # Prepend a column of ones so the first solved parameter is the intercept.
      x_matrix = Matrix[*x_values.map { |x| [1] + x }]
      y_vector = Vector.elements(y_values)

      # Normal equation with regularization: (X^T * X + λI)^-1 * X^T * Y
      # NOTE(review): λ is added to every diagonal entry, including the one for
      # the intercept column, so the intercept is penalized as well — confirm
      # this is intended before comparing against other ridge implementations.
      x_transpose = x_matrix.transpose
      regularization_matrix = Matrix.build(x_matrix.column_count) { |i, j| i == j ? @regularization : 0 }

      xtx = x_transpose * x_matrix + regularization_matrix

      begin
        theta = xtx.inverse * x_transpose * y_vector
      rescue ExceptionForMatrix::ErrNotRegular
        raise "Matrix is singular or nearly singular, consider increasing regularization"
      end

      @intercept = theta[0]
      @coefficients = theta.to_a[1..-1]
    end

    # Returns one prediction per row of x_values (an array of feature rows,
    # each ordered like the training features).
    #
    # Raises RuntimeError when the model has not been fitted.
    def predict(x_values)
      raise "Model has not been fitted yet" if @coefficients.nil? || @intercept.nil?

      x_values.map do |x|
        @coefficients.each_with_index.reduce(@intercept) do |sum, (coef, i)|
          sum + coef * x[i]
        end
      end
    end

    # Mean of the squared differences between y_true and y_pred.
    #
    # Raises RuntimeError when the arrays differ in length.
    def mean_squared_error(y_true, y_pred)
      raise "Input arrays must have the same length" unless y_true.length == y_pred.length

      sum_squared_errors = y_true.zip(y_pred).sum { |actual, predicted| (actual - predicted)**2 }
      sum_squared_errors / y_true.length.to_f
    end

    # Coefficient of determination: 1 - SS_residual / SS_total.
    #
    # Raises RuntimeError when the arrays differ in length.
    def r_squared(y_true, y_pred)
      raise "Input arrays must have the same length" unless y_true.length == y_pred.length

      mean_y = y_true.sum / y_true.length.to_f
      ss_total = y_true.sum { |actual| (actual - mean_y)**2 }
      ss_residual = y_true.zip(y_pred).sum { |actual, predicted| (actual - predicted)**2 }

      1 - (ss_residual / ss_total.to_f)
    end

    # k-fold cross-validation; returns the mean of the per-fold MSE values.
    #
    # Folds are contiguous slices taken in the given order (no shuffling).
    # When the sample count is not a multiple of k, the first (count % k) folds
    # receive one extra sample so that every sample is tested exactly once.
    #
    # Each fold calls #fit, so the model's intercept and coefficients are
    # overwritten; call #fit on the full data afterwards before predicting.
    #
    # Raises ArgumentError when k is not an integer between 2 and the number
    # of samples, plus everything #fit can raise.
    def cross_validate(x_values: nil, y_values: nil, dataset: nil, feature_columns: nil, target_column: nil, k: 5)
      if dataset
        x_values, y_values = extract_columns(dataset, feature_columns, target_column)
      end
      validate_samples!(x_values, y_values)

      sample_count = x_values.length
      unless k.is_a?(Integer) && k.between?(2, sample_count)
        raise ArgumentError, "k must be an integer between 2 and the number of samples (#{sample_count})"
      end

      base_size, remainder = sample_count.divmod(k)
      errors = []
      test_start = 0

      k.times do |fold|
        test_end = test_start + base_size + (fold < remainder ? 1 : 0)

        x_train = x_values[0...test_start] + x_values[test_end..-1]
        y_train = y_values[0...test_start] + y_values[test_end..-1]

        x_test = x_values[test_start...test_end]
        y_test = y_values[test_start...test_end]

        fit(x_values: x_train, y_values: y_train)
        errors << mean_squared_error(y_test, predict(x_test))

        test_start = test_end
      end

      errors.sum / errors.size.to_f
    end

    private

    # Pulls the feature rows and target values out of a dataset by column name.
    def extract_columns(dataset, feature_columns, target_column)
      if feature_columns.nil? || target_column.nil?
        raise ArgumentError, "feature_columns and target_column are required when a dataset is given"
      end

      feature_indices = feature_columns.map { |column| column_index(dataset, column) }
      target_index = column_index(dataset, target_column)

      x_values = dataset.data.map { |row| row.values_at(*feature_indices) }
      y_values = dataset.data.map { |row| row[target_index] }
      [x_values, y_values]
    end

    # Position of a named column in the dataset headers; fails loudly when the
    # name is unknown instead of letting a nil index surface later.
    def column_index(dataset, column)
      dataset.headers.index(column) || raise(ArgumentError, "Unknown column: #{column.inspect}")
    end

    # Ensures training data is present and that features and targets line up.
    def validate_samples!(x_values, y_values)
      if x_values.nil? || y_values.nil?
        raise ArgumentError, "Provide either x_values and y_values or a dataset"
      end

      raise "Input arrays must have the same length" unless x_values.length == y_values.length
    end
  end
end