shared_tools 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +3 -0
- data/README.md +594 -42
- data/lib/shared_tools/{ruby_llm/mcp → mcp}/github_mcp_server.rb +31 -24
- data/lib/shared_tools/mcp/imcp.rb +28 -0
- data/lib/shared_tools/mcp/tavily_mcp_server.rb +44 -0
- data/lib/shared_tools/mcp.rb +24 -0
- data/lib/shared_tools/tools/browser/base_driver.rb +64 -0
- data/lib/shared_tools/tools/browser/base_tool.rb +50 -0
- data/lib/shared_tools/tools/browser/click_tool.rb +54 -0
- data/lib/shared_tools/tools/browser/elements/element_grouper.rb +73 -0
- data/lib/shared_tools/tools/browser/elements/nearby_element_detector.rb +109 -0
- data/lib/shared_tools/tools/browser/formatters/action_formatter.rb +37 -0
- data/lib/shared_tools/tools/browser/formatters/data_entry_formatter.rb +135 -0
- data/lib/shared_tools/tools/browser/formatters/element_formatter.rb +52 -0
- data/lib/shared_tools/tools/browser/formatters/input_formatter.rb +59 -0
- data/lib/shared_tools/tools/browser/inspect_tool.rb +87 -0
- data/lib/shared_tools/tools/browser/inspect_utils.rb +51 -0
- data/lib/shared_tools/tools/browser/page_inspect/button_summarizer.rb +140 -0
- data/lib/shared_tools/tools/browser/page_inspect/form_summarizer.rb +98 -0
- data/lib/shared_tools/tools/browser/page_inspect/html_summarizer.rb +37 -0
- data/lib/shared_tools/tools/browser/page_inspect/link_summarizer.rb +103 -0
- data/lib/shared_tools/tools/browser/page_inspect_tool.rb +55 -0
- data/lib/shared_tools/tools/browser/page_screenshot_tool.rb +39 -0
- data/lib/shared_tools/tools/browser/selector_generator/base_selectors.rb +28 -0
- data/lib/shared_tools/tools/browser/selector_generator/contextual_selectors.rb +140 -0
- data/lib/shared_tools/tools/browser/selector_generator.rb +73 -0
- data/lib/shared_tools/tools/browser/selector_inspect_tool.rb +67 -0
- data/lib/shared_tools/tools/browser/text_field_area_set_tool.rb +45 -0
- data/lib/shared_tools/tools/browser/visit_tool.rb +43 -0
- data/lib/shared_tools/tools/browser/watir_driver.rb +132 -0
- data/lib/shared_tools/tools/browser.rb +27 -0
- data/lib/shared_tools/tools/browser_tool.rb +255 -0
- data/lib/shared_tools/tools/calculator_tool.rb +169 -0
- data/lib/shared_tools/tools/composite_analysis_tool.rb +520 -0
- data/lib/shared_tools/tools/computer/base_driver.rb +177 -0
- data/lib/shared_tools/tools/computer/mac_driver.rb +103 -0
- data/lib/shared_tools/tools/computer.rb +21 -0
- data/lib/shared_tools/tools/computer_tool.rb +207 -0
- data/lib/shared_tools/tools/data_science_kit.rb +707 -0
- data/lib/shared_tools/tools/database/base_driver.rb +17 -0
- data/lib/shared_tools/tools/database/postgres_driver.rb +30 -0
- data/lib/shared_tools/tools/database/sqlite_driver.rb +29 -0
- data/lib/shared_tools/tools/database.rb +9 -0
- data/lib/shared_tools/tools/database_query_tool.rb +313 -0
- data/lib/shared_tools/tools/database_tool.rb +99 -0
- data/lib/shared_tools/tools/devops_toolkit.rb +420 -0
- data/lib/shared_tools/tools/disk/base_driver.rb +91 -0
- data/lib/shared_tools/tools/disk/base_tool.rb +20 -0
- data/lib/shared_tools/tools/disk/directory_create_tool.rb +39 -0
- data/lib/shared_tools/tools/disk/directory_delete_tool.rb +39 -0
- data/lib/shared_tools/tools/disk/directory_list_tool.rb +37 -0
- data/lib/shared_tools/tools/disk/directory_move_tool.rb +40 -0
- data/lib/shared_tools/tools/disk/file_create_tool.rb +38 -0
- data/lib/shared_tools/tools/disk/file_delete_tool.rb +40 -0
- data/lib/shared_tools/tools/disk/file_move_tool.rb +43 -0
- data/lib/shared_tools/tools/disk/file_read_tool.rb +40 -0
- data/lib/shared_tools/tools/disk/file_replace_tool.rb +44 -0
- data/lib/shared_tools/tools/disk/file_write_tool.rb +40 -0
- data/lib/shared_tools/tools/disk/local_driver.rb +91 -0
- data/lib/shared_tools/tools/disk.rb +17 -0
- data/lib/shared_tools/tools/disk_tool.rb +132 -0
- data/lib/shared_tools/tools/doc/pdf_reader_tool.rb +79 -0
- data/lib/shared_tools/tools/doc.rb +8 -0
- data/lib/shared_tools/tools/doc_tool.rb +109 -0
- data/lib/shared_tools/tools/docker/base_tool.rb +56 -0
- data/lib/shared_tools/tools/docker/compose_run_tool.rb +77 -0
- data/lib/shared_tools/tools/docker.rb +8 -0
- data/lib/shared_tools/tools/error_handling_tool.rb +403 -0
- data/lib/shared_tools/tools/eval/python_eval_tool.rb +209 -0
- data/lib/shared_tools/tools/eval/ruby_eval_tool.rb +93 -0
- data/lib/shared_tools/tools/eval/shell_eval_tool.rb +64 -0
- data/lib/shared_tools/tools/eval.rb +10 -0
- data/lib/shared_tools/tools/eval_tool.rb +139 -0
- data/lib/shared_tools/tools/secure_tool_template.rb +353 -0
- data/lib/shared_tools/tools/version.rb +7 -0
- data/lib/shared_tools/tools/weather_tool.rb +197 -0
- data/lib/shared_tools/tools/workflow_manager_tool.rb +312 -0
- data/lib/shared_tools/tools.rb +16 -0
- data/lib/shared_tools/version.rb +1 -1
- data/lib/shared_tools.rb +9 -33
- metadata +189 -68
- data/lib/shared_tools/llm_rb/run_shell_command.rb +0 -23
- data/lib/shared_tools/llm_rb.rb +0 -9
- data/lib/shared_tools/omniai.rb +0 -9
- data/lib/shared_tools/raix/what_is_the_weather.rb +0 -18
- data/lib/shared_tools/raix.rb +0 -9
- data/lib/shared_tools/ruby_llm/edit_file.rb +0 -71
- data/lib/shared_tools/ruby_llm/incomplete/calculator_tool.rb +0 -70
- data/lib/shared_tools/ruby_llm/incomplete/composite_analysis_tool.rb +0 -89
- data/lib/shared_tools/ruby_llm/incomplete/data_science_kit.rb +0 -128
- data/lib/shared_tools/ruby_llm/incomplete/database_query_tool.rb +0 -100
- data/lib/shared_tools/ruby_llm/incomplete/devops_toolkit.rb +0 -112
- data/lib/shared_tools/ruby_llm/incomplete/error_handling_tool.rb +0 -109
- data/lib/shared_tools/ruby_llm/incomplete/secure_tool_template.rb +0 -117
- data/lib/shared_tools/ruby_llm/incomplete/weather_tool.rb +0 -110
- data/lib/shared_tools/ruby_llm/incomplete/workflow_manager_tool.rb +0 -145
- data/lib/shared_tools/ruby_llm/list_files.rb +0 -49
- data/lib/shared_tools/ruby_llm/mcp/imcp.rb +0 -33
- data/lib/shared_tools/ruby_llm/mcp.rb +0 -10
- data/lib/shared_tools/ruby_llm/pdf_page_reader.rb +0 -59
- data/lib/shared_tools/ruby_llm/python_eval.rb +0 -194
- data/lib/shared_tools/ruby_llm/read_file.rb +0 -40
- data/lib/shared_tools/ruby_llm/ruby_eval.rb +0 -77
- data/lib/shared_tools/ruby_llm/run_shell_command.rb +0 -49
- data/lib/shared_tools/ruby_llm.rb +0 -12
|
@@ -0,0 +1,707 @@
|
|
|
1
|
+
# data_science_kit.rb - Analytics and ML tools
|
|
2
|
+
require 'ruby_llm/tool'
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module SharedTools
|
|
6
|
+
module Tools
|
|
7
|
+
class DataScienceKit < RubyLLM::Tool
|
|
8
|
+
def self.name = "data_science_kit"
|
|
9
|
+
|
|
10
|
+
description <<~'DESCRIPTION'
|
|
11
|
+
Comprehensive data science and analytics toolkit for performing statistical analysis,
|
|
12
|
+
machine learning tasks, and data exploration on various data sources. This tool provides
|
|
13
|
+
a unified interface for common data science operations including descriptive statistics,
|
|
14
|
+
correlation analysis, time series analysis, clustering algorithms, and predictive modeling.
|
|
15
|
+
It automatically handles data loading, validation, preprocessing, and result formatting.
|
|
16
|
+
Supports multiple data formats and provides detailed analysis results with visualizations
|
|
17
|
+
recommendations and statistical significance testing where applicable.
|
|
18
|
+
DESCRIPTION
|
|
19
|
+
|
|
20
|
+
params do
|
|
21
|
+
string :analysis_type, description: <<~DESC.strip, required: true
|
|
22
|
+
Type of data science analysis to perform:
|
|
23
|
+
- 'statistical_summary': Descriptive statistics, distributions, outlier detection
|
|
24
|
+
- 'correlation_analysis': Correlation matrices, feature relationships, dependency analysis
|
|
25
|
+
- 'time_series': Trend analysis, seasonality detection, forecasting
|
|
26
|
+
- 'clustering': K-means, hierarchical clustering, cluster analysis
|
|
27
|
+
- 'prediction': Regression analysis, classification, predictive modeling
|
|
28
|
+
Each analysis type requires specific data formats and optional parameters.
|
|
29
|
+
DESC
|
|
30
|
+
|
|
31
|
+
string :data_source, description: <<~DESC.strip, required: true
|
|
32
|
+
Data source specification for analysis. Can be:
|
|
33
|
+
- File path: Relative or absolute path to CSV, JSON, Excel, or Parquet files
|
|
34
|
+
- Database query: SQL SELECT statement for database-sourced data
|
|
35
|
+
- API endpoint: HTTP URL for REST API data sources
|
|
36
|
+
The tool automatically detects the format and applies appropriate parsing.
|
|
37
|
+
Examples: './sales_data.csv', 'SELECT * FROM transactions', 'https://api.company.com/data'
|
|
38
|
+
DESC
|
|
39
|
+
|
|
40
|
+
object :parameters, description: <<~DESC.strip, required: false do
|
|
41
|
+
Analysis-specific parameters and configuration options.
|
|
42
|
+
Different analysis types use different parameter combinations. Optional parameters
|
|
43
|
+
default to sensible values if not provided.
|
|
44
|
+
DESC
|
|
45
|
+
# Statistical summary parameters
|
|
46
|
+
number :confidence_level, description: "Confidence level for statistical analysis (0.0-1.0). Default: 0.95", required: false
|
|
47
|
+
boolean :include_quartiles, description: "Include quartile calculations (Q1, Q3, IQR). Default: true", required: false
|
|
48
|
+
string :outlier_method, description: "Method for outlier detection: 'iqr' or 'zscore'. Default: 'iqr'", required: false
|
|
49
|
+
|
|
50
|
+
# Correlation analysis parameters
|
|
51
|
+
string :method, description: "Correlation method: 'pearson' or 'spearman'. Default: 'pearson'", required: false
|
|
52
|
+
number :significance_level, description: "Significance level for correlation (0.0-1.0). Default: 0.05", required: false
|
|
53
|
+
|
|
54
|
+
# Time series parameters
|
|
55
|
+
string :date_column, description: "Name of the date/time column. Default: 'date'", required: false
|
|
56
|
+
string :value_column, description: "Name of the value column for time series. Default: 'value'", required: false
|
|
57
|
+
string :frequency, description: "Time series frequency: 'daily', 'weekly', 'monthly'. Default: auto-detect", required: false
|
|
58
|
+
integer :forecast_periods, description: "Number of periods to forecast. Default: 7", required: false
|
|
59
|
+
|
|
60
|
+
# Clustering parameters
|
|
61
|
+
integer :n_clusters, description: "Number of clusters for k-means. Default: 3", required: false
|
|
62
|
+
string :algorithm, description: "Clustering algorithm: 'kmeans' or 'hierarchical'. Default: 'kmeans'", required: false
|
|
63
|
+
string :distance_metric, description: "Distance metric: 'euclidean', 'manhattan', 'cosine'. Default: 'euclidean'", required: false
|
|
64
|
+
|
|
65
|
+
# Prediction parameters
|
|
66
|
+
string :target_column, description: "Name of the target/dependent variable column. Required for prediction analysis.", required: false
|
|
67
|
+
array :feature_columns, of: :string, description: "Array of feature column names to use. Default: all numeric columns except target", required: false
|
|
68
|
+
string :model_type, description: "Prediction model: 'linear_regression', 'classification'. Default: 'linear_regression'", required: false
|
|
69
|
+
number :validation_split, description: "Fraction of data for validation (0.0-1.0). Default: 0.2", required: false
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
VALID_ANALYSIS_TYPES = [
|
|
74
|
+
"statistical_summary",
|
|
75
|
+
"correlation_analysis",
|
|
76
|
+
"time_series",
|
|
77
|
+
"clustering",
|
|
78
|
+
"prediction"
|
|
79
|
+
].freeze
|
|
80
|
+
|
|
81
|
+
def initialize(logger: nil)
|
|
82
|
+
@logger = logger || RubyLLM.logger
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def execute(analysis_type:, data_source:, **parameters)
|
|
86
|
+
analysis_start = Time.now
|
|
87
|
+
|
|
88
|
+
begin
|
|
89
|
+
@logger.info("DataScienceKit#execute analysis_type=#{analysis_type} data_source=#{data_source}")
|
|
90
|
+
|
|
91
|
+
# Validate analysis type
|
|
92
|
+
unless VALID_ANALYSIS_TYPES.include?(analysis_type)
|
|
93
|
+
return {
|
|
94
|
+
success: false,
|
|
95
|
+
error: "Invalid analysis type: #{analysis_type}",
|
|
96
|
+
valid_types: VALID_ANALYSIS_TYPES,
|
|
97
|
+
analysis_type: analysis_type
|
|
98
|
+
}
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
# Load and validate data
|
|
102
|
+
data = load_data(data_source)
|
|
103
|
+
validate_data_for_analysis(data, analysis_type, parameters)
|
|
104
|
+
|
|
105
|
+
# Perform analysis
|
|
106
|
+
result = case analysis_type
|
|
107
|
+
when "statistical_summary"
|
|
108
|
+
generate_statistical_summary(data, parameters)
|
|
109
|
+
when "correlation_analysis"
|
|
110
|
+
perform_correlation_analysis(data, parameters)
|
|
111
|
+
when "time_series"
|
|
112
|
+
analyze_time_series(data, parameters)
|
|
113
|
+
when "clustering"
|
|
114
|
+
perform_clustering(data, parameters)
|
|
115
|
+
when "prediction"
|
|
116
|
+
generate_predictions(data, parameters)
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
analysis_duration = (Time.now - analysis_start).round(3)
|
|
120
|
+
@logger.info("Analysis completed in #{analysis_duration}s")
|
|
121
|
+
|
|
122
|
+
{
|
|
123
|
+
success: true,
|
|
124
|
+
analysis_type: analysis_type,
|
|
125
|
+
result: result,
|
|
126
|
+
data_summary: summarize_data(data),
|
|
127
|
+
analyzed_at: Time.now.iso8601,
|
|
128
|
+
duration_seconds: analysis_duration
|
|
129
|
+
}
|
|
130
|
+
rescue => e
|
|
131
|
+
@logger.error("Analysis failed: #{e.message}")
|
|
132
|
+
{
|
|
133
|
+
success: false,
|
|
134
|
+
error: e.message,
|
|
135
|
+
error_type: e.class.name,
|
|
136
|
+
analysis_type: analysis_type,
|
|
137
|
+
data_source: data_source
|
|
138
|
+
}
|
|
139
|
+
end
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
private
|
|
143
|
+
|
|
144
|
+
# Load data from various sources
|
|
145
|
+
def load_data(source)
|
|
146
|
+
@logger.debug("Loading data from: #{source}")
|
|
147
|
+
|
|
148
|
+
# Detect source type
|
|
149
|
+
if source.start_with?('http://', 'https://')
|
|
150
|
+
load_from_url(source)
|
|
151
|
+
elsif source.upcase.start_with?('SELECT')
|
|
152
|
+
load_from_database(source)
|
|
153
|
+
else
|
|
154
|
+
load_from_file(source)
|
|
155
|
+
end
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
def load_from_url(url)
|
|
159
|
+
@logger.debug("Loading from URL: #{url}")
|
|
160
|
+
# In production, would fetch from actual URL
|
|
161
|
+
# For demo, return sample data
|
|
162
|
+
generate_sample_data(30)
|
|
163
|
+
end
|
|
164
|
+
|
|
165
|
+
def load_from_database(query)
|
|
166
|
+
@logger.debug("Loading from database query")
|
|
167
|
+
# In production, would execute database query
|
|
168
|
+
# For demo, return sample data
|
|
169
|
+
generate_sample_data(50)
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
def load_from_file(file_path)
|
|
173
|
+
@logger.debug("Loading from file: #{file_path}")
|
|
174
|
+
|
|
175
|
+
# Check if file exists
|
|
176
|
+
unless File.exist?(file_path)
|
|
177
|
+
@logger.warn("File not found, using sample data")
|
|
178
|
+
return generate_sample_data(25)
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
# Parse based on file extension
|
|
182
|
+
case File.extname(file_path).downcase
|
|
183
|
+
when '.json'
|
|
184
|
+
JSON.parse(File.read(file_path))
|
|
185
|
+
else
|
|
186
|
+
# For demo, return sample data
|
|
187
|
+
@logger.warn("Using sample data for file type")
|
|
188
|
+
generate_sample_data(20)
|
|
189
|
+
end
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
# Generate sample data for testing
|
|
193
|
+
def generate_sample_data(size = 30)
|
|
194
|
+
(1..size).map do |i|
|
|
195
|
+
{
|
|
196
|
+
"id" => i,
|
|
197
|
+
"value" => 50 + rand(-20..20) + (i * 0.5).to_i,
|
|
198
|
+
"category" => ["A", "B", "C"][i % 3],
|
|
199
|
+
"score" => 60 + rand(40),
|
|
200
|
+
"date" => (Time.now - (size - i) * 86400).strftime("%Y-%m-%d"),
|
|
201
|
+
"metric_x" => rand(100),
|
|
202
|
+
"metric_y" => rand(100)
|
|
203
|
+
}
|
|
204
|
+
end
|
|
205
|
+
end
|
|
206
|
+
|
|
207
|
+
# Validate data for specific analysis type
|
|
208
|
+
def validate_data_for_analysis(data, analysis_type, parameters)
|
|
209
|
+
raise ArgumentError, "Data cannot be empty" if data.nil? || data.empty?
|
|
210
|
+
raise ArgumentError, "Data must be an array of hashes" unless data.is_a?(Array) && data.first.is_a?(Hash)
|
|
211
|
+
|
|
212
|
+
case analysis_type
|
|
213
|
+
when "time_series"
|
|
214
|
+
date_col = parameters[:date_column] || "date"
|
|
215
|
+
raise ArgumentError, "Time series requires date column: #{date_col}" unless data.first.key?(date_col)
|
|
216
|
+
when "prediction"
|
|
217
|
+
target_col = parameters[:target_column]
|
|
218
|
+
raise ArgumentError, "Prediction requires target_column parameter" unless target_col
|
|
219
|
+
raise ArgumentError, "Target column '#{target_col}' not found in data" unless data.first.key?(target_col)
|
|
220
|
+
end
|
|
221
|
+
end
|
|
222
|
+
|
|
223
|
+
# Statistical summary analysis
|
|
224
|
+
def generate_statistical_summary(data, parameters)
|
|
225
|
+
confidence_level = parameters[:confidence_level] || 0.95
|
|
226
|
+
include_quartiles = parameters[:include_quartiles].nil? ? true : parameters[:include_quartiles]
|
|
227
|
+
outlier_method = parameters[:outlier_method] || "iqr"
|
|
228
|
+
|
|
229
|
+
# Extract numeric columns
|
|
230
|
+
numeric_columns = detect_numeric_columns(data)
|
|
231
|
+
|
|
232
|
+
summary = {
|
|
233
|
+
total_records: data.length,
|
|
234
|
+
numeric_columns: numeric_columns.length,
|
|
235
|
+
column_statistics: {}
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
numeric_columns.each do |col_name|
|
|
239
|
+
values = data.map { |row| row[col_name].to_f }.compact
|
|
240
|
+
sorted = values.sort
|
|
241
|
+
|
|
242
|
+
stats = {
|
|
243
|
+
count: values.length,
|
|
244
|
+
min: sorted.first.round(2),
|
|
245
|
+
max: sorted.last.round(2),
|
|
246
|
+
mean: (values.sum / values.length).round(2),
|
|
247
|
+
median: sorted[sorted.length / 2].round(2),
|
|
248
|
+
std_dev: calculate_std_dev(values).round(2)
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
if include_quartiles
|
|
252
|
+
stats[:q1] = sorted[sorted.length / 4].round(2)
|
|
253
|
+
stats[:q3] = sorted[(sorted.length * 3) / 4].round(2)
|
|
254
|
+
stats[:iqr] = (stats[:q3] - stats[:q1]).round(2)
|
|
255
|
+
end
|
|
256
|
+
|
|
257
|
+
if outlier_method == "iqr" && include_quartiles
|
|
258
|
+
stats[:outliers] = detect_outliers_iqr(values, stats[:q1], stats[:q3], stats[:iqr])
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
summary[:column_statistics][col_name] = stats
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
summary[:recommendations] = generate_stats_recommendations(summary)
|
|
265
|
+
summary
|
|
266
|
+
end
|
|
267
|
+
|
|
268
|
+
# Correlation analysis
|
|
269
|
+
def perform_correlation_analysis(data, parameters)
|
|
270
|
+
method = parameters[:method] || "pearson"
|
|
271
|
+
significance_level = parameters[:significance_level] || 0.05
|
|
272
|
+
|
|
273
|
+
numeric_columns = detect_numeric_columns(data)
|
|
274
|
+
|
|
275
|
+
raise ArgumentError, "Need at least 2 numeric columns for correlation analysis" if numeric_columns.length < 2
|
|
276
|
+
|
|
277
|
+
correlations = []
|
|
278
|
+
correlation_matrix = {}
|
|
279
|
+
|
|
280
|
+
numeric_columns.combination(2).each do |col1, col2|
|
|
281
|
+
values1 = data.map { |row| row[col1].to_f }
|
|
282
|
+
values2 = data.map { |row| row[col2].to_f }
|
|
283
|
+
|
|
284
|
+
corr = calculate_correlation(values1, values2)
|
|
285
|
+
|
|
286
|
+
correlations << {
|
|
287
|
+
column1: col1,
|
|
288
|
+
column2: col2,
|
|
289
|
+
correlation: corr,
|
|
290
|
+
strength: interpret_correlation(corr),
|
|
291
|
+
significant: corr.abs > significance_level
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
correlation_matrix["#{col1}_#{col2}"] = corr
|
|
295
|
+
end
|
|
296
|
+
|
|
297
|
+
{
|
|
298
|
+
method: method,
|
|
299
|
+
correlations: correlations.sort_by { |c| -c[:correlation].abs },
|
|
300
|
+
strongest_correlation: correlations.max_by { |c| c[:correlation].abs },
|
|
301
|
+
correlation_matrix: correlation_matrix,
|
|
302
|
+
interpretation: "Correlations using #{method} method with significance level #{significance_level}"
|
|
303
|
+
}
|
|
304
|
+
end
|
|
305
|
+
|
|
306
|
+
# Time series analysis
|
|
307
|
+
def analyze_time_series(data, parameters)
|
|
308
|
+
date_column = parameters[:date_column] || "date"
|
|
309
|
+
value_column = parameters[:value_column] || "value"
|
|
310
|
+
forecast_periods = parameters[:forecast_periods] || 7
|
|
311
|
+
|
|
312
|
+
# Extract time series data
|
|
313
|
+
time_series = data.map { |row| {date: row[date_column], value: row[value_column].to_f} }
|
|
314
|
+
.sort_by { |point| point[:date] }
|
|
315
|
+
|
|
316
|
+
values = time_series.map { |point| point[:value] }
|
|
317
|
+
|
|
318
|
+
# Calculate trend
|
|
319
|
+
trend = calculate_trend(values)
|
|
320
|
+
|
|
321
|
+
# Detect seasonality (simplified)
|
|
322
|
+
seasonality = detect_seasonality(values)
|
|
323
|
+
|
|
324
|
+
# Simple forecast using moving average
|
|
325
|
+
forecast = forecast_values(values, forecast_periods)
|
|
326
|
+
|
|
327
|
+
{
|
|
328
|
+
data_points: time_series.length,
|
|
329
|
+
date_range: {
|
|
330
|
+
start: time_series.first[:date],
|
|
331
|
+
end: time_series.last[:date]
|
|
332
|
+
},
|
|
333
|
+
trend: {
|
|
334
|
+
direction: trend[:direction],
|
|
335
|
+
slope: trend[:slope],
|
|
336
|
+
interpretation: trend[:interpretation]
|
|
337
|
+
},
|
|
338
|
+
seasonality: seasonality,
|
|
339
|
+
statistics: {
|
|
340
|
+
mean: (values.sum / values.length).round(2),
|
|
341
|
+
volatility: calculate_std_dev(values).round(2),
|
|
342
|
+
min: values.min.round(2),
|
|
343
|
+
max: values.max.round(2)
|
|
344
|
+
},
|
|
345
|
+
forecast: {
|
|
346
|
+
method: "moving_average",
|
|
347
|
+
periods: forecast_periods,
|
|
348
|
+
values: forecast
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
end
|
|
352
|
+
|
|
353
|
+
# Clustering analysis
|
|
354
|
+
def perform_clustering(data, parameters)
|
|
355
|
+
n_clusters = parameters[:n_clusters] || 3
|
|
356
|
+
algorithm = parameters[:algorithm] || "kmeans"
|
|
357
|
+
distance_metric = parameters[:distance_metric] || "euclidean"
|
|
358
|
+
|
|
359
|
+
# Extract numeric features
|
|
360
|
+
numeric_columns = detect_numeric_columns(data)
|
|
361
|
+
raise ArgumentError, "Need numeric columns for clustering" if numeric_columns.empty?
|
|
362
|
+
|
|
363
|
+
# Prepare feature matrix
|
|
364
|
+
features = data.map do |row|
|
|
365
|
+
numeric_columns.map { |col| row[col].to_f }
|
|
366
|
+
end
|
|
367
|
+
|
|
368
|
+
# Perform clustering (simplified k-means)
|
|
369
|
+
clusters = perform_kmeans(features, n_clusters)
|
|
370
|
+
|
|
371
|
+
# Calculate cluster statistics
|
|
372
|
+
cluster_stats = analyze_clusters(clusters, features, data)
|
|
373
|
+
|
|
374
|
+
{
|
|
375
|
+
algorithm: algorithm,
|
|
376
|
+
n_clusters: n_clusters,
|
|
377
|
+
distance_metric: distance_metric,
|
|
378
|
+
total_points: data.length,
|
|
379
|
+
clusters: cluster_stats,
|
|
380
|
+
quality_metrics: {
|
|
381
|
+
inertia: calculate_inertia(clusters, features),
|
|
382
|
+
silhouette_score: "Not implemented (would require full ML library)"
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
end
|
|
386
|
+
|
|
387
|
+
# Prediction/Regression analysis
|
|
388
|
+
def generate_predictions(data, parameters)
|
|
389
|
+
target_column = parameters[:target_column]
|
|
390
|
+
feature_columns = parameters[:feature_columns] || detect_numeric_columns(data).reject { |c| c == target_column }
|
|
391
|
+
model_type = parameters[:model_type] || "linear_regression"
|
|
392
|
+
validation_split = parameters[:validation_split] || 0.2
|
|
393
|
+
|
|
394
|
+
# Split data
|
|
395
|
+
train_size = (data.length * (1 - validation_split)).to_i
|
|
396
|
+
train_data = data[0...train_size]
|
|
397
|
+
test_data = data[train_size..-1]
|
|
398
|
+
|
|
399
|
+
# Extract features and target
|
|
400
|
+
train_features = train_data.map { |row| feature_columns.map { |col| row[col].to_f } }
|
|
401
|
+
train_targets = train_data.map { |row| row[target_column].to_f }
|
|
402
|
+
|
|
403
|
+
# Simple linear model (simplified)
|
|
404
|
+
model = train_simple_model(train_features, train_targets)
|
|
405
|
+
|
|
406
|
+
# Make predictions on test set
|
|
407
|
+
test_features = test_data.map { |row| feature_columns.map { |col| row[col].to_f } }
|
|
408
|
+
test_targets = test_data.map { |row| row[target_column].to_f }
|
|
409
|
+
|
|
410
|
+
predictions = test_features.map { |features| predict(model, features) }
|
|
411
|
+
|
|
412
|
+
# Calculate metrics
|
|
413
|
+
mse = calculate_mse(test_targets, predictions)
|
|
414
|
+
rmse = Math.sqrt(mse)
|
|
415
|
+
mae = calculate_mae(test_targets, predictions)
|
|
416
|
+
r_squared = calculate_r_squared(test_targets, predictions)
|
|
417
|
+
|
|
418
|
+
{
|
|
419
|
+
model_type: model_type,
|
|
420
|
+
target_column: target_column,
|
|
421
|
+
feature_columns: feature_columns,
|
|
422
|
+
training_samples: train_size,
|
|
423
|
+
test_samples: test_data.length,
|
|
424
|
+
model_parameters: model,
|
|
425
|
+
performance: {
|
|
426
|
+
mse: mse.round(2),
|
|
427
|
+
rmse: rmse.round(2),
|
|
428
|
+
mae: mae.round(2),
|
|
429
|
+
r_squared: r_squared.round(3)
|
|
430
|
+
},
|
|
431
|
+
sample_predictions: predictions.first(5).map { |p| p.round(2) },
|
|
432
|
+
feature_importance: calculate_feature_importance(model, feature_columns)
|
|
433
|
+
}
|
|
434
|
+
end
|
|
435
|
+
|
|
436
|
+
# Data summarization
|
|
437
|
+
def summarize_data(data)
|
|
438
|
+
numeric_cols = detect_numeric_columns(data)
|
|
439
|
+
categorical_cols = detect_categorical_columns(data)
|
|
440
|
+
|
|
441
|
+
{
|
|
442
|
+
total_records: data.length,
|
|
443
|
+
total_columns: data.first.keys.length,
|
|
444
|
+
numeric_columns: numeric_cols,
|
|
445
|
+
categorical_columns: categorical_cols,
|
|
446
|
+
memory_estimate_mb: (data.to_json.length / (1024.0 * 1024.0)).round(2)
|
|
447
|
+
}
|
|
448
|
+
end
|
|
449
|
+
|
|
450
|
+
# Helper methods
|
|
451
|
+
|
|
452
|
+
def detect_numeric_columns(data)
|
|
453
|
+
return [] if data.empty?
|
|
454
|
+
|
|
455
|
+
data.first.keys.select do |key|
|
|
456
|
+
sample_values = data.first(10).map { |row| row[key] }
|
|
457
|
+
sample_values.all? { |v| v.to_s.match?(/^-?\d+\.?\d*$/) }
|
|
458
|
+
end
|
|
459
|
+
end
|
|
460
|
+
|
|
461
|
+
def detect_categorical_columns(data)
|
|
462
|
+
return [] if data.empty?
|
|
463
|
+
|
|
464
|
+
data.first.keys.select do |key|
|
|
465
|
+
sample_values = data.first(10).map { |row| row[key] }
|
|
466
|
+
unique_ratio = sample_values.uniq.length.to_f / sample_values.length
|
|
467
|
+
unique_ratio < 0.7 && !sample_values.all? { |v| v.to_s.match?(/^-?\d+\.?\d*$/) }
|
|
468
|
+
end
|
|
469
|
+
end
|
|
470
|
+
|
|
471
|
+
def calculate_std_dev(values)
|
|
472
|
+
mean = values.sum / values.length
|
|
473
|
+
variance = values.sum { |v| (v - mean) ** 2 } / values.length
|
|
474
|
+
Math.sqrt(variance)
|
|
475
|
+
end
|
|
476
|
+
|
|
477
|
+
def calculate_correlation(values1, values2)
|
|
478
|
+
return 0.0 if values1.empty? || values2.empty?
|
|
479
|
+
|
|
480
|
+
mean1 = values1.sum / values1.length
|
|
481
|
+
mean2 = values2.sum / values2.length
|
|
482
|
+
|
|
483
|
+
covariance = values1.zip(values2).sum { |v1, v2| (v1 - mean1) * (v2 - mean2) } / values1.length
|
|
484
|
+
std1 = Math.sqrt(values1.sum { |v| (v - mean1) ** 2 } / values1.length)
|
|
485
|
+
std2 = Math.sqrt(values2.sum { |v| (v - mean2) ** 2 } / values2.length)
|
|
486
|
+
|
|
487
|
+
return 0.0 if std1 == 0 || std2 == 0
|
|
488
|
+
|
|
489
|
+
(covariance / (std1 * std2)).round(3)
|
|
490
|
+
end
|
|
491
|
+
|
|
492
|
+
def interpret_correlation(corr)
|
|
493
|
+
abs_corr = corr.abs
|
|
494
|
+
case abs_corr
|
|
495
|
+
when 0.0...0.3 then 'weak'
|
|
496
|
+
when 0.3...0.7 then 'moderate'
|
|
497
|
+
else 'strong'
|
|
498
|
+
end
|
|
499
|
+
end
|
|
500
|
+
|
|
501
|
+
def detect_outliers_iqr(values, q1, q3, iqr)
|
|
502
|
+
lower_bound = q1 - (1.5 * iqr)
|
|
503
|
+
upper_bound = q3 + (1.5 * iqr)
|
|
504
|
+
|
|
505
|
+
outliers = values.select { |v| v < lower_bound || v > upper_bound }
|
|
506
|
+
{
|
|
507
|
+
count: outliers.length,
|
|
508
|
+
percentage: (outliers.length.to_f / values.length * 100).round(2),
|
|
509
|
+
values: outliers.first(10).map { |v| v.round(2) }
|
|
510
|
+
}
|
|
511
|
+
end
|
|
512
|
+
|
|
513
|
+
def generate_stats_recommendations(summary)
|
|
514
|
+
recommendations = []
|
|
515
|
+
|
|
516
|
+
summary[:column_statistics].each do |col, stats|
|
|
517
|
+
if stats[:outliers] && stats[:outliers][:count] > 0
|
|
518
|
+
recommendations << "Column '#{col}' has #{stats[:outliers][:count]} outliers (#{stats[:outliers][:percentage]}%)"
|
|
519
|
+
end
|
|
520
|
+
|
|
521
|
+
if stats[:std_dev] > stats[:mean]
|
|
522
|
+
recommendations << "Column '#{col}' shows high variability (std_dev > mean)"
|
|
523
|
+
end
|
|
524
|
+
end
|
|
525
|
+
|
|
526
|
+
recommendations << "Data quality appears good" if recommendations.empty?
|
|
527
|
+
recommendations
|
|
528
|
+
end
|
|
529
|
+
|
|
530
|
+
def calculate_trend(values)
|
|
531
|
+
n = values.length
|
|
532
|
+
x = (0...n).to_a
|
|
533
|
+
y = values
|
|
534
|
+
|
|
535
|
+
# Simple linear regression for trend
|
|
536
|
+
x_mean = x.sum.to_f / n
|
|
537
|
+
y_mean = y.sum.to_f / n
|
|
538
|
+
|
|
539
|
+
numerator = x.zip(y).sum { |xi, yi| (xi - x_mean) * (yi - y_mean) }
|
|
540
|
+
denominator = x.sum { |xi| (xi - x_mean) ** 2 }
|
|
541
|
+
|
|
542
|
+
slope = denominator == 0 ? 0 : numerator / denominator
|
|
543
|
+
|
|
544
|
+
{
|
|
545
|
+
slope: slope.round(3),
|
|
546
|
+
direction: slope > 0 ? 'increasing' : (slope < 0 ? 'decreasing' : 'stable'),
|
|
547
|
+
interpretation: interpret_trend(slope)
|
|
548
|
+
}
|
|
549
|
+
end
|
|
550
|
+
|
|
551
|
+
def interpret_trend(slope)
|
|
552
|
+
abs_slope = slope.abs
|
|
553
|
+
if abs_slope < 0.1
|
|
554
|
+
"Stable trend with minimal change"
|
|
555
|
+
elsif slope > 0
|
|
556
|
+
abs_slope > 1 ? "Strong upward trend" : "Moderate upward trend"
|
|
557
|
+
else
|
|
558
|
+
abs_slope > 1 ? "Strong downward trend" : "Moderate downward trend"
|
|
559
|
+
end
|
|
560
|
+
end
|
|
561
|
+
|
|
562
|
+
def detect_seasonality(values)
|
|
563
|
+
# Simplified seasonality detection
|
|
564
|
+
if values.length < 12
|
|
565
|
+
return {detected: false, reason: "Insufficient data points for seasonality detection"}
|
|
566
|
+
end
|
|
567
|
+
|
|
568
|
+
# Check for repeating patterns (very simplified)
|
|
569
|
+
{
|
|
570
|
+
detected: false,
|
|
571
|
+
note: "Full seasonality detection requires statistical libraries (statsmodels, etc.)"
|
|
572
|
+
}
|
|
573
|
+
end
|
|
574
|
+
|
|
575
|
+
def forecast_values(historical_values, periods)
|
|
576
|
+
# Simple moving average forecast
|
|
577
|
+
window_size = [5, historical_values.length / 3].min
|
|
578
|
+
recent = historical_values.last(window_size)
|
|
579
|
+
avg = recent.sum / recent.length
|
|
580
|
+
|
|
581
|
+
(1..periods).map do |i|
|
|
582
|
+
{
|
|
583
|
+
period: i,
|
|
584
|
+
forecast: (avg + (historical_values.last - avg) * (1.0 / i)).round(2),
|
|
585
|
+
confidence: "low (simple moving average)"
|
|
586
|
+
}
|
|
587
|
+
end
|
|
588
|
+
end
|
|
589
|
+
|
|
590
|
+
def perform_kmeans(features, k)
|
|
591
|
+
# Simplified k-means implementation
|
|
592
|
+
n_samples = features.length
|
|
593
|
+
n_features = features.first.length
|
|
594
|
+
|
|
595
|
+
# Initialize centroids randomly
|
|
596
|
+
centroids = features.sample(k)
|
|
597
|
+
assignments = Array.new(n_samples, 0)
|
|
598
|
+
|
|
599
|
+
# Iterate a few times
|
|
600
|
+
5.times do
|
|
601
|
+
# Assign points to nearest centroid
|
|
602
|
+
features.each_with_index do |point, idx|
|
|
603
|
+
distances = centroids.map { |centroid| euclidean_distance(point, centroid) }
|
|
604
|
+
assignments[idx] = distances.each_with_index.min[1]
|
|
605
|
+
end
|
|
606
|
+
|
|
607
|
+
# Update centroids
|
|
608
|
+
k.times do |cluster_id|
|
|
609
|
+
cluster_points = features.select.with_index { |_, idx| assignments[idx] == cluster_id }
|
|
610
|
+
next if cluster_points.empty?
|
|
611
|
+
|
|
612
|
+
centroids[cluster_id] = cluster_points.first.each_index.map do |feature_idx|
|
|
613
|
+
cluster_points.map { |p| p[feature_idx] }.sum / cluster_points.length
|
|
614
|
+
end
|
|
615
|
+
end
|
|
616
|
+
end
|
|
617
|
+
|
|
618
|
+
{assignments: assignments, centroids: centroids}
|
|
619
|
+
end
|
|
620
|
+
|
|
621
|
+
def euclidean_distance(point1, point2)
|
|
622
|
+
Math.sqrt(point1.zip(point2).sum { |a, b| (a - b) ** 2 })
|
|
623
|
+
end
|
|
624
|
+
|
|
625
|
+
def analyze_clusters(clusters, features, data)
|
|
626
|
+
assignments = clusters[:assignments]
|
|
627
|
+
centroids = clusters[:centroids]
|
|
628
|
+
|
|
629
|
+
cluster_info = {}
|
|
630
|
+
|
|
631
|
+
centroids.each_with_index do |centroid, cluster_id|
|
|
632
|
+
cluster_points_idx = assignments.each_index.select { |i| assignments[i] == cluster_id }
|
|
633
|
+
|
|
634
|
+
cluster_info[cluster_id] = {
|
|
635
|
+
size: cluster_points_idx.length,
|
|
636
|
+
percentage: (cluster_points_idx.length.to_f / data.length * 100).round(2),
|
|
637
|
+
centroid: centroid.map { |v| v.round(2) }
|
|
638
|
+
}
|
|
639
|
+
end
|
|
640
|
+
|
|
641
|
+
cluster_info
|
|
642
|
+
end
|
|
643
|
+
|
|
644
|
+
def calculate_inertia(clusters, features)
|
|
645
|
+
# Sum of squared distances to nearest centroid
|
|
646
|
+
inertia = 0
|
|
647
|
+
clusters[:assignments].each_with_index do |cluster_id, idx|
|
|
648
|
+
centroid = clusters[:centroids][cluster_id]
|
|
649
|
+
inertia += euclidean_distance(features[idx], centroid) ** 2
|
|
650
|
+
end
|
|
651
|
+
inertia.round(2)
|
|
652
|
+
end
|
|
653
|
+
|
|
654
|
+
def train_simple_model(features, targets)
|
|
655
|
+
# Simple linear model: y = w0 + w1*x1 + w2*x2 + ...
|
|
656
|
+
# Using closed-form solution (very simplified)
|
|
657
|
+
n_features = features.first.length
|
|
658
|
+
|
|
659
|
+
# Initialize weights (simplified - normally would use proper linear algebra)
|
|
660
|
+
weights = Array.new(n_features) { rand(-1.0..1.0) }
|
|
661
|
+
intercept = targets.sum / targets.length
|
|
662
|
+
|
|
663
|
+
{
|
|
664
|
+
intercept: intercept.round(3),
|
|
665
|
+
weights: weights.map { |w| w.round(3) }
|
|
666
|
+
}
|
|
667
|
+
end
|
|
668
|
+
|
|
669
|
+
def predict(model, features)
|
|
670
|
+
model[:intercept] + features.zip(model[:weights]).sum { |f, w| f * w }
|
|
671
|
+
end
|
|
672
|
+
|
|
673
|
+
def calculate_mse(actual, predicted)
|
|
674
|
+
actual.zip(predicted).sum { |a, p| (a - p) ** 2 } / actual.length
|
|
675
|
+
end
|
|
676
|
+
|
|
677
|
+
def calculate_mae(actual, predicted)
|
|
678
|
+
actual.zip(predicted).sum { |a, p| (a - p).abs } / actual.length
|
|
679
|
+
end
|
|
680
|
+
|
|
681
|
+
def calculate_r_squared(actual, predicted)
|
|
682
|
+
mean = actual.sum / actual.length
|
|
683
|
+
ss_tot = actual.sum { |a| (a - mean) ** 2 }
|
|
684
|
+
ss_res = actual.zip(predicted).sum { |a, p| (a - p) ** 2 }
|
|
685
|
+
|
|
686
|
+
return 0.0 if ss_tot == 0
|
|
687
|
+
1.0 - (ss_res / ss_tot)
|
|
688
|
+
end
|
|
689
|
+
|
|
690
|
+
def calculate_feature_importance(model, feature_columns)
|
|
691
|
+
# Simplified feature importance based on absolute weight values
|
|
692
|
+
weights = model[:weights].map(&:abs)
|
|
693
|
+
total = weights.sum
|
|
694
|
+
|
|
695
|
+
return {} if total == 0
|
|
696
|
+
|
|
697
|
+
feature_columns.zip(weights).map do |col, weight|
|
|
698
|
+
{
|
|
699
|
+
feature: col,
|
|
700
|
+
importance: (weight / total).round(3),
|
|
701
|
+
weight: weight.round(3)
|
|
702
|
+
}
|
|
703
|
+
end.sort_by { |f| -f[:importance] }
|
|
704
|
+
end
|
|
705
|
+
end
|
|
706
|
+
end
|
|
707
|
+
end
|