mseep-rmcp 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mseep_rmcp-0.3.3.dist-info/METADATA +50 -0
- mseep_rmcp-0.3.3.dist-info/RECORD +34 -0
- mseep_rmcp-0.3.3.dist-info/WHEEL +5 -0
- mseep_rmcp-0.3.3.dist-info/entry_points.txt +2 -0
- mseep_rmcp-0.3.3.dist-info/licenses/LICENSE +21 -0
- mseep_rmcp-0.3.3.dist-info/top_level.txt +1 -0
- rmcp/__init__.py +31 -0
- rmcp/cli.py +317 -0
- rmcp/core/__init__.py +14 -0
- rmcp/core/context.py +150 -0
- rmcp/core/schemas.py +156 -0
- rmcp/core/server.py +261 -0
- rmcp/r_assets/__init__.py +8 -0
- rmcp/r_integration.py +112 -0
- rmcp/registries/__init__.py +26 -0
- rmcp/registries/prompts.py +316 -0
- rmcp/registries/resources.py +266 -0
- rmcp/registries/tools.py +223 -0
- rmcp/scripts/__init__.py +9 -0
- rmcp/security/__init__.py +15 -0
- rmcp/security/vfs.py +233 -0
- rmcp/tools/descriptive.py +279 -0
- rmcp/tools/econometrics.py +250 -0
- rmcp/tools/fileops.py +315 -0
- rmcp/tools/machine_learning.py +299 -0
- rmcp/tools/regression.py +287 -0
- rmcp/tools/statistical_tests.py +332 -0
- rmcp/tools/timeseries.py +239 -0
- rmcp/tools/transforms.py +293 -0
- rmcp/tools/visualization.py +590 -0
- rmcp/transport/__init__.py +16 -0
- rmcp/transport/base.py +130 -0
- rmcp/transport/jsonrpc.py +243 -0
- rmcp/transport/stdio.py +201 -0
rmcp/tools/regression.py
ADDED
@@ -0,0 +1,287 @@
|
|
1
|
+
"""
|
2
|
+
Regression analysis tools for RMCP MCP server.
|
3
|
+
|
4
|
+
Production-ready statistical modeling tools using the new registry architecture.
|
5
|
+
"""
|
6
|
+
|
7
|
+
from typing import Dict, Any
|
8
|
+
from ..registries.tools import tool
|
9
|
+
from ..core.schemas import table_schema, formula_schema
|
10
|
+
from ..r_integration import execute_r_script
|
11
|
+
|
12
|
+
|
13
|
+
@tool(
    name="linear_model",
    description="Fit linear regression model with comprehensive diagnostics",
    input_schema={
        "type": "object",
        "properties": {
            "data": table_schema(required_columns=None),
            "formula": formula_schema(),
            "weights": {"type": "array", "items": {"type": "number"}},
            "na_action": {"type": "string", "enum": ["na.omit", "na.exclude", "na.fail"]},
        },
        "required": ["data", "formula"],
    },
)
async def linear_model(context, params):
    """Fit a linear model with R's ``lm`` and return its diagnostics.

    Args:
        context: Request context; only its awaitable ``info`` and ``error``
            logging methods are used here.
        params: Tool arguments. ``data`` and ``formula`` are required by the
            schema; ``weights`` and ``na_action`` are optional (the R script
            falls back to ``"na.omit"`` when ``na_action`` is absent).

    Returns:
        Whatever ``execute_r_script`` returns. It is used as a mapping below
        (``.get``); presumably it is the R ``result`` list built by the
        script -- confirm against ``r_integration``.

    Raises:
        Exception: Any failure from the R call or the success log is reported
            through ``context.error`` and then re-raised unchanged.
    """
    await context.info("Fitting linear regression model")

    # NOTE(review): the script reads its inputs from an R variable named
    # `args` and leaves its output in `result`; both are assumed to be wired
    # up by execute_r_script.
    script = '''
    data <- as.data.frame(args$data)
    formula <- as.formula(args$formula)

    # Handle optional parameters
    weights <- args$weights
    na_action <- args$na_action %||% "na.omit"

    # Fit model
    if (!is.null(weights)) {
        model <- lm(formula, data = data, weights = weights, na.action = get(na_action))
    } else {
        model <- lm(formula, data = data, na.action = get(na_action))
    }

    # Get comprehensive results
    summary_model <- summary(model)

    result <- list(
        coefficients = as.list(coef(model)),
        std_errors = as.list(summary_model$coefficients[, "Std. Error"]),
        t_values = as.list(summary_model$coefficients[, "t value"]),
        p_values = as.list(summary_model$coefficients[, "Pr(>|t|)"]),
        r_squared = summary_model$r.squared,
        adj_r_squared = summary_model$adj.r.squared,
        fstatistic = summary_model$fstatistic[1],
        f_pvalue = pf(summary_model$fstatistic[1],
                     summary_model$fstatistic[2],
                     summary_model$fstatistic[3], lower.tail = FALSE),
        residual_se = summary_model$sigma,
        df_residual = summary_model$df[2],
        fitted_values = as.numeric(fitted(model)),
        residuals = as.numeric(residuals(model)),
        n_obs = nrow(model$model),
        method = "lm"
    )
    '''

    try:
        fit = execute_r_script(script, params)
        # Headline numbers attached to the success log entry.
        log_fields = {
            "r_squared": fit.get("r_squared"),
            "n_obs": fit.get("n_obs"),
        }
        await context.info("Linear model fitted successfully", **log_fields)
        return fit
    except Exception as exc:
        await context.error("Linear model fitting failed", error=str(exc))
        raise
|
80
|
+
|
81
|
+
|
82
|
+
@tool(
    name="correlation_analysis",
    description="Comprehensive correlation analysis with significance tests",
    input_schema={
        "type": "object",
        "properties": {
            "data": table_schema(),
            "variables": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Variables to include in correlation analysis",
            },
            "method": {
                "type": "string",
                "enum": ["pearson", "spearman", "kendall"],
                "description": "Correlation method",
            },
            "use": {
                "type": "string",
                "enum": ["everything", "all.obs", "complete.obs", "na.or.complete", "pairwise.complete.obs"],
                "description": "Missing value handling",
            },
        },
        "required": ["data"],
    },
)
async def correlation_analysis(context, params):
    """Compute a correlation matrix plus pairwise significance tests in R.

    Args:
        context: Request context; only its awaitable ``info`` and ``error``
            logging methods are used here.
        params: Tool arguments. ``data`` is required by the schema. Optional
            keys: ``variables`` (column subset), ``method`` (the R script
            defaults it to ``"pearson"``) and ``use`` (defaults to
            ``"complete.obs"``).

    Returns:
        Whatever ``execute_r_script`` returns. It is used as a mapping below
        (``.get``); presumably it is the R ``result`` list built by the
        script -- confirm against ``r_integration``.

    Raises:
        Exception: Any failure (including the script's own ``stop`` calls for
            unknown variables or fewer than two numeric columns, assuming
            execute_r_script surfaces R errors as exceptions) is reported via
            ``context.error`` and re-raised.
    """
    await context.info("Computing correlation matrix")

    script = '''
    data <- as.data.frame(args$data)
    variables <- args$variables
    method <- args$method %||% "pearson"
    use <- args$use %||% "complete.obs"

    # Select variables if specified
    if (!is.null(variables)) {
        # Validate variables exist
        missing_vars <- setdiff(variables, names(data))
        if (length(missing_vars) > 0) {
            stop(paste("Variables not found:", paste(missing_vars, collapse = ", ")))
        }
        data <- data[, variables, drop = FALSE]
    }

    # Select only numeric variables
    numeric_vars <- sapply(data, is.numeric)
    if (sum(numeric_vars) < 2) {
        stop("Need at least 2 numeric variables for correlation analysis")
    }
    numeric_data <- data[, numeric_vars, drop = FALSE]

    # Compute correlation matrix
    cor_matrix <- cor(numeric_data, method = method, use = use)

    # Compute significance tests
    n <- nrow(numeric_data)
    cor_test_results <- list()

    for (i in 1:(ncol(numeric_data)-1)) {
        for (j in (i+1):ncol(numeric_data)) {
            var1 <- names(numeric_data)[i]
            var2 <- names(numeric_data)[j]

            test_result <- cor.test(numeric_data[,i], numeric_data[,j],
                                  method = method, use = use)

            cor_test_results[[paste(var1, var2, sep = "_")]] <- list(
                correlation = test_result$estimate,
                p_value = test_result$p.value,
                conf_int_lower = if (!is.null(test_result$conf.int)) test_result$conf.int[1] else NA,
                conf_int_upper = if (!is.null(test_result$conf.int)) test_result$conf.int[2] else NA
            )
        }
    }

    result <- list(
        correlation_matrix = as.list(as.data.frame(cor_matrix)),
        significance_tests = cor_test_results,
        method = method,
        n_obs = n,
        variables = names(numeric_data)
    )
    '''

    try:
        outcome = execute_r_script(script, params)
        # Summarise the run for the success log entry.
        log_fields = {
            "n_variables": len(outcome.get("variables", [])),
            "method": outcome.get("method"),
        }
        await context.info("Correlation analysis completed", **log_fields)
        return outcome
    except Exception as exc:
        await context.error("Correlation analysis failed", error=str(exc))
        raise
|
179
|
+
|
180
|
+
|
181
|
+
@tool(
    name="logistic_regression",
    input_schema={
        "type": "object",
        "properties": {
            "data": table_schema(),
            "formula": formula_schema(),
            "family": {
                "type": "string",
                "enum": ["binomial", "poisson", "gamma", "inverse.gaussian"],
                "description": "Error distribution family"
            },
            "link": {
                "type": "string",
                "enum": ["logit", "probit", "cloglog", "cauchit"],
                "description": "Link function for binomial family"
            }
        },
        "required": ["data", "formula"]
    },
    description="Fit generalized linear model (logistic regression)"
)
async def logistic_regression(context, params):
    """Fit a generalized linear model with R's ``glm``.

    Despite the name, the schema also allows ``poisson``, ``gamma`` and
    ``inverse.gaussian`` families. The ``link`` argument is only applied to
    the binomial family; other families use their R default link.

    Args:
        context: Request context; only its awaitable ``info`` and ``error``
            logging methods are used here.
        params: Tool arguments. ``data`` and ``formula`` are required by the
            schema; ``family`` defaults to ``"binomial"`` and ``link`` to
            ``"logit"`` inside the R script.

    Returns:
        Whatever ``execute_r_script`` returns. It is used as a mapping below
        (``.get``); presumably it is the R ``result`` list built by the
        script -- confirm against ``r_integration``. For the binomial family
        the script additionally sets ``odds_ratios``, ``mcfadden_r_squared``,
        ``predicted_probabilities`` and, when the response is coded 0/1,
        ``accuracy`` and ``confusion_matrix``.

    Raises:
        Exception: Any failure is reported via ``context.error`` and
            re-raised unchanged.
    """

    await context.info("Fitting logistic regression model")

    r_script = '''
    data <- as.data.frame(args$data)
    formula <- as.formula(args$formula)
    family <- args$family %||% "binomial"
    link <- args$link %||% "logit"

    # Prepare family specification
    if (family == "binomial") {
        family_spec <- binomial(link = link)
    } else {
        # The schema exposes lower-case "gamma", but the R family constructor
        # is Gamma(); gamma() is the mathematical gamma function and cannot
        # be called without arguments.
        family_name <- if (family == "gamma") "Gamma" else family
        family_spec <- get(family_name)()
    }

    # Fit GLM
    model <- glm(formula, data = data, family = family_spec)
    summary_model <- summary(model)
    coef_table <- summary_model$coefficients

    # Additional diagnostics for logistic regression
    if (family == "binomial") {
        # Odds ratios
        odds_ratios <- exp(coef(model))

        # McFadden's R-squared
        ll_null <- logLik(glm(update(formula, . ~ 1), data = data, family = family_spec))
        ll_model <- logLik(model)
        mcfadden_r2 <- 1 - (ll_model / ll_null)

        # Predicted probabilities
        predicted_probs <- fitted(model)
        predicted_classes <- ifelse(predicted_probs > 0.5, 1, 0)

        # Confusion matrix (if binary outcome)
        actual <- model.response(model.frame(model))
        if (all(actual %in% c(0, 1))) {
            # Fix both factors to levels 0/1 so the table is always 2x2, even
            # when every prediction falls in one class; otherwise diag() would
            # pick the wrong cells.
            confusion <- table(
                actual = factor(actual, levels = c(0, 1)),
                predicted_classes = factor(predicted_classes, levels = c(0, 1))
            )
            accuracy <- sum(diag(confusion)) / sum(confusion)
        } else {
            confusion <- NULL
            accuracy <- NULL
        }
    }

    # Columns 3 and 4 of the coefficient table are the test statistic and its
    # p-value. They are labelled "z value"/"Pr(>|z|)" for binomial and poisson
    # but "t value"/"Pr(>|t|)" for families with an estimated dispersion
    # (Gamma, inverse.gaussian), so select them by position.
    result <- list(
        coefficients = as.list(coef(model)),
        std_errors = as.list(coef_table[, "Std. Error"]),
        z_values = as.list(coef_table[, 3]),
        p_values = as.list(coef_table[, 4]),
        deviance = model$deviance,
        null_deviance = model$null.deviance,
        aic = AIC(model),
        bic = BIC(model),
        fitted_values = as.numeric(fitted(model)),
        residuals = as.numeric(residuals(model, type = "deviance")),
        n_obs = nobs(model),
        family = family,
        # Report the link that was actually used, not the binomial default.
        link = family_spec$link
    )

    # Add logistic-specific results
    if (family == "binomial") {
        result$odds_ratios <- as.list(odds_ratios)
        result$mcfadden_r_squared <- as.numeric(mcfadden_r2)
        result$predicted_probabilities <- as.numeric(predicted_probs)
        if (!is.null(accuracy)) {
            result$accuracy <- accuracy
            result$confusion_matrix <- as.list(as.data.frame.matrix(confusion))
        }
    }
    '''

    try:
        result = execute_r_script(r_script, params)
        await context.info("Logistic regression fitted successfully",
                          aic=result.get("aic"),
                          n_obs=result.get("n_obs"))
        return result

    except Exception as e:
        await context.error("Logistic regression fitting failed", error=str(e))
        raise
|
@@ -0,0 +1,332 @@
|
|
1
|
+
"""
|
2
|
+
Statistical hypothesis testing tools for RMCP.
|
3
|
+
|
4
|
+
Comprehensive statistical testing capabilities.
|
5
|
+
"""
|
6
|
+
|
7
|
+
from typing import Dict, Any
|
8
|
+
from ..registries.tools import tool
|
9
|
+
from ..core.schemas import table_schema
|
10
|
+
from ..r_integration import execute_r_script
|
11
|
+
|
12
|
+
|
13
|
+
@tool(
    name="t_test",
    input_schema={
        "type": "object",
        "properties": {
            "data": table_schema(),
            "variable": {"type": "string"},
            "group": {"type": "string"},
            "mu": {"type": "number", "default": 0},
            "alternative": {"type": "string", "enum": ["two.sided", "less", "greater"], "default": "two.sided"},
            "paired": {"type": "boolean", "default": False},
            "var_equal": {"type": "boolean", "default": True}
        },
        "required": ["data", "variable"]
    },
    description="Perform t-tests (one-sample, two-sample, paired)"
)
async def t_test(context, params):
    """Run a one-sample, two-sample or paired t-test with R's ``t.test``.

    A one-sample test of ``variable`` against ``mu`` is run when ``group`` is
    absent. Otherwise ``variable`` is split by the two levels of ``group``
    and the samples are compared (paired when ``paired`` is true).

    Args:
        context: Request context; only its awaitable ``info`` and ``error``
            logging methods are used here.
        params: Tool arguments. ``data`` and ``variable`` are required by the
            schema; ``group``, ``mu``, ``alternative``, ``paired`` and
            ``var_equal`` are optional and defaulted inside the R script.

    Returns:
        Whatever ``execute_r_script`` returns; presumably the R ``result``
        list built by the script -- confirm against ``r_integration``.

    Raises:
        Exception: Any failure (including the script's ``stop`` calls,
            assuming execute_r_script surfaces R errors as exceptions) is
            reported via ``context.error`` and re-raised.
    """

    await context.info("Performing t-test")

    r_script = '''
    data <- as.data.frame(args$data)
    variable <- args$variable
    group <- args$group
    mu <- args$mu %||% 0
    alternative <- args$alternative %||% "two.sided"
    paired <- args$paired %||% FALSE
    var_equal <- args$var_equal %||% TRUE

    if (is.null(group)) {
        # One-sample t-test
        test_result <- t.test(data[[variable]], mu = mu, alternative = alternative)
        test_type <- "One-sample t-test"

        result <- list(
            test_type = test_type,
            statistic = as.numeric(test_result$statistic),
            df = test_result$parameter,
            p_value = test_result$p.value,
            confidence_interval = as.numeric(test_result$conf.int),
            mean = as.numeric(test_result$estimate),
            null_value = mu,
            alternative = alternative,
            n_obs = length(data[[variable]][!is.na(data[[variable]])])
        )

    } else {
        # Two-sample t-test
        group_values <- data[[group]]
        unique_groups <- unique(group_values[!is.na(group_values)])

        if (length(unique_groups) != 2) {
            stop("Group variable must have exactly 2 levels")
        }

        # which() drops rows whose group is NA. A bare logical index would
        # turn each such row into an NA observation in BOTH samples.
        x <- data[[variable]][which(group_values == unique_groups[1])]
        y <- data[[variable]][which(group_values == unique_groups[2])]

        if (paired) {
            if (length(x) != length(y)) {
                stop("Paired t-test requires the same number of observations in both groups")
            }
            # Keep complete pairs only so the reported means and counts
            # describe exactly the data that was tested.
            complete_pairs <- !is.na(x) & !is.na(y)
            x <- x[complete_pairs]
            y <- y[complete_pairs]
        }

        test_result <- t.test(x, y, mu = mu, alternative = alternative,
                              paired = paired, var.equal = var_equal)
        test_type <- if (paired) "Paired t-test" else "Two-sample t-test"

        # Compute the group means directly: a paired test returns a single
        # estimate (the mean difference), so estimate[2] would be NA.
        mean_x <- mean(x, na.rm = TRUE)
        mean_y <- mean(y, na.rm = TRUE)

        result <- list(
            test_type = test_type,
            statistic = as.numeric(test_result$statistic),
            df = test_result$parameter,
            p_value = test_result$p.value,
            confidence_interval = as.numeric(test_result$conf.int),
            mean_x = mean_x,
            mean_y = mean_y,
            mean_difference = mean_x - mean_y,
            null_value = mu,
            groups = unique_groups,
            alternative = alternative,
            paired = paired,
            n_obs_x = length(x[!is.na(x)]),
            n_obs_y = length(y[!is.na(y)])
        )
    }
    '''

    try:
        result = execute_r_script(r_script, params)
        await context.info("T-test completed successfully")
        return result

    except Exception as e:
        await context.error("T-test failed", error=str(e))
        raise
|
102
|
+
|
103
|
+
|
104
|
+
@tool(
    name="anova",
    input_schema={
        "type": "object",
        "properties": {
            "data": table_schema(),
            "formula": {"type": "string"},
            "type": {"type": "string", "enum": ["I", "II", "III"], "default": "I"}
        },
        "required": ["data", "formula"]
    },
    description="Analysis of Variance (ANOVA) with multiple types"
)
async def anova(context, params):
    """Fit ``lm(formula)`` in R and return its ANOVA table.

    Type I uses base R ``anova``; Types II and III use ``Anova`` from the
    ``car`` package, which the script tries to install when it is missing.

    Args:
        context: Request context; only its awaitable ``info`` and ``error``
            logging methods are used here.
        params: Tool arguments. ``data`` and ``formula`` are required by the
            schema; ``type`` defaults to ``"I"`` inside the R script.

    Returns:
        Whatever ``execute_r_script`` returns; presumably the R ``result``
        list built by the script (``anova_table``, ``model_summary``,
        ``formula``, ``anova_type``) -- confirm against ``r_integration``.

    Raises:
        Exception: Any failure is reported via ``context.error`` and
            re-raised unchanged.
    """

    await context.info("Performing ANOVA")

    r_script = '''
    data <- as.data.frame(args$data)
    formula <- as.formula(args$formula)
    anova_type <- args$type %||% "I"

    # Fit the model
    model <- lm(formula, data = data)

    # Perform ANOVA
    if (anova_type == "I") {
        anova_table <- anova(model)
    } else {
        if (!require(car)) install.packages("car", quietly = TRUE)
        library(car)
        # Pass the Roman numeral straight through: Anova() accepts "II" and
        # "III". Converting the first character with as.numeric() gave
        # as.numeric("I"), which is NA, so Type II/III could never run.
        anova_table <- Anova(model, type = anova_type)
    }

    # Extract ANOVA table. car tables have no "Mean Sq" column, so derive it
    # from whichever sum-of-squares column is present.
    df <- anova_table[["Df"]]
    sum_sq <- anova_table[["Sum Sq"]] %||% anova_table[["Sum of Sq"]]
    mean_sq <- anova_table[["Mean Sq"]] %||% (sum_sq / df)

    model_summary <- summary(model)

    result <- list(
        anova_table = list(
            terms = rownames(anova_table),
            df = df,
            sum_sq = sum_sq,
            mean_sq = mean_sq,
            f_value = anova_table[["F value"]] %||% anova_table[["F"]],
            p_value = anova_table[["Pr(>F)"]]
        ),
        model_summary = list(
            r_squared = model_summary$r.squared,
            adj_r_squared = model_summary$adj.r.squared,
            residual_se = model_summary$sigma,
            df_residual = model_summary$df[2],
            n_obs = nrow(model$model)
        ),
        # deparse() splits long formulas into several strings; rejoin them so
        # the field is always a single string.
        formula = paste(deparse(formula), collapse = " "),
        anova_type = paste("Type", anova_type)
    )
    '''

    try:
        result = execute_r_script(r_script, params)
        await context.info("ANOVA completed successfully")
        return result

    except Exception as e:
        await context.error("ANOVA failed", error=str(e))
        raise
|
170
|
+
|
171
|
+
|
172
|
+
@tool(
    name="chi_square_test",
    input_schema={
        "type": "object",
        "properties": {
            "data": table_schema(),
            "x": {"type": "string"},
            "y": {"type": "string"},
            "test_type": {"type": "string", "enum": ["independence", "goodness_of_fit"], "default": "independence"},
            "expected": {"type": "array", "items": {"type": "number"}}
        },
        "required": ["data"]
    },
    description="Chi-square tests for independence and goodness of fit"
)
async def chi_square_test(context, params):
    """Run a chi-square test of independence or goodness of fit in R.

    Args:
        context: Request context; only its awaitable ``info`` and ``error``
            logging methods are used here.
        params: Tool arguments. ``data`` is required by the schema. The
            independence test needs both ``x`` and ``y``; the goodness-of-fit
            test needs ``x`` and optionally ``expected``, given either as
            probabilities or as expected counts (they are rescaled to sum
            to 1). ``test_type`` defaults to ``"independence"``.

    Returns:
        Whatever ``execute_r_script`` returns; presumably the R ``result``
        list built by the script -- confirm against ``r_integration``.

    Raises:
        Exception: Any failure (including the script's ``stop`` calls for
            missing variables, assuming execute_r_script surfaces R errors as
            exceptions) is reported via ``context.error`` and re-raised.
    """

    await context.info("Performing chi-square test")

    r_script = '''
    data <- as.data.frame(args$data)
    x_var <- args$x
    y_var <- args$y
    test_type <- args$test_type %||% "independence"
    expected <- args$expected

    if (test_type == "independence") {
        if (is.null(x_var) || is.null(y_var)) {
            stop("Both x and y variables required for independence test")
        }

        # Create contingency table
        cont_table <- table(data[[x_var]], data[[y_var]])
        test_result <- chisq.test(cont_table)

        result <- list(
            test_type = "Chi-square test of independence",
            contingency_table = as.matrix(cont_table),
            statistic = as.numeric(test_result$statistic),
            df = test_result$parameter,
            p_value = test_result$p.value,
            expected_frequencies = as.matrix(test_result$expected),
            residuals = as.matrix(test_result$residuals),
            x_variable = x_var,
            y_variable = y_var,
            # as.numeric() drops the "X-squared" name inherited from the statistic
            cramers_v = as.numeric(sqrt(test_result$statistic / (sum(cont_table) * (min(dim(cont_table)) - 1))))
        )

    } else {
        # Goodness of fit test
        if (is.null(x_var)) {
            stop("x variable required for goodness of fit test")
        }

        observed <- table(data[[x_var]])

        if (!is.null(expected)) {
            # rescale.p lets callers pass expected counts or proportions that
            # do not sum exactly to 1; without it chisq.test() rejects them.
            expected_p <- as.numeric(unlist(expected))
            test_result <- chisq.test(observed, p = expected_p, rescale.p = TRUE)
        } else {
            test_result <- chisq.test(observed)
        }

        result <- list(
            test_type = "Chi-square goodness of fit test",
            observed_frequencies = as.numeric(observed),
            expected_frequencies = as.numeric(test_result$expected),
            statistic = as.numeric(test_result$statistic),
            df = test_result$parameter,
            p_value = test_result$p.value,
            residuals = as.numeric(test_result$residuals),
            categories = names(observed)
        )
    }
    '''

    try:
        result = execute_r_script(r_script, params)
        await context.info("Chi-square test completed successfully")
        return result

    except Exception as e:
        await context.error("Chi-square test failed", error=str(e))
        raise
|
256
|
+
|
257
|
+
|
258
|
+
@tool(
    name="normality_test",
    description="Test variables for normality (Shapiro-Wilk, Jarque-Bera, Anderson-Darling)",
    input_schema={
        "type": "object",
        "properties": {
            "data": table_schema(),
            "variable": {"type": "string"},
            "test": {"type": "string", "enum": ["shapiro", "jarque_bera", "anderson"], "default": "shapiro"},
        },
        "required": ["data", "variable"],
    },
)
async def normality_test(context, params):
    """Run a normality test on one column and attach descriptive moments.

    The R script drops missing values, runs the requested test
    (``shapiro.test``, ``tseries::jarque.bera.test`` or ``nortest::ad.test``)
    and flags ``is_normal`` when the p-value exceeds 0.05. It then adds the
    sample size, mean, standard deviation, skewness and excess kurtosis.

    Args:
        context: Request context; only its awaitable ``info`` and ``error``
            logging methods are used here.
        params: Tool arguments. ``data`` and ``variable`` are required by the
            schema; ``test`` defaults to ``"shapiro"`` inside the R script.

    Returns:
        Whatever ``execute_r_script`` returns; presumably the R ``result``
        list built by the script -- confirm against ``r_integration``.

    Raises:
        Exception: Any failure is reported via ``context.error`` and
            re-raised unchanged.
    """
    await context.info("Testing for normality")

    script = '''
    data <- as.data.frame(args$data)
    variable <- args$variable
    test_type <- args$test %||% "shapiro"

    values <- data[[variable]]
    values <- values[!is.na(values)]

    if (test_type == "shapiro") {
        test_result <- shapiro.test(values)
        result <- list(
            test_name = "Shapiro-Wilk normality test",
            statistic = as.numeric(test_result$statistic),
            p_value = test_result$p.value,
            is_normal = test_result$p.value > 0.05
        )

    } else if (test_type == "jarque_bera") {
        if (!require(tseries)) install.packages("tseries", quietly = TRUE)
        library(tseries)
        test_result <- jarque.bera.test(values)
        result <- list(
            test_name = "Jarque-Bera normality test",
            statistic = as.numeric(test_result$statistic),
            df = test_result$parameter,
            p_value = test_result$p.value,
            is_normal = test_result$p.value > 0.05
        )

    } else if (test_type == "anderson") {
        if (!require(nortest)) install.packages("nortest", quietly = TRUE)
        library(nortest)
        test_result <- ad.test(values)
        result <- list(
            test_name = "Anderson-Darling normality test",
            statistic = as.numeric(test_result$statistic),
            p_value = test_result$p.value,
            is_normal = test_result$p.value > 0.05
        )
    }

    result$variable <- variable
    result$n_obs <- length(values)
    result$mean <- mean(values)
    result$sd <- sd(values)
    result$skewness <- (sum((values - mean(values))^3) / length(values)) / (sd(values)^3)
    result$kurtosis <- (sum((values - mean(values))^4) / length(values)) / (sd(values)^4) - 3
    '''

    try:
        outcome = execute_r_script(script, params)
        await context.info("Normality test completed successfully")
        return outcome
    except Exception as exc:
        await context.error("Normality test failed", error=str(exc))
        raise
|