mseep-rmcp 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,287 @@
1
+ """
2
+ Regression analysis tools for RMCP MCP server.
3
+
4
+ Production-ready statistical modeling tools using the new registry architecture.
5
+ """
6
+
7
+ from typing import Dict, Any
8
+ from ..registries.tools import tool
9
+ from ..core.schemas import table_schema, formula_schema
10
+ from ..r_integration import execute_r_script
11
+
12
+
@tool(
    name="linear_model",
    input_schema={
        "type": "object",
        "properties": {
            "data": table_schema(required_columns=None),
            "formula": formula_schema(),
            "weights": {"type": "array", "items": {"type": "number"}},
            "na_action": {"type": "string", "enum": ["na.omit", "na.exclude", "na.fail"]},
        },
        "required": ["data", "formula"]
    },
    description="Fit linear regression model with comprehensive diagnostics"
)
async def linear_model(context, params):
    """Fit an ordinary (optionally weighted) least-squares model in R.

    Args:
        context: Request context; its awaitable ``info`` and ``error``
            methods are used for progress and failure logging.
        params: Tool arguments matching the input schema above (``data``,
            ``formula``, optional ``weights`` and ``na_action``). The R
            script reads them as ``args$...`` -- presumably
            ``execute_r_script`` performs that binding; confirm there.

    Returns:
        Whatever ``execute_r_script`` returns for the R ``result`` list:
        coefficients with standard errors, t and p values, R-squared
        figures, the overall F test, residual summary, fitted values,
        residuals and the number of observations used.

    Raises:
        Exception: Any failure from ``execute_r_script`` is logged through
            ``context.error`` and re-raised unchanged.
    """

    await context.info("Fitting linear regression model")

    r_script = '''
    data <- as.data.frame(args$data)
    formula <- as.formula(args$formula)

    # Handle optional parameters
    weights <- args$weights
    na_action <- args$na_action %||% "na.omit"

    # Only resolve known NA handlers by name; anything else is rejected
    if (!(na_action %in% c("na.omit", "na.exclude", "na.fail"))) {
        stop(paste("Unsupported na_action:", na_action))
    }

    # Fit model (an empty weights array is treated as "no weights")
    if (!is.null(weights) && length(weights) > 0) {
        # Weights may arrive as a list after deserialisation
        weights <- as.numeric(unlist(weights))
        model <- lm(formula, data = data, weights = weights, na.action = get(na_action))
    } else {
        model <- lm(formula, data = data, na.action = get(na_action))
    }

    # Get comprehensive results
    summary_model <- summary(model)
    coef_table <- summary_model$coefficients

    # Intercept-only models have no overall F statistic
    fstat <- summary_model$fstatistic
    if (is.null(fstat)) {
        f_value <- NA
        f_pvalue <- NA
    } else {
        f_value <- as.numeric(fstat[1])
        f_pvalue <- as.numeric(pf(fstat[1], fstat[2], fstat[3], lower.tail = FALSE))
    }

    result <- list(
        coefficients = as.list(coef(model)),
        std_errors = as.list(coef_table[, "Std. Error"]),
        t_values = as.list(coef_table[, "t value"]),
        p_values = as.list(coef_table[, "Pr(>|t|)"]),
        r_squared = summary_model$r.squared,
        adj_r_squared = summary_model$adj.r.squared,
        fstatistic = f_value,
        f_pvalue = f_pvalue,
        residual_se = summary_model$sigma,
        df_residual = summary_model$df[2],
        fitted_values = as.numeric(fitted(model)),
        residuals = as.numeric(residuals(model)),
        n_obs = nrow(model$model),
        method = "lm"
    )
    '''

    try:
        result = execute_r_script(r_script, params)
        await context.info("Linear model fitted successfully",
                           r_squared=result.get("r_squared"),
                           n_obs=result.get("n_obs"))
        return result

    except Exception as e:
        # Log for the client, then let the caller see the original error.
        await context.error("Linear model fitting failed", error=str(e))
        raise
80
+
81
+
@tool(
    name="correlation_analysis",
    input_schema={
        "type": "object",
        "properties": {
            "data": table_schema(),
            "variables": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Variables to include in correlation analysis"
            },
            "method": {
                "type": "string",
                "enum": ["pearson", "spearman", "kendall"],
                "description": "Correlation method"
            },
            "use": {
                "type": "string",
                "enum": ["everything", "all.obs", "complete.obs", "na.or.complete", "pairwise.complete.obs"],
                "description": "Missing value handling"
            }
        },
        "required": ["data"]
    },
    description="Comprehensive correlation analysis with significance tests"
)
async def correlation_analysis(context, params):
    """Compute a correlation matrix plus pairwise significance tests in R.

    Args:
        context: Request context; its awaitable ``info`` and ``error``
            methods are used for logging.
        params: Tool arguments matching the input schema above. ``use``
            only affects the correlation matrix; the pairwise tests always
            run on complete pairs because ``cor.test`` has no such option.

    Returns:
        The R ``result`` list as returned by ``execute_r_script``:
        ``correlation_matrix``, ``significance_tests`` (keyed
        ``"<var1>_<var2>"``), ``method``, ``n_obs`` (row count of the
        numeric data) and ``variables``.

    Raises:
        Exception: Any failure from ``execute_r_script`` is logged through
            ``context.error`` and re-raised unchanged.
    """

    await context.info("Computing correlation matrix")

    r_script = '''
    data <- as.data.frame(args$data)
    variables <- args$variables
    method <- args$method %||% "pearson"
    use <- args$use %||% "complete.obs"

    # Select variables if specified
    if (!is.null(variables)) {
        # Variable names may arrive as a list after deserialisation
        variables <- as.character(unlist(variables))

        # Validate variables exist
        missing_vars <- setdiff(variables, names(data))
        if (length(missing_vars) > 0) {
            stop(paste("Variables not found:", paste(missing_vars, collapse = ", ")))
        }
        data <- data[, variables, drop = FALSE]
    }

    # Select only numeric variables
    numeric_vars <- sapply(data, is.numeric)
    if (sum(numeric_vars) < 2) {
        stop("Need at least 2 numeric variables for correlation analysis")
    }
    numeric_data <- data[, numeric_vars, drop = FALSE]

    # Compute correlation matrix
    cor_matrix <- cor(numeric_data, method = method, use = use)

    # Compute significance tests
    n <- nrow(numeric_data)
    n_vars <- ncol(numeric_data)
    cor_test_results <- list()

    for (i in seq_len(n_vars - 1)) {
        for (j in (i + 1):n_vars) {
            var1 <- names(numeric_data)[i]
            var2 <- names(numeric_data)[j]
            col1 <- numeric_data[[i]]
            col2 <- numeric_data[[j]]

            # cor.test has no `use` argument; it always drops incomplete pairs
            test_result <- cor.test(col1, col2, method = method)
            conf_int <- test_result$conf.int

            cor_test_results[[paste(var1, var2, sep = "_")]] <- list(
                correlation = as.numeric(test_result$estimate),
                p_value = test_result$p.value,
                conf_int_lower = if (!is.null(conf_int)) conf_int[1] else NA,
                conf_int_upper = if (!is.null(conf_int)) conf_int[2] else NA,
                n_pairs = sum(complete.cases(col1, col2))
            )
        }
    }

    result <- list(
        correlation_matrix = as.list(as.data.frame(cor_matrix)),
        significance_tests = cor_test_results,
        method = method,
        n_obs = n,
        variables = names(numeric_data)
    )
    '''

    try:
        result = execute_r_script(r_script, params)
        await context.info("Correlation analysis completed",
                           n_variables=len(result.get("variables", [])),
                           method=result.get("method"))
        return result

    except Exception as e:
        # Log for the client, then let the caller see the original error.
        await context.error("Correlation analysis failed", error=str(e))
        raise
179
+
180
+
@tool(
    name="logistic_regression",
    input_schema={
        "type": "object",
        "properties": {
            "data": table_schema(),
            "formula": formula_schema(),
            "family": {
                "type": "string",
                "enum": ["binomial", "poisson", "gamma", "inverse.gaussian"],
                "description": "Error distribution family"
            },
            "link": {
                "type": "string",
                "enum": ["logit", "probit", "cloglog", "cauchit"],
                "description": "Link function for binomial family"
            }
        },
        "required": ["data", "formula"]
    },
    description="Fit generalized linear model (logistic regression)"
)
async def logistic_regression(context, params):
    """Fit a generalized linear model in R (logistic regression by default).

    Args:
        context: Request context; its awaitable ``info`` and ``error``
            methods are used for logging.
        params: Tool arguments matching the input schema above. ``link`` is
            only applied to the binomial family; other families use their
            default link, which is what the result reports.

    Returns:
        The R ``result`` list as returned by ``execute_r_script``:
        coefficient table, deviances, AIC/BIC, fitted values, deviance
        residuals, ``n_obs``, ``family``, ``link`` and ``statistic_type``.
        ``z_values`` and ``p_values`` hold t statistics for families whose
        summary reports "t value" (``statistic_type`` says which). Binomial
        fits add odds ratios, McFadden's R-squared and predicted
        probabilities, plus accuracy and a confusion matrix for 0/1
        outcomes.

    Raises:
        Exception: Any failure from ``execute_r_script`` is logged through
            ``context.error`` and re-raised unchanged.
    """

    await context.info("Fitting logistic regression model")

    r_script = '''
    data <- as.data.frame(args$data)
    formula <- as.formula(args$formula)
    family <- args$family %||% "binomial"
    link <- args$link %||% "logit"

    # Map schema names to R family constructors. Note that get("gamma")
    # would return the gamma *function*; the GLM family is Gamma.
    family_constructors <- list(
        binomial = binomial,
        poisson = poisson,
        gamma = Gamma,
        inverse.gaussian = inverse.gaussian
    )
    if (!(family %in% names(family_constructors))) {
        stop(paste("Unsupported family:", family))
    }

    # Prepare family specification
    if (family == "binomial") {
        family_spec <- binomial(link = link)
    } else {
        family_spec <- family_constructors[[family]]()
        # Report the link that is actually used, not the binomial default
        link <- family_spec$link
    }

    # Fit GLM
    model <- glm(formula, data = data, family = family_spec)
    summary_model <- summary(model)
    coef_table <- summary_model$coefficients

    # Families with an estimated dispersion report t statistics, not z
    stat_col <- if ("z value" %in% colnames(coef_table)) "z value" else "t value"
    p_col <- if ("Pr(>|z|)" %in% colnames(coef_table)) "Pr(>|z|)" else "Pr(>|t|)"

    # Additional diagnostics for logistic regression
    if (family == "binomial") {
        # Odds ratios
        odds_ratios <- exp(coef(model))

        # McFadden's R-squared
        ll_null <- logLik(glm(update(formula, . ~ 1), data = data, family = family_spec))
        ll_model <- logLik(model)
        mcfadden_r2 <- 1 - (as.numeric(ll_model) / as.numeric(ll_null))

        # Predicted probabilities
        predicted_probs <- as.numeric(fitted(model))
        predicted_classes <- ifelse(predicted_probs > 0.5, 1, 0)

        # Confusion matrix (if binary outcome)
        actual <- model.response(model.frame(model))
        if (all(actual %in% c(0, 1))) {
            actual_num <- if (is.factor(actual)) as.numeric(as.character(actual)) else as.numeric(actual)
            # Fix both dimensions at levels 0/1 so the table is always 2x2;
            # otherwise diag() is wrong when only one class is predicted.
            confusion <- table(
                actual = factor(actual_num, levels = c(0, 1)),
                predicted_classes = factor(predicted_classes, levels = c(0, 1))
            )
            accuracy <- sum(diag(confusion)) / sum(confusion)
        } else {
            confusion <- NULL
            accuracy <- NULL
        }
    }

    result <- list(
        coefficients = as.list(coef(model)),
        std_errors = as.list(coef_table[, "Std. Error"]),
        z_values = as.list(coef_table[, stat_col]),
        p_values = as.list(coef_table[, p_col]),
        statistic_type = stat_col,
        deviance = model$deviance,
        null_deviance = model$null.deviance,
        aic = AIC(model),
        bic = BIC(model),
        fitted_values = as.numeric(fitted(model)),
        residuals = as.numeric(residuals(model, type = "deviance")),
        n_obs = nobs(model),
        family = family,
        link = link
    )

    # Add logistic-specific results
    if (family == "binomial") {
        result$odds_ratios <- as.list(odds_ratios)
        result$mcfadden_r_squared <- as.numeric(mcfadden_r2)
        result$predicted_probabilities <- predicted_probs
        if (!is.null(accuracy)) {
            result$accuracy <- accuracy
            result$confusion_matrix <- as.list(as.data.frame.matrix(confusion))
        }
    }
    '''

    try:
        result = execute_r_script(r_script, params)
        await context.info("Logistic regression fitted successfully",
                           aic=result.get("aic"),
                           n_obs=result.get("n_obs"))
        return result

    except Exception as e:
        # Log for the client, then let the caller see the original error.
        await context.error("Logistic regression fitting failed", error=str(e))
        raise
@@ -0,0 +1,332 @@
1
+ """
2
+ Statistical hypothesis testing tools for RMCP.
3
+
4
+ Comprehensive statistical testing capabilities.
5
+ """
6
+
7
+ from typing import Dict, Any
8
+ from ..registries.tools import tool
9
+ from ..core.schemas import table_schema
10
+ from ..r_integration import execute_r_script
11
+
12
+
@tool(
    name="t_test",
    input_schema={
        "type": "object",
        "properties": {
            "data": table_schema(),
            "variable": {"type": "string"},
            "group": {"type": "string"},
            "mu": {"type": "number", "default": 0},
            "alternative": {"type": "string", "enum": ["two.sided", "less", "greater"], "default": "two.sided"},
            "paired": {"type": "boolean", "default": False},
            "var_equal": {"type": "boolean", "default": True}
        },
        "required": ["data", "variable"]
    },
    description="Perform t-tests (one-sample, two-sample, paired)"
)
async def t_test(context, params):
    """Run a one-sample, two-sample or paired t-test in R.

    A one-sample test of ``variable`` against ``mu`` is run when ``group``
    is absent. Otherwise ``variable`` is split by the two levels of
    ``group`` and compared; rows whose group is missing are ignored.

    Args:
        context: Request context; its awaitable ``info`` and ``error``
            methods are used for logging.
        params: Tool arguments matching the input schema above.

    Returns:
        The R ``result`` list as returned by ``execute_r_script``: test
        type, statistic, degrees of freedom, p-value, confidence interval,
        the estimated mean(s) and observation counts.

    Raises:
        Exception: Any failure from ``execute_r_script`` (including an
            unknown column or a group without exactly two levels) is logged
            through ``context.error`` and re-raised unchanged.
    """

    await context.info("Performing t-test")

    r_script = '''
    data <- as.data.frame(args$data)
    variable <- args$variable
    group <- args$group
    mu <- args$mu %||% 0
    alternative <- args$alternative %||% "two.sided"
    paired <- args$paired %||% FALSE
    var_equal <- args$var_equal %||% TRUE

    if (!(variable %in% names(data))) {
        stop(paste("Variable not found:", variable))
    }

    if (is.null(group)) {
        # One-sample t-test
        values <- data[[variable]]
        test_result <- t.test(values, mu = mu, alternative = alternative)
        test_type <- "One-sample t-test"

        result <- list(
            test_type = test_type,
            statistic = as.numeric(test_result$statistic),
            df = as.numeric(test_result$parameter),
            p_value = test_result$p.value,
            confidence_interval = as.numeric(test_result$conf.int),
            mean = as.numeric(test_result$estimate),
            null_value = mu,
            alternative = alternative,
            n_obs = sum(!is.na(values))
        )

    } else {
        # Two-sample t-test
        if (!(group %in% names(data))) {
            stop(paste("Group variable not found:", group))
        }
        group_values <- data[[group]]
        unique_groups <- unique(group_values[!is.na(group_values)])

        if (length(unique_groups) != 2) {
            stop("Group variable must have exactly 2 levels")
        }

        # which() drops rows with a missing group label; plain logical
        # indexing would turn each of them into an NA observation.
        x <- data[[variable]][which(group_values == unique_groups[1])]
        y <- data[[variable]][which(group_values == unique_groups[2])]

        test_result <- t.test(x, y, mu = mu, alternative = alternative,
                              paired = paired, var.equal = var_equal)
        test_type <- if (paired) "Paired t-test" else "Two-sample t-test"

        # A paired test reports only the mean difference in `estimate`,
        # so the group means are computed directly.
        if (paired) {
            pair_ok <- complete.cases(x, y)
            mean_x <- mean(x[pair_ok])
            mean_y <- mean(y[pair_ok])
        } else {
            mean_x <- mean(x, na.rm = TRUE)
            mean_y <- mean(y, na.rm = TRUE)
        }

        result <- list(
            test_type = test_type,
            statistic = as.numeric(test_result$statistic),
            df = as.numeric(test_result$parameter),
            p_value = test_result$p.value,
            confidence_interval = as.numeric(test_result$conf.int),
            mean_x = mean_x,
            mean_y = mean_y,
            mean_difference = mean_x - mean_y,
            groups = if (is.factor(unique_groups)) as.character(unique_groups) else unique_groups,
            null_value = mu,
            alternative = alternative,
            paired = paired,
            n_obs_x = sum(!is.na(x)),
            n_obs_y = sum(!is.na(y))
        )
    }
    '''

    try:
        result = execute_r_script(r_script, params)
        await context.info("T-test completed successfully")
        return result

    except Exception as e:
        # Log for the client, then let the caller see the original error.
        await context.error("T-test failed", error=str(e))
        raise
102
+
103
+
@tool(
    name="anova",
    input_schema={
        "type": "object",
        "properties": {
            "data": table_schema(),
            "formula": {"type": "string"},
            "type": {"type": "string", "enum": ["I", "II", "III"], "default": "I"}
        },
        "required": ["data", "formula"]
    },
    description="Analysis of Variance (ANOVA) with multiple types"
)
async def anova(context, params):
    """Fit a linear model in R and return its ANOVA table.

    Type I uses base R ``anova``; Types II and III use ``car::Anova``. The
    script attempts to install ``car`` when it is not already available.

    Args:
        context: Request context; its awaitable ``info`` and ``error``
            methods are used for logging.
        params: Tool arguments matching the input schema above (``data``,
            ``formula``, optional ``type``).

    Returns:
        The R ``result`` list as returned by ``execute_r_script``:
        ``anova_table`` (terms, df, sums of squares, mean squares, F and
        p values), ``model_summary``, ``formula`` and ``anova_type``.

    Raises:
        Exception: Any failure from ``execute_r_script`` is logged through
            ``context.error`` and re-raised unchanged.
    """

    await context.info("Performing ANOVA")

    r_script = '''
    data <- as.data.frame(args$data)
    formula <- as.formula(args$formula)
    anova_type <- args$type %||% "I"

    if (!(anova_type %in% c("I", "II", "III"))) {
        stop(paste("Unsupported ANOVA type:", anova_type))
    }

    # Fit the model
    model <- lm(formula, data = data)

    # Perform ANOVA
    if (anova_type == "I") {
        anova_table <- anova(model)
    } else {
        if (!require(car)) install.packages("car", quietly = TRUE)
        library(car)
        # car::Anova accepts "II"/"III" directly; converting the first
        # character with as.numeric() would yield NA.
        anova_table <- Anova(model, type = anova_type)
    }

    # Extract ANOVA table (column names differ between anova() and Anova())
    df_values <- anova_table[["Df"]]
    sum_sq <- anova_table[["Sum Sq"]] %||% anova_table[["Sum of Sq"]]
    # car::Anova has no "Mean Sq" column, so derive it
    mean_sq <- anova_table[["Mean Sq"]] %||% (sum_sq / df_values)
    f_value <- anova_table[["F value"]] %||% anova_table[["F"]]
    p_value <- anova_table[["Pr(>F)"]]

    model_stats <- summary(model)

    result <- list(
        anova_table = list(
            terms = rownames(anova_table),
            df = df_values,
            sum_sq = sum_sq,
            mean_sq = mean_sq,
            f_value = f_value,
            p_value = p_value
        ),
        model_summary = list(
            r_squared = model_stats$r.squared,
            adj_r_squared = model_stats$adj.r.squared,
            residual_se = model_stats$sigma,
            df_residual = model_stats$df[2],
            n_obs = nrow(model$model)
        ),
        # deparse() may split long formulas over several strings
        formula = paste(deparse(formula), collapse = " "),
        anova_type = paste("Type", anova_type)
    )
    '''

    try:
        result = execute_r_script(r_script, params)
        await context.info("ANOVA completed successfully")
        return result

    except Exception as e:
        # Log for the client, then let the caller see the original error.
        await context.error("ANOVA failed", error=str(e))
        raise
170
+
171
+
@tool(
    name="chi_square_test",
    input_schema={
        "type": "object",
        "properties": {
            "data": table_schema(),
            "x": {"type": "string"},
            "y": {"type": "string"},
            "test_type": {"type": "string", "enum": ["independence", "goodness_of_fit"], "default": "independence"},
            "expected": {"type": "array", "items": {"type": "number"}}
        },
        "required": ["data"]
    },
    description="Chi-square tests for independence and goodness of fit"
)
async def chi_square_test(context, params):
    """Run a chi-square test of independence or goodness of fit in R.

    Args:
        context: Request context; its awaitable ``info`` and ``error``
            methods are used for logging.
        params: Tool arguments matching the input schema above. The
            independence test needs ``x`` and ``y``; goodness of fit needs
            ``x``. ``expected`` may hold probabilities or frequencies; it
            is rescaled to sum to one.

    Returns:
        The R ``result`` list as returned by ``execute_r_script``:
        statistic, degrees of freedom, p-value, expected frequencies and
        residuals, plus the contingency table and Cramer's V for the
        independence test or the observed counts and category names for
        goodness of fit.

    Raises:
        Exception: Any failure from ``execute_r_script`` is logged through
            ``context.error`` and re-raised unchanged.
    """

    await context.info("Performing chi-square test")

    r_script = '''
    data <- as.data.frame(args$data)
    x_var <- args$x
    y_var <- args$y
    test_type <- args$test_type %||% "independence"
    expected <- args$expected

    if (test_type == "independence") {
        if (is.null(x_var) || is.null(y_var)) {
            stop("Both x and y variables required for independence test")
        }
        missing_vars <- setdiff(c(x_var, y_var), names(data))
        if (length(missing_vars) > 0) {
            stop(paste("Variables not found:", paste(missing_vars, collapse = ", ")))
        }

        # Create contingency table
        cont_table <- table(data[[x_var]], data[[y_var]])
        test_result <- chisq.test(cont_table)
        chi_sq <- as.numeric(test_result$statistic)

        result <- list(
            test_type = "Chi-square test of independence",
            contingency_table = as.matrix(cont_table),
            statistic = chi_sq,
            df = as.numeric(test_result$parameter),
            p_value = test_result$p.value,
            expected_frequencies = as.matrix(test_result$expected),
            residuals = as.matrix(test_result$residuals),
            x_variable = x_var,
            y_variable = y_var,
            cramers_v = sqrt(chi_sq / (sum(cont_table) * (min(dim(cont_table)) - 1)))
        )

    } else {
        # Goodness of fit test
        if (is.null(x_var)) {
            stop("x variable required for goodness of fit test")
        }
        if (!(x_var %in% names(data))) {
            stop(paste("Variables not found:", x_var))
        }

        observed <- table(data[[x_var]])

        if (!is.null(expected)) {
            # May arrive as a list; rescale.p lets callers pass either
            # probabilities or expected frequencies.
            expected <- as.numeric(unlist(expected))
            test_result <- chisq.test(observed, p = expected, rescale.p = TRUE)
        } else {
            test_result <- chisq.test(observed)
        }

        result <- list(
            test_type = "Chi-square goodness of fit test",
            observed_frequencies = as.numeric(observed),
            expected_frequencies = as.numeric(test_result$expected),
            statistic = as.numeric(test_result$statistic),
            df = as.numeric(test_result$parameter),
            p_value = test_result$p.value,
            residuals = as.numeric(test_result$residuals),
            categories = names(observed)
        )
    }
    '''

    try:
        result = execute_r_script(r_script, params)
        await context.info("Chi-square test completed successfully")
        return result

    except Exception as e:
        # Log for the client, then let the caller see the original error.
        await context.error("Chi-square test failed", error=str(e))
        raise
256
+
257
+
@tool(
    name="normality_test",
    input_schema={
        "type": "object",
        "properties": {
            "data": table_schema(),
            "variable": {"type": "string"},
            "test": {"type": "string", "enum": ["shapiro", "jarque_bera", "anderson"], "default": "shapiro"}
        },
        "required": ["data", "variable"]
    },
    description="Test variables for normality (Shapiro-Wilk, Jarque-Bera, Anderson-Darling)"
)
async def normality_test(context, params):
    """Test one variable for normality in R.

    Missing values are removed before testing. Jarque-Bera uses the
    ``tseries`` package and Anderson-Darling uses ``nortest``; the script
    attempts to install either one when it is not already available.

    Args:
        context: Request context; its awaitable ``info`` and ``error``
            methods are used for logging.
        params: Tool arguments matching the input schema above (``data``,
            ``variable``, optional ``test``).

    Returns:
        The R ``result`` list as returned by ``execute_r_script``: test
        name, statistic, p-value, ``is_normal`` (p-value above 0.05), and
        descriptive statistics (n, mean, sd, skewness, excess kurtosis).

    Raises:
        Exception: Any failure from ``execute_r_script`` is logged through
            ``context.error`` and re-raised unchanged.
    """

    await context.info("Testing for normality")

    r_script = '''
    data <- as.data.frame(args$data)
    variable <- args$variable
    test_type <- args$test %||% "shapiro"

    if (!(variable %in% names(data))) {
        stop(paste("Variable not found:", variable))
    }

    values <- data[[variable]]
    values <- values[!is.na(values)]

    if (test_type == "shapiro") {
        test_result <- shapiro.test(values)
        result <- list(
            test_name = "Shapiro-Wilk normality test",
            statistic = as.numeric(test_result$statistic),
            p_value = test_result$p.value,
            is_normal = test_result$p.value > 0.05
        )

    } else if (test_type == "jarque_bera") {
        if (!require(tseries)) install.packages("tseries", quietly = TRUE)
        library(tseries)
        test_result <- jarque.bera.test(values)
        result <- list(
            test_name = "Jarque-Bera normality test",
            statistic = as.numeric(test_result$statistic),
            df = as.numeric(test_result$parameter),
            p_value = test_result$p.value,
            is_normal = test_result$p.value > 0.05
        )

    } else if (test_type == "anderson") {
        if (!require(nortest)) install.packages("nortest", quietly = TRUE)
        library(nortest)
        test_result <- ad.test(values)
        result <- list(
            test_name = "Anderson-Darling normality test",
            statistic = as.numeric(test_result$statistic),
            p_value = test_result$p.value,
            is_normal = test_result$p.value > 0.05
        )

    } else {
        # Without this branch `result` would be undefined below
        stop(paste("Unsupported normality test:", test_type))
    }

    # Descriptive statistics shared by all tests
    n_values <- length(values)
    centered <- values - mean(values)
    value_sd <- sd(values)

    result$variable <- variable
    result$n_obs <- n_values
    result$mean <- mean(values)
    result$sd <- value_sd
    result$skewness <- (sum(centered^3) / n_values) / (value_sd^3)
    result$kurtosis <- (sum(centered^4) / n_values) / (value_sd^4) - 3
    '''

    try:
        result = execute_r_script(r_script, params)
        await context.info("Normality test completed successfully")
        return result

    except Exception as e:
        # Log for the client, then let the caller see the original error.
        await context.error("Normality test failed", error=str(e))
        raise