mseep-rmcp 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,279 @@
+ """
+ Descriptive statistics tools for RMCP.
+
+ Comprehensive data exploration and summary capabilities.
+ """
+
+ from typing import Dict, Any
+ from ..registries.tools import tool
+ from ..core.schemas import table_schema
+ from ..r_integration import execute_r_script
+
+
+ @tool(
+     name="summary_stats",
+     input_schema={
+         "type": "object",
+         "properties": {
+             "data": table_schema(),
+             "variables": {"type": "array", "items": {"type": "string"}},
+             "group_by": {"type": "string"},
+             "percentiles": {"type": "array", "items": {"type": "number"}, "default": [0.25, 0.5, 0.75]}
+         },
+         "required": ["data"]
+     },
+     description="Comprehensive descriptive statistics with optional grouping"
+ )
+ async def summary_stats(context, params):
+     """Compute comprehensive descriptive statistics."""
+
+     await context.info("Computing summary statistics")
+
+     r_script = '''
+     if (!require(dplyr)) install.packages("dplyr", quietly = TRUE)
+     library(dplyr)
+
+     data <- as.data.frame(args$data)
+     variables <- args$variables
+     group_by <- args$group_by
+     percentiles <- args$percentiles %||% c(0.25, 0.5, 0.75)
+
+     # Select variables to analyze
+     if (is.null(variables)) {
+         numeric_vars <- names(data)[sapply(data, is.numeric)]
+         if (length(numeric_vars) == 0) {
+             stop("No numeric variables found in data")
+         }
+         variables <- numeric_vars
+     }
+
+     # Function to compute detailed stats
+     compute_stats <- function(x) {
+         x_clean <- x[!is.na(x)]
+         if (length(x_clean) == 0) {
+             return(list(
+                 n = 0, n_missing = length(x), mean = NA, sd = NA, min = NA, max = NA,
+                 q25 = NA, median = NA, q75 = NA, skewness = NA, kurtosis = NA
+             ))
+         }
+
+         stats <- list(
+             n = length(x_clean),
+             n_missing = sum(is.na(x)),
+             mean = mean(x_clean),
+             sd = sd(x_clean),
+             min = min(x_clean),
+             max = max(x_clean),
+             range = max(x_clean) - min(x_clean),
+             skewness = (sum((x_clean - mean(x_clean))^3) / length(x_clean)) / (sd(x_clean)^3),
+             kurtosis = (sum((x_clean - mean(x_clean))^4) / length(x_clean)) / (sd(x_clean)^4) - 3
+         )
+
+         # Add percentiles
+         for (i in seq_along(percentiles)) {
+             pct_name <- paste0("p", percentiles[i] * 100)
+             stats[[pct_name]] <- quantile(x_clean, percentiles[i])
+         }
+
+         return(stats)
+     }
+
+     if (is.null(group_by)) {
+         # Overall statistics
+         stats_list <- list()
+         for (var in variables) {
+             stats_list[[var]] <- compute_stats(data[[var]])
+         }
+
+         result <- list(
+             statistics = stats_list,
+             variables = variables,
+             n_obs = nrow(data),
+             grouped = FALSE
+         )
+
+     } else {
+         # Grouped statistics
+         grouped_stats <- list()
+         groups <- unique(data[[group_by]][!is.na(data[[group_by]])])
+
+         for (group_val in groups) {
+             group_data <- data[data[[group_by]] == group_val, ]
+             group_stats <- list()
+
+             for (var in variables) {
+                 group_stats[[var]] <- compute_stats(group_data[[var]])
+             }
+
+             grouped_stats[[as.character(group_val)]] <- group_stats
+         }
+
+         result <- list(
+             statistics = grouped_stats,
+             variables = variables,
+             group_by = group_by,
+             groups = as.character(groups),
+             n_obs = nrow(data),
+             grouped = TRUE
+         )
+     }
+     '''
+
+     try:
+         result = execute_r_script(r_script, params)
+         await context.info("Summary statistics computed successfully")
+         return result
+
+     except Exception as e:
+         await context.error("Summary statistics failed", error=str(e))
+         raise
+
+
+ @tool(
+     name="outlier_detection",
+     input_schema={
+         "type": "object",
+         "properties": {
+             "data": table_schema(),
+             "variable": {"type": "string"},
+             "method": {"type": "string", "enum": ["iqr", "z_score", "modified_z"], "default": "iqr"},
+             "threshold": {"type": "number", "minimum": 0, "default": 3.0}
+         },
+         "required": ["data", "variable"]
+     },
+     description="Detect outliers using IQR, Z-score, or Modified Z-score methods"
+ )
+ async def outlier_detection(context, params):
+     """Detect outliers in data."""
+
+     await context.info("Detecting outliers")
+
+     r_script = '''
+     data <- as.data.frame(args$data)
+     variable <- args$variable
+     method <- args$method %||% "iqr"
+     threshold <- args$threshold %||% 3.0
+
+     values <- data[[variable]]
+     values_clean <- values[!is.na(values)]
+
+     if (method == "iqr") {
+         Q1 <- quantile(values_clean, 0.25)
+         Q3 <- quantile(values_clean, 0.75)
+         IQR <- Q3 - Q1
+         lower_bound <- Q1 - 1.5 * IQR
+         upper_bound <- Q3 + 1.5 * IQR
+         outliers <- which(values < lower_bound | values > upper_bound)
+
+         bounds <- list(lower = lower_bound, upper = upper_bound, iqr = IQR)
+
+     } else if (method == "z_score") {
+         mean_val <- mean(values_clean)
+         sd_val <- sd(values_clean)
+         z_scores <- abs((values - mean_val) / sd_val)
+         outliers <- which(z_scores > threshold)
+
+         bounds <- list(threshold = threshold, mean = mean_val, sd = sd_val)
+
+     } else if (method == "modified_z") {
+         median_val <- median(values_clean)
+         mad_val <- mad(values_clean, constant = 1)  # raw MAD; the 0.6745 factor below supplies the modified z-score scaling
+         modified_z <- abs(0.6745 * (values - median_val) / mad_val)
+         outliers <- which(modified_z > threshold)
+
+         bounds <- list(threshold = threshold, median = median_val, mad = mad_val)
+     }
+
+     result <- list(
+         method = method,
+         outlier_indices = outliers,
+         outlier_values = values[outliers],
+         n_outliers = length(outliers),
+         n_obs = length(values[!is.na(values)]),
+         outlier_percentage = length(outliers) / length(values_clean) * 100,
+         bounds = bounds,
+         variable = variable
+     )
+     '''
+
+     try:
+         result = execute_r_script(r_script, params)
+         await context.info("Outlier detection completed successfully")
+         return result
+
+     except Exception as e:
+         await context.error("Outlier detection failed", error=str(e))
+         raise
+
+
+ @tool(
+     name="frequency_table",
+     input_schema={
+         "type": "object",
+         "properties": {
+             "data": table_schema(),
+             "variables": {"type": "array", "items": {"type": "string"}},
+             "include_percentages": {"type": "boolean", "default": True},
+             "sort_by": {"type": "string", "enum": ["frequency", "value"], "default": "frequency"}
+         },
+         "required": ["data", "variables"]
+     },
+     description="Generate frequency tables with counts and percentages"
+ )
+ async def frequency_table(context, params):
+     """Generate frequency tables."""
+
+     await context.info("Creating frequency tables")
+
+     r_script = '''
+     data <- as.data.frame(args$data)
+     variables <- args$variables
+     include_percentages <- args$include_percentages %||% TRUE
+     sort_by <- args$sort_by %||% "frequency"
+
+     freq_tables <- list()
+
+     for (var in variables) {
+         values <- data[[var]]
+         freq_table <- table(values, useNA = "ifany")
+
+         # Sort if requested
+         if (sort_by == "frequency") {
+             freq_table <- sort(freq_table, decreasing = TRUE)
+         }
+
+         freq_data <- list(
+             values = names(freq_table),
+             frequencies = as.numeric(freq_table),
+             n_total = length(values[!is.na(values)])
+         )
+
+         if (include_percentages) {
+             freq_data$percentages <- as.numeric(freq_table) / sum(freq_table) * 100
+         }
+
+         # Add missing value info
+         n_missing <- sum(is.na(values))
+         if (n_missing > 0) {
+             freq_data$n_missing <- n_missing
+             freq_data$missing_percentage <- n_missing / length(values) * 100
+         }
+
+         freq_tables[[var]] <- freq_data
+     }
+
+     result <- list(
+         frequency_tables = freq_tables,
+         variables = variables,
+         total_observations = nrow(data)
+     )
+     '''
+
+     try:
+         result = execute_r_script(r_script, params)
+         await context.info("Frequency tables created successfully")
+         return result
+
+     except Exception as e:
+         await context.error("Frequency table creation failed", error=str(e))
+         raise
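
The three descriptive tools above share one calling convention: an async coroutine that receives a context (used here only for info/error logging) and a params dict matching the declared input_schema, with the actual computation delegated to execute_r_script. Below is a minimal sketch of a direct invocation for local testing. It assumes the @tool decorator leaves the coroutine callable, that the table_schema() payload accepts a column-to-values mapping (the R side calls as.data.frame(args$data)), and that a local R installation with dplyr is reachable by execute_r_script; the StubContext class and the commented module path are illustrative, not part of the package.

import asyncio

class StubContext:
    """Hypothetical stand-in for the context the MCP server would normally supply."""

    async def info(self, message, **fields):
        print("INFO:", message, fields)

    async def error(self, message, **fields):
        print("ERROR:", message, fields)

# Payload matching summary_stats' input_schema; only "data" is required.
params = {
    "data": {
        "mpg": [21.0, 22.8, 18.7, 24.4, 19.2, 17.8],
        "cyl": [6, 4, 6, 4, 8, 6],
    },
    "variables": ["mpg"],
    "percentiles": [0.1, 0.5, 0.9],
}

# from rmcp.tools.descriptives import summary_stats  # module path is a guess
result = asyncio.run(summary_stats(StubContext(), params))
print(result)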
@@ -0,0 +1,250 @@
+ """
+ Econometric analysis tools for RMCP.
+
+ Advanced econometric modeling for panel data, instrumental variables, etc.
+ """
+
+ from typing import Dict, Any
+ from ..registries.tools import tool
+ from ..core.schemas import table_schema, formula_schema
+ from ..r_integration import execute_r_script
+
+
+ @tool(
+     name="panel_regression",
+     input_schema={
+         "type": "object",
+         "properties": {
+             "data": table_schema(),
+             "formula": formula_schema(),
+             "id_variable": {"type": "string"},
+             "time_variable": {"type": "string"},
+             "model": {"type": "string", "enum": ["pooling", "within", "between", "random"], "default": "within"},
+             "robust": {"type": "boolean", "default": True}
+         },
+         "required": ["data", "formula", "id_variable", "time_variable"]
+     },
+     description="Panel data regression with fixed/random effects"
+ )
+ async def panel_regression(context, params):
+     """Perform panel data regression."""
+
+     await context.info("Fitting panel data regression")
+
+     r_script = '''
+     if (!require(plm)) install.packages("plm", quietly = TRUE)
+     library(plm)
+
+     data <- as.data.frame(args$data)
+     formula <- as.formula(args$formula)
+     id_var <- args$id_variable
+     time_var <- args$time_variable
+     model_type <- args$model %||% "within"
+     robust <- args$robust %||% TRUE
+
+     # Create panel data frame
+     pdata <- pdata.frame(data, index = c(id_var, time_var))
+
+     # Fit panel model
+     if (model_type == "pooling") {
+         model <- plm(formula, data = pdata, model = "pooling")
+     } else if (model_type == "within") {
+         model <- plm(formula, data = pdata, model = "within") # Fixed effects
+     } else if (model_type == "between") {
+         model <- plm(formula, data = pdata, model = "between")
+     } else if (model_type == "random") {
+         model <- plm(formula, data = pdata, model = "random")
+     }
+
+     # Get robust standard errors if requested
+     if (robust) {
+         if (!require(lmtest)) install.packages("lmtest", quietly = TRUE)
+         library(lmtest)
+         robust_se <- coeftest(model, vcov = vcovHC(model, type = "HC1"))
+         coef_table <- robust_se
+     } else {
+         coef_table <- summary(model)$coefficients
+     }
+
+     result <- list(
+         coefficients = as.list(coef_table[, 1]),  # columns indexed by position: coeftest() and summary.plm() label them differently
+         std_errors = as.list(coef_table[, 2]),
+         t_values = as.list(coef_table[, 3]),
+         p_values = as.list(coef_table[, 4]),
+         r_squared = summary(model)$r.squared[1],
+         adj_r_squared = summary(model)$r.squared[2],
+         model_type = model_type,
+         robust_se = robust,
+         n_obs = nobs(model),
+         n_groups = pdim(model)$nT$n,
+         time_periods = pdim(model)$nT$T,
+         formula = deparse(formula),
+         id_variable = id_var,
+         time_variable = time_var
+     )
+     '''
+
+     try:
+         result = execute_r_script(r_script, params)
+         await context.info("Panel regression completed successfully")
+         return result
+
+     except Exception as e:
+         await context.error("Panel regression failed", error=str(e))
+         raise
+
+
+ @tool(
+     name="instrumental_variables",
+     input_schema={
+         "type": "object",
+         "properties": {
+             "data": table_schema(),
+             "formula": {"type": "string", "description": "Format: 'y ~ x1 + x2 | z1 + z2' where | separates instruments"},
+             "robust": {"type": "boolean", "default": True}
+         },
+         "required": ["data", "formula"]
+     },
+     description="Two-stage least squares (2SLS) instrumental variables regression"
+ )
+ async def instrumental_variables(context, params):
+     """Perform instrumental variables regression."""
+
+     await context.info("Fitting instrumental variables model")
+
+     r_script = '''
+     if (!require(AER)) install.packages("AER", quietly = TRUE)
+     library(AER)
+
+     data <- as.data.frame(args$data)
+     formula_str <- args$formula
+     robust <- args$robust %||% TRUE
+
+     # Parse IV formula (y ~ x1 + x2 | z1 + z2)
+     formula <- as.formula(formula_str)
+
+     # Fit 2SLS model
+     iv_model <- ivreg(formula, data = data)
+
+     # Get robust standard errors if requested
+     if (robust) {
+         robust_se <- coeftest(iv_model, vcov = sandwich)
+         coef_table <- robust_se
+     } else {
+         coef_table <- summary(iv_model)$coefficients
+     }
+
+     # Diagnostic tests
+     summary_iv <- summary(iv_model, diagnostics = TRUE)
+
+     result <- list(
+         coefficients = as.list(coef_table[, "Estimate"]),
+         std_errors = as.list(coef_table[, "Std. Error"]),
+         t_values = as.list(coef_table[, "t value"]),
+         p_values = as.list(coef_table[, "Pr(>|t|)"]),
+         r_squared = summary_iv$r.squared,
+         adj_r_squared = summary_iv$adj.r.squared,
+         weak_instruments = list(
+             statistic = summary_iv$diagnostics["Weak instruments", "statistic"],
+             p_value = summary_iv$diagnostics["Weak instruments", "p-value"]
+         ),
+         wu_hausman = list(
+             statistic = summary_iv$diagnostics["Wu-Hausman", "statistic"],
+             p_value = summary_iv$diagnostics["Wu-Hausman", "p-value"]
+         ),
+         sargan = list(
+             statistic = summary_iv$diagnostics["Sargan", "statistic"],
+             p_value = summary_iv$diagnostics["Sargan", "p-value"]
+         ),
+         robust_se = robust,
+         formula = formula_str,
+         n_obs = nobs(iv_model)
+     )
+     '''
+
+     try:
+         result = execute_r_script(r_script, params)
+         await context.info("Instrumental variables model fitted successfully")
+         return result
+
+     except Exception as e:
+         await context.error("Instrumental variables fitting failed", error=str(e))
+         raise
+
+
+ @tool(
+     name="var_model",
+     input_schema={
+         "type": "object",
+         "properties": {
+             "data": table_schema(),
+             "variables": {"type": "array", "items": {"type": "string"}},
+             "lags": {"type": "integer", "minimum": 1, "maximum": 10, "default": 2},
+             "type": {"type": "string", "enum": ["const", "trend", "both", "none"], "default": "const"}
+         },
+         "required": ["data", "variables"]
+     },
+     description="Vector Autoregression (VAR) model for multivariate time series"
+ )
+ async def var_model(context, params):
+     """Fit Vector Autoregression model."""
+
+     await context.info("Fitting VAR model")
+
+     r_script = '''
+     if (!require(vars)) install.packages("vars", quietly = TRUE)
+     library(vars)
+
+     data <- as.data.frame(args$data)
+     variables <- args$variables
+     lag_order <- args$lags %||% 2
+     var_type <- args$type %||% "const"
+
+     # Select variables for VAR
+     var_data <- data[, variables, drop = FALSE]
+
+     # Remove missing values
+     var_data <- na.omit(var_data)
+
+     # Fit VAR model
+     var_model <- VAR(var_data, p = lag_order, type = var_type)
+
+     # Extract coefficients for each equation
+     equations <- list()
+     for (var in variables) {
+         eq_summary <- summary(var_model)$varresult[[var]]
+         equations[[var]] <- list(
+             coefficients = as.list(eq_summary$coefficients[, "Estimate"]),  # coef(eq_summary) would return the whole coefficient matrix
+             std_errors = as.list(eq_summary$coefficients[, "Std. Error"]),
+             t_values = as.list(eq_summary$coefficients[, "t value"]),
+             p_values = as.list(eq_summary$coefficients[, "Pr(>|t|)"]),
+             r_squared = eq_summary$r.squared,
+             adj_r_squared = eq_summary$adj.r.squared
+         )
+     }
+
+     # Model diagnostics
+     var_summary <- summary(var_model)
+
+     result <- list(
+         equations = equations,
+         variables = variables,
+         lag_order = lag_order,
+         var_type = var_type,
+         n_obs = nobs(var_model),
+         n_variables = length(variables),
+         loglik = logLik(var_model)[1],
+         aic = AIC(var_model),
+         bic = BIC(var_model),
+         residual_covariance = as.matrix(var_summary$covres)
+     )
+     '''
+
+     try:
+         result = execute_r_script(r_script, params)
+         await context.info("VAR model fitted successfully")
+         return result
+
+     except Exception as e:
+         await context.error("VAR model fitting failed", error=str(e))
+         raise
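
The econometric tools above follow the same pattern as the descriptive ones: params are validated against the input_schema and the R code runs via execute_r_script (which needs plm, AER, or vars installed). The sketch below shows hypothetical parameter payloads only; column names and values are invented, the column-to-values shape of "data" is the same assumption as before, and real estimation would need more observations. The IV formula string follows the AER::ivreg two-part convention the tool documents: regressors before the bar, instruments after it.

# Fixed-effects panel regression: id_variable/time_variable name the panel index.
panel_params = {
    "data": {
        "firm":       ["A", "A", "A", "B", "B", "B", "C", "C", "C"],
        "year":       [2020, 2021, 2022, 2020, 2021, 2022, 2020, 2021, 2022],
        "investment": [10.2, 11.5, 12.1, 8.3, 9.1, 9.8, 14.0, 14.6, 15.2],
        "value":      [100.0, 110.0, 118.0, 80.0, 85.0, 91.0, 140.0, 147.0, 155.0],
    },
    "formula": "investment ~ value",
    "id_variable": "firm",
    "time_variable": "year",
    "model": "within",  # fixed effects
    "robust": True,
}

# 2SLS: 'wage ~ education | distance' uses distance as the instrument for education.
iv_params = {
    "data": {
        "wage":      [12.0, 15.5, 11.2, 18.9, 14.1, 16.3, 13.7, 17.4],
        "education": [12, 16, 11, 18, 13, 16, 12, 17],
        "distance":  [8.0, 2.5, 9.1, 1.0, 6.4, 3.2, 7.8, 1.9],
    },
    "formula": "wage ~ education | distance",
    "robust": True,
}

# Either dict would be passed as `params`, e.g. (reusing the StubContext sketched
# after the descriptive-statistics file):
# result = asyncio.run(panel_regression(StubContext(), panel_params))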