aiecs-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of aiecs might be problematic.
- aiecs/__init__.py +75 -0
- aiecs/__main__.py +41 -0
- aiecs/aiecs_client.py +295 -0
- aiecs/application/__init__.py +10 -0
- aiecs/application/executors/__init__.py +10 -0
- aiecs/application/executors/operation_executor.py +341 -0
- aiecs/config/__init__.py +15 -0
- aiecs/config/config.py +117 -0
- aiecs/config/registry.py +19 -0
- aiecs/core/__init__.py +46 -0
- aiecs/core/interface/__init__.py +34 -0
- aiecs/core/interface/execution_interface.py +150 -0
- aiecs/core/interface/storage_interface.py +214 -0
- aiecs/domain/__init__.py +20 -0
- aiecs/domain/context/__init__.py +28 -0
- aiecs/domain/context/content_engine.py +982 -0
- aiecs/domain/context/conversation_models.py +306 -0
- aiecs/domain/execution/__init__.py +12 -0
- aiecs/domain/execution/model.py +49 -0
- aiecs/domain/task/__init__.py +13 -0
- aiecs/domain/task/dsl_processor.py +460 -0
- aiecs/domain/task/model.py +50 -0
- aiecs/domain/task/task_context.py +257 -0
- aiecs/infrastructure/__init__.py +26 -0
- aiecs/infrastructure/messaging/__init__.py +13 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +341 -0
- aiecs/infrastructure/messaging/websocket_manager.py +289 -0
- aiecs/infrastructure/monitoring/__init__.py +12 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +138 -0
- aiecs/infrastructure/monitoring/structured_logger.py +50 -0
- aiecs/infrastructure/monitoring/tracing_manager.py +376 -0
- aiecs/infrastructure/persistence/__init__.py +12 -0
- aiecs/infrastructure/persistence/database_manager.py +286 -0
- aiecs/infrastructure/persistence/file_storage.py +671 -0
- aiecs/infrastructure/persistence/redis_client.py +162 -0
- aiecs/llm/__init__.py +54 -0
- aiecs/llm/base_client.py +99 -0
- aiecs/llm/client_factory.py +339 -0
- aiecs/llm/custom_callbacks.py +228 -0
- aiecs/llm/openai_client.py +125 -0
- aiecs/llm/vertex_client.py +186 -0
- aiecs/llm/xai_client.py +184 -0
- aiecs/main.py +351 -0
- aiecs/scripts/DEPENDENCY_SYSTEM_SUMMARY.md +241 -0
- aiecs/scripts/README_DEPENDENCY_CHECKER.md +309 -0
- aiecs/scripts/README_WEASEL_PATCH.md +126 -0
- aiecs/scripts/__init__.py +3 -0
- aiecs/scripts/dependency_checker.py +825 -0
- aiecs/scripts/dependency_fixer.py +348 -0
- aiecs/scripts/download_nlp_data.py +348 -0
- aiecs/scripts/fix_weasel_validator.py +121 -0
- aiecs/scripts/fix_weasel_validator.sh +82 -0
- aiecs/scripts/patch_weasel_library.sh +188 -0
- aiecs/scripts/quick_dependency_check.py +269 -0
- aiecs/scripts/run_weasel_patch.sh +41 -0
- aiecs/scripts/setup_nlp_data.sh +217 -0
- aiecs/tasks/__init__.py +2 -0
- aiecs/tasks/worker.py +111 -0
- aiecs/tools/__init__.py +196 -0
- aiecs/tools/base_tool.py +202 -0
- aiecs/tools/langchain_adapter.py +361 -0
- aiecs/tools/task_tools/__init__.py +82 -0
- aiecs/tools/task_tools/chart_tool.py +704 -0
- aiecs/tools/task_tools/classfire_tool.py +901 -0
- aiecs/tools/task_tools/image_tool.py +397 -0
- aiecs/tools/task_tools/office_tool.py +600 -0
- aiecs/tools/task_tools/pandas_tool.py +565 -0
- aiecs/tools/task_tools/report_tool.py +499 -0
- aiecs/tools/task_tools/research_tool.py +363 -0
- aiecs/tools/task_tools/scraper_tool.py +548 -0
- aiecs/tools/task_tools/search_api.py +7 -0
- aiecs/tools/task_tools/stats_tool.py +513 -0
- aiecs/tools/temp_file_manager.py +126 -0
- aiecs/tools/tool_executor/__init__.py +35 -0
- aiecs/tools/tool_executor/tool_executor.py +518 -0
- aiecs/utils/LLM_output_structor.py +409 -0
- aiecs/utils/__init__.py +23 -0
- aiecs/utils/base_callback.py +50 -0
- aiecs/utils/execution_utils.py +158 -0
- aiecs/utils/logging.py +1 -0
- aiecs/utils/prompt_loader.py +13 -0
- aiecs/utils/token_usage_repository.py +279 -0
- aiecs/ws/__init__.py +0 -0
- aiecs/ws/socket_server.py +41 -0
- aiecs-1.0.0.dist-info/METADATA +610 -0
- aiecs-1.0.0.dist-info/RECORD +90 -0
- aiecs-1.0.0.dist-info/WHEEL +5 -0
- aiecs-1.0.0.dist-info/entry_points.txt +7 -0
- aiecs-1.0.0.dist-info/licenses/LICENSE +225 -0
- aiecs-1.0.0.dist-info/top_level.txt +1 -0
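
For reference, the same file list can be reproduced locally by reading the wheel as a zip archive. A minimal sketch; it assumes the wheel has already been downloaded (e.g. with pip download aiecs==1.0.0) into the current directory:

# Inspect the wheel's file list; a wheel is just a zip archive.
import zipfile

with zipfile.ZipFile('aiecs-1.0.0-py3-none-any.whl') as whl:
    for name in sorted(whl.namelist()):
        print(name)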
aiecs/tools/task_tools/stats_tool.py
@@ -0,0 +1,513 @@
import os
import logging
import tempfile
from typing import Dict, Any, List, Optional, Union, Tuple
from enum import Enum
from dataclasses import dataclass

import pandas as pd
import numpy as np
from pydantic_settings import BaseSettings, SettingsConfigDict
from pydantic import ValidationError

from aiecs.tools.base_tool import BaseTool
from aiecs.tools import register_tool

# Enums for configuration options
class ScalerType(str, Enum):
    STANDARD = "standard"
    MINMAX = "minmax"
    ROBUST = "robust"
    NONE = "none"

class StatsSettings(BaseSettings):
    """Configuration for StatsTool."""
    max_file_size_mb: int = 200
    allowed_extensions: List[str] = ['.sav', '.sas7bdat', '.por', '.csv', '.xlsx', '.xls', '.json', '.parquet', '.feather']

    # The env prefix belongs in model_config; declaring it as a field as well
    # would turn it into an ordinary setting instead of configuration.
    model_config = SettingsConfigDict(env_prefix='STATS_TOOL_')

# Exceptions
class StatsToolError(Exception): pass
class FileOperationError(StatsToolError): pass
class AnalysisError(StatsToolError): pass

# Utility Dataclass for Statistical Results
@dataclass
class StatsResult:
    """Structured statistical result."""
    test_type: str
    statistic: float
    pvalue: float
    significant: bool
    additional_metrics: Dict[str, Any]

    def to_dict(self) -> Dict[str, Any]:
        return {
            'test_type': self.test_type,
            'statistic': self.statistic,
            'pvalue': self.pvalue,
            'significant': self.significant,
            **self.additional_metrics
        }

@register_tool('stats')
class StatsTool(BaseTool):
    """Enhanced statistical analysis tool for various data formats and operations."""
    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        self.settings = StatsSettings()
        if config:
            try:
                self.settings = self.settings.model_validate({**self.settings.model_dump(), **config})
            except ValidationError as e:
                raise ValueError(f"Invalid settings: {e}")
        self.logger = logging.getLogger(__name__)
        if not self.logger.handlers:
            h = logging.StreamHandler()
            h.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
            self.logger.addHandler(h)
            self.logger.setLevel(logging.INFO)

    def _load_data(self, file_path: str, nrows: Optional[int] = None, sheet_name: Optional[Union[str, int]] = 0) -> pd.DataFrame:
        """Load data from various file formats into a pandas DataFrame."""
        try:
            ext = os.path.splitext(file_path)[1].lower()
            if ext in ['.sav', '.sas7bdat', '.por']:
                import pyreadstat
                if ext == '.sav':
                    df, meta = pyreadstat.read_sav(file_path)
                elif ext == '.sas7bdat':
                    df, meta = pyreadstat.read_sas7bdat(file_path)
                else:
                    df, meta = pyreadstat.read_por(file_path)
                return df
            elif ext == '.csv':
                return pd.read_csv(file_path, nrows=nrows)
            elif ext in ['.xlsx', '.xls']:
                return pd.read_excel(file_path, sheet_name=sheet_name, nrows=nrows)
            elif ext == '.json':
                return pd.read_json(file_path)
            elif ext == '.parquet':
                return pd.read_parquet(file_path)
            elif ext == '.feather':
                return pd.read_feather(file_path)
            else:
                raise FileOperationError(f"Unsupported file format: {ext}")
        except Exception as e:
            raise FileOperationError(f"Error reading file {file_path}: {str(e)}")

    def _validate_variables(self, df: pd.DataFrame, vars_to_check: List[str]) -> None:
        """Validate variables exist in the dataset."""
        if not vars_to_check:
            return
        available_vars = df.columns.tolist()
        missing_vars = [var for var in vars_to_check if var not in available_vars]
        if missing_vars:
            raise FileOperationError(f"Variables not found in dataset: {', '.join(missing_vars)}")

    def _interpret_effect_size(self, d: float) -> str:
        """Interpret Cohen's d or Cramer's V effect size."""
        thresholds = [(0.2, "negligible"), (0.5, "small"), (0.8, "medium")]
        for threshold, label in thresholds:
            if abs(d) < threshold:
                return label
        return "large"

    def read_data(self, file_path: str, nrows: Optional[int] = None, sheet_name: Optional[Union[str, int]] = 0) -> Dict[str, Any]:
        """Read data from various file formats."""
        df = self._load_data(file_path, nrows, sheet_name)
        return {
            'variables': df.columns.tolist(),
            'observations': len(df),
            'dtypes': {col: str(dtype) for col, dtype in df.dtypes.items()},
            'memory_usage': df.memory_usage(deep=True).sum() / (1024 * 1024),
            'preview': df.head(5).to_dict(orient='records')
        }

    def describe(self, file_path: str, variables: Optional[List[str]] = None, include_percentiles: bool = False, percentiles: Optional[List[float]] = None) -> Dict[str, Any]:
        """Generate descriptive statistics for variables."""
        df = self._load_data(file_path)
        if variables:
            self._validate_variables(df, variables)
            df = df[variables]
        desc = df.describe()
        if include_percentiles and percentiles:
            additional_percentiles = [p for p in percentiles if p not in [0.25, 0.5, 0.75]]
            if additional_percentiles:
                additional_desc = df.describe(percentiles=percentiles)
                desc = pd.concat([desc, additional_desc.loc[[f"{int(p*100)}%" for p in additional_percentiles]]])
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:  # Index.any() is unreliable on string column labels
            desc.loc['skew'] = df[numeric_cols].skew()
            desc.loc['kurtosis'] = df[numeric_cols].kurt()
        return {
            'statistics': desc.to_dict(),
            'summary': desc.to_string()
        }

    def ttest(self, file_path: str, var1: str, var2: str, equal_var: bool = True, paired: bool = False) -> Dict[str, Any]:
        """Perform t-tests (independent or paired). Also handles legacy ttest_ind."""
        df = self._load_data(file_path)
        self._validate_variables(df, [var1, var2])
        import scipy.stats as stats
        a = df[var1].dropna().values
        b = df[var2].dropna().values
        if paired:
            min_len = min(len(a), len(b))
            stat, p = stats.ttest_rel(a[:min_len], b[:min_len])
            test_type = "paired t-test"
        else:
            stat, p = stats.ttest_ind(a, b, equal_var=equal_var)
            test_type = "independent t-test (equal variance)" if equal_var else "Welch's t-test (unequal variance)"
        mean_a = np.mean(a)
        mean_b = np.mean(b)
        std_a = np.std(a, ddof=1)
        std_b = np.std(b, ddof=1)
        if equal_var:
            pooled_std = np.sqrt(((len(a) - 1) * std_a**2 + (len(b) - 1) * std_b**2) / (len(a) + len(b) - 2))
            cohens_d = (mean_a - mean_b) / pooled_std
        else:
            cohens_d = (mean_a - mean_b) / np.sqrt((std_a**2 + std_b**2) / 2)
        return StatsResult(
            test_type=test_type,
            statistic=float(stat),
            pvalue=float(p),
            significant=p < 0.05,
            additional_metrics={
                'cohens_d': float(cohens_d),
                'effect_size_interpretation': self._interpret_effect_size(cohens_d),
                'group1_mean': float(mean_a),
                'group2_mean': float(mean_b),
                'group1_std': float(std_a),
                'group2_std': float(std_b),
                'group1_n': int(len(a)),
                'group2_n': int(len(b))
            }
        ).to_dict()

    # Legacy method (now an alias)
    ttest_ind = ttest

    def correlation(self, file_path: str, variables: Optional[List[str]] = None, var1: Optional[str] = None, var2: Optional[str] = None, method: str = "pearson") -> Dict[str, Any]:
        """Perform correlation analysis."""
        df = self._load_data(file_path)
        if variables:
            self._validate_variables(df, variables)
        if var1 and var2:
            self._validate_variables(df, [var1, var2])
        import scipy.stats as stats
        method_map = {
            'pearson': (stats.pearsonr, "Pearson's r"),
            'spearman': (stats.spearmanr, "Spearman's rho"),
            'kendall': (stats.kendalltau, "Kendall's tau")
        }
        if method not in method_map:
            raise AnalysisError(f"Unsupported correlation method: {method}. Supported methods: {', '.join(method_map)}")
        result = {}
        if variables:
            corr_matrix = df[variables].corr(method=method)
            result['correlation_matrix'] = corr_matrix.to_dict()
            flat_corrs = [
                {'var1': v1, 'var2': v2, 'correlation': corr_matrix.loc[v1, v2], 'abs_correlation': abs(corr_matrix.loc[v1, v2])}
                for i, v1 in enumerate(variables)
                for j, v2 in enumerate(variables) if i < j
            ]
            flat_corrs.sort(key=lambda x: x['abs_correlation'], reverse=True)
            result['pairs'] = flat_corrs
        elif var1 and var2:
            # Drop rows pairwise so the two series stay aligned; dropping NaNs
            # per column could pair values from different rows.
            sub = df[[var1, var2]].dropna()
            x = sub[var1]
            y = sub[var2]
            func, method_name = method_map[method]
            corr, p = func(x, y)
            result = {
                'method': method_name,
                'correlation': float(corr),
                'pvalue': float(p),
                'significant': p < 0.05,
                'n': len(x)
            }
        return result

    def anova(self, file_path: str, dependent: str, factor: str, post_hoc: bool = False) -> Dict[str, Any]:
        """Perform one-way ANOVA with optional post-hoc tests."""
        df = self._load_data(file_path)
        self._validate_variables(df, [dependent, factor])
        import scipy.stats as stats
        from statsmodels.stats.multicomp import pairwise_tukeyhsd
        # Drop rows jointly so each dependent value stays paired with its factor
        # level; dropping the two columns independently and truncating to the
        # shorter length would silently misalign observations.
        sub = df[[dependent, factor]].dropna()
        groups = {name: group[dependent].values for name, group in sub.groupby(factor)}
        stat, p = stats.f_oneway(*groups.values())
        result = {
            'F': float(stat),
            'pvalue': float(p),
            'significant': p < 0.05,
            'groups': len(groups),
            'group_sizes': {name: len(values) for name, values in groups.items()},
            'group_means': {name: float(np.mean(values)) for name, values in groups.items()},
            'group_std': {name: float(np.std(values, ddof=1)) for name, values in groups.items()}
        }
        if post_hoc:
            tukey = pairwise_tukeyhsd(sub[dependent], sub[factor])
            from itertools import combinations
            group_pairs = list(combinations(tukey.groupsunique, 2))
            tukey_results = [
                {
                    'group1': str(group1),
                    'group2': str(group2),
                    'mean_difference': float(mean_diff),
                    'p_adjusted': float(p_adj),
                    'significant': bool(reject),
                    'conf_lower': float(lower),
                    'conf_upper': float(upper)
                }
                for (group1, group2), mean_diff, p_adj, lower, upper, reject in zip(
                    group_pairs,
                    tukey.meandiffs,
                    tukey.pvalues,
                    tukey.confint[:, 0],
                    tukey.confint[:, 1],
                    tukey.reject
                )
            ]
            result['post_hoc'] = {
                'method': 'Tukey HSD',
                'alpha': 0.05,  # Standard significance level for Tukey HSD
                'comparisons': tukey_results
            }
        return result

    def chi_square(self, file_path: str, var1: str, var2: str, correction: bool = True) -> Dict[str, Any]:
        """Perform chi-square test of independence."""
        df = self._load_data(file_path)
        self._validate_variables(df, [var1, var2])
        import scipy.stats as stats
        contingency = pd.crosstab(df[var1], df[var2])
        chi2, p, dof, expected = stats.chi2_contingency(contingency, correction=correction)
        n = contingency.sum().sum()
        min_dim = min(contingency.shape) - 1
        cramers_v = np.sqrt(chi2 / (n * min_dim))
        return {
            'chi2': float(chi2),
            'pvalue': float(p),
            'dof': int(dof),
            'significant': p < 0.05,
            'cramers_v': float(cramers_v),
            'effect_size_interpretation': self._interpret_effect_size(cramers_v),
            'contingency_table': contingency.to_dict(),
            'expected_frequencies': pd.DataFrame(expected, index=contingency.index, columns=contingency.columns).to_dict(),
            'test_type': 'Chi-square test with Yates correction' if correction else 'Chi-square test'
        }

    def non_parametric(self, file_path: str, test_type: str, variables: List[str], grouping: Optional[str] = None) -> Dict[str, Any]:
        """Perform non-parametric statistical tests."""
        df = self._load_data(file_path)
        self._validate_variables(df, variables + ([grouping] if grouping else []))
        import scipy.stats as stats
        if test_type == 'mann_whitney':
            if len(variables) != 2:
                raise AnalysisError("Mann-Whitney U test requires exactly 2 variables")
            x = df[variables[0]].dropna().values
            y = df[variables[1]].dropna().values
            u_stat, p_value = stats.mannwhitneyu(x, y)
            return StatsResult(
                test_type='Mann-Whitney U test',
                statistic=float(u_stat),
                pvalue=float(p_value),
                significant=p_value < 0.05,
                additional_metrics={
                    'n1': len(x),
                    'n2': len(y),
                    'median1': float(np.median(x)),
                    'median2': float(np.median(y))
                }
            ).to_dict()
        elif test_type == 'wilcoxon':
            if len(variables) != 2:
                raise AnalysisError("Wilcoxon signed-rank test requires exactly 2 variables")
            x = df[variables[0]].dropna().values
            y = df[variables[1]].dropna().values
            min_len = min(len(x), len(y))
            x = x[:min_len]
            y = y[:min_len]
            w_stat, p_value = stats.wilcoxon(x, y)
            return StatsResult(
                test_type='Wilcoxon signed-rank test',
                statistic=float(w_stat),
                pvalue=float(p_value),
                significant=p_value < 0.05,
                additional_metrics={
                    'n_pairs': min_len,
                    'median_difference': float(np.median(x - y))
                }
            ).to_dict()
        elif test_type == 'kruskal':
            if not grouping:
                raise AnalysisError("Kruskal-Wallis test requires a grouping variable")
            groups = {f"{var}_{name}": group[var].dropna().values for name, group in df.groupby(grouping) for var in variables}
            h_stat, p_value = stats.kruskal(*groups.values())
            return StatsResult(
                test_type='Kruskal-Wallis H test',
                statistic=float(h_stat),
                pvalue=float(p_value),
                significant=p_value < 0.05,
                additional_metrics={
                    'groups': len(groups),
                    'group_sizes': {name: len(values) for name, values in groups.items()},
                    'group_medians': {name: float(np.median(values)) for name, values in groups.items()}
                }
            ).to_dict()
        elif test_type == 'friedman':
            if len(variables) < 2:
                raise AnalysisError("Friedman test requires at least 2 variables")
            data = df[variables].dropna()
            chi2, p_value = stats.friedmanchisquare(*[data[var].values for var in variables])
            return StatsResult(
                test_type='Friedman test',
                statistic=float(chi2),
                pvalue=float(p_value),
                significant=p_value < 0.05,
                additional_metrics={
                    'n_measures': len(variables),
                    'n_samples': len(data),
                    'variable_medians': {var: float(np.median(data[var])) for var in variables}
                }
            ).to_dict()
        else:
            raise AnalysisError(f"Unsupported non-parametric test type: {test_type}. Supported types: mann_whitney, wilcoxon, kruskal, friedman")

    def regression(self, file_path: str, formula: str, regression_type: str = "ols", robust: bool = False, structured_output: bool = True) -> Dict[str, Any]:
        """Perform regression analysis with various models."""
        df = self._load_data(file_path)
        import statsmodels.formula.api as smf
        try:
            model_map = {
                'ols': smf.ols,
                'logit': smf.logit,
                'probit': smf.probit,
                'poisson': smf.poisson
            }
            if regression_type not in model_map:
                raise AnalysisError(f"Unsupported regression type: {regression_type}. Supported types: {', '.join(model_map)}")
            model = model_map[regression_type](formula=formula, data=df)
            fit = model.fit(cov_type='HC3' if robust else 'nonrobust')
            if structured_output:
                result = {
                    'model_type': regression_type,
                    'formula': formula,
                    'n_observations': int(fit.nobs),
                    'r_squared': float(fit.rsquared) if hasattr(fit, 'rsquared') else None,
                    'adj_r_squared': float(fit.rsquared_adj) if hasattr(fit, 'rsquared_adj') else None,
                    'aic': float(fit.aic) if hasattr(fit, 'aic') else None,
                    'bic': float(fit.bic) if hasattr(fit, 'bic') else None,
                    'f_statistic': float(fit.fvalue) if hasattr(fit, 'fvalue') else None,
                    'f_pvalue': float(fit.f_pvalue) if hasattr(fit, 'f_pvalue') else None,
                    'log_likelihood': float(fit.llf) if hasattr(fit, 'llf') else None,
                    'coefficients': {
                        var: {
                            'coef': float(fit.params[var]),
                            'std_err': float(fit.bse[var]),
                            't_value': float(fit.tvalues[var]) if hasattr(fit, 'tvalues') else None,
                            'p_value': float(fit.pvalues[var]),
                            'significant': fit.pvalues[var] < 0.05,
                            'conf_lower': float(fit.conf_int().loc[var, 0]),
                            'conf_upper': float(fit.conf_int().loc[var, 1])
                        } for var in fit.params.index
                    }
                }
                return {'summary_text': fit.summary().as_text(), 'structured': result}
            return {'summary': fit.summary().as_text()}
        except Exception as e:
            raise AnalysisError(f"Regression error: {str(e)}")

    def time_series(self, file_path: str, variable: str, date_variable: Optional[str] = None, model_type: str = "arima", order: Optional[Tuple[int, int, int]] = (1, 1, 1), seasonal_order: Optional[Tuple[int, int, int, int]] = None, forecast_periods: int = 10) -> Dict[str, Any]:
        """Perform time series analysis."""
        df = self._load_data(file_path)
        self._validate_variables(df, [variable] + ([date_variable] if date_variable else []))
        from statsmodels.tsa.arima.model import ARIMA
        from statsmodels.tsa.statespace.sarimax import SARIMAX
        try:
            ts_data = df[variable]
            if date_variable and date_variable in df.columns:
                # Attach the date index before dropping NaNs so lengths stay in sync
                ts_data.index = pd.to_datetime(df[date_variable])
            ts_data = ts_data.dropna()
            if model_type == 'arima':
                model = ARIMA(ts_data, order=order)
                fit = model.fit()
                model_type_name = 'ARIMA'
            elif model_type == 'sarima':
                if not seasonal_order:
                    raise AnalysisError("seasonal_order must be provided for SARIMA model")
                model = SARIMAX(ts_data, order=order, seasonal_order=seasonal_order)
                fit = model.fit(disp=False)
                model_type_name = 'SARIMA'
            else:
                raise AnalysisError(f"Unsupported time series model: {model_type}")
            forecast = fit.forecast(steps=forecast_periods)
            if isinstance(ts_data.index, pd.DatetimeIndex):
                # Daily frequency is assumed when extending a date index
                forecast_index = pd.date_range(start=ts_data.index[-1], periods=forecast_periods + 1, freq='D')[1:]
            else:
                forecast_index = range(len(ts_data), len(ts_data) + forecast_periods)
            return {
                'model_type': model_type_name,
                'order': order,
                'seasonal_order': seasonal_order if model_type == 'sarima' else None,
                'aic': float(fit.aic),
                'bic': float(fit.bic),
                'forecast': {
                    'values': forecast.tolist() if isinstance(forecast, np.ndarray) else forecast.values.tolist(),
                    'index': forecast_index.strftime('%Y-%m-%d').tolist() if isinstance(forecast_index, pd.DatetimeIndex) else list(forecast_index)
                },
                'summary': str(fit.summary())
            }
        except Exception as e:
            raise AnalysisError(f"Time series analysis error: {str(e)}")

    def preprocess(self, file_path: str, variables: List[str], operation: str, scaler_type: ScalerType = ScalerType.STANDARD, output_path: Optional[str] = None) -> Dict[str, Any]:
        """Preprocess data with various operations."""
        df = self._load_data(file_path)
        self._validate_variables(df, variables)
        data = df[variables].copy()
        result = {'operation': operation}
        if operation == 'scale':
            from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
            scaler_map = {
                ScalerType.STANDARD: (StandardScaler, "StandardScaler"),
                ScalerType.MINMAX: (MinMaxScaler, "MinMaxScaler"),
                ScalerType.ROBUST: (RobustScaler, "RobustScaler")
            }
            if scaler_type not in scaler_map:
                raise AnalysisError(f"Scaler type '{scaler_type}' is not valid for the scale operation")
            scaler_cls, scaler_name = scaler_map[scaler_type]
            scaler = scaler_cls()
            scaled_data = scaler.fit_transform(data)
            scaled_df = pd.DataFrame(scaled_data, columns=[f"{col}_scaled" for col in data.columns], index=data.index)
            result.update({
                'scaler': scaler_name,
                'original_stats': data.describe().to_dict(),
                'scaled_stats': scaled_df.describe().to_dict(),
                'preview': scaled_df.head(5).to_dict(orient='records')
            })
            processed_df = scaled_df
        elif operation == 'impute':
            imputed_df = data.copy()
            numeric_cols = data.select_dtypes(include=[np.number]).columns
            for col in numeric_cols:
                imputed_df[col] = data[col].fillna(data[col].mean())
            cat_cols = data.select_dtypes(exclude=[np.number]).columns
            for col in cat_cols:
                imputed_df[col] = data[col].fillna(data[col].mode()[0] if not data[col].mode().empty else None)
            result.update({
                'imputation_method': {'numeric': 'mean', 'categorical': 'mode'},
                'missing_counts_before': data.isna().sum().to_dict(),
                'missing_counts_after': imputed_df.isna().sum().to_dict(),
                'preview': imputed_df.head(5).to_dict(orient='records')
            })
            processed_df = imputed_df
        else:
            raise AnalysisError(f"Unsupported preprocessing operation: {operation}. Supported operations: scale, impute")
        if output_path:
            output_path = output_path if os.path.isabs(output_path) else os.path.join(tempfile.gettempdir(), 'stats_outputs', output_path)
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            processed_df.to_csv(output_path)
            result['output_file'] = output_path
        return result
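
For orientation, a minimal usage sketch of the tool above, following the default construction shown in __init__; the file 'survey.csv' and its columns are hypothetical:

# Hypothetical usage sketch for StatsTool; 'survey.csv', 'age', and 'income' are invented.
from aiecs.tools.task_tools.stats_tool import StatsTool

tool = StatsTool()
info = tool.read_data('survey.csv', nrows=100)              # variables, dtypes, preview
summary = tool.describe('survey.csv', variables=['age', 'income'])
t = tool.ttest('survey.csv', var1='age', var2='income', equal_var=False)
print(t['test_type'], t['pvalue'], t['effect_size_interpretation'])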
aiecs/tools/temp_file_manager.py
@@ -0,0 +1,126 @@
import os
import time
import logging
from typing import Dict, Optional
from threading import Lock

logger = logging.getLogger(__name__)

class TempFileManager:
    """
    Manages temporary files with automatic cleanup based on age.

    This class provides functionality to register temporary files, track their creation time,
    and clean up files that exceed a specified maximum age. It ensures thread-safe operations
    for file registration and cleanup.

    Attributes:
        base_dir (str): Base directory for temporary files.
        max_age (int): Maximum age of temporary files in seconds.
        files (Dict[str, float]): Dictionary mapping file paths to their creation timestamps.
        lock (Lock): Thread lock for safe concurrent access.
    """

    def __init__(self, base_dir: str, max_age: int = 3600):
        """
        Initialize the TempFileManager.

        Args:
            base_dir (str): Base directory for temporary files.
            max_age (int, optional): Maximum age of temporary files in seconds. Defaults to 3600 (1 hour).
        """
        self.base_dir = base_dir
        self.max_age = max_age
        self.files: Dict[str, float] = {}
        self.lock = Lock()

        # Ensure base directory exists
        os.makedirs(self.base_dir, exist_ok=True)
        logger.info(f"Initialized TempFileManager with base_dir: {self.base_dir}, max_age: {self.max_age} seconds")

    def register_file(self, file_path: str) -> None:
        """
        Register a temporary file with its creation timestamp.

        Args:
            file_path (str): Path to the temporary file.
        """
        abs_path = os.path.abspath(file_path)
        if not os.path.isfile(abs_path):
            logger.warning(f"Attempted to register non-existent file: {abs_path}")
            return

        with self.lock:
            self.files[abs_path] = time.time()
            logger.debug(f"Registered temporary file: {abs_path}")

    def cleanup(self, force: bool = False) -> int:
        """
        Clean up temporary files older than max_age or all files if force is True.

        Args:
            force (bool, optional): If True, remove all registered files regardless of age. Defaults to False.

        Returns:
            int: Number of files removed.
        """
        current_time = time.time()
        removed_count = 0

        with self.lock:
            files_to_remove = [
                file_path
                for file_path, creation_time in self.files.items()
                if force or (current_time - creation_time) > self.max_age
            ]

            for file_path in files_to_remove:
                try:
                    if os.path.exists(file_path):
                        os.remove(file_path)
                        logger.debug(f"Removed temporary file: {file_path}")
                    del self.files[file_path]
                    removed_count += 1  # Count only files actually deregistered
                except Exception as e:
                    logger.error(f"Failed to remove temporary file {file_path}: {e}")
                    # Keep the file in the registry so a later cleanup can retry

        logger.info(f"Cleaned up {removed_count} temporary files (force={force})")
        return removed_count

    def get_file_age(self, file_path: str) -> Optional[float]:
        """
        Get the age of a registered temporary file in seconds.

        Args:
            file_path (str): Path to the temporary file.

        Returns:
            Optional[float]: Age of the file in seconds if registered, None otherwise.
        """
        abs_path = os.path.abspath(file_path)
        if abs_path in self.files:
            return time.time() - self.files[abs_path]
        return None

    def is_temp_file(self, file_path: str) -> bool:
        """
        Check if a file is a registered temporary file.

        Args:
            file_path (str): Path to check.

        Returns:
            bool: True if the file is registered as temporary, False otherwise.
        """
        abs_path = os.path.abspath(file_path)
        return abs_path in self.files

    def clear_all(self) -> int:
        """
        Remove all registered temporary files.

        Returns:
            int: Number of files removed.
        """
        return self.cleanup(force=True)
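
A short, self-contained sketch of the intended register/inspect/cleanup lifecycle; the directory and file names are illustrative:

# Illustrative lifecycle sketch for TempFileManager; paths are made up.
import os
import tempfile
from aiecs.tools.temp_file_manager import TempFileManager

manager = TempFileManager(base_dir=os.path.join(tempfile.gettempdir(), 'aiecs_tmp'), max_age=60)
path = os.path.join(manager.base_dir, 'report.csv')
with open(path, 'w') as f:
    f.write('a,b\n1,2\n')
manager.register_file(path)
print(manager.is_temp_file(path), manager.get_file_age(path))
manager.clear_all()  # removes the file and empties the registry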
aiecs/tools/tool_executor/__init__.py
@@ -0,0 +1,35 @@
# aiecs/tools/tool_executor/__init__.py

from .tool_executor import (
    ToolExecutor,
    ToolExecutionError,
    InputValidationError,
    OperationError,
    SecurityError,
    TimeoutError,
    ExecutorConfig,
    ExecutorMetrics,
    get_executor,
    validate_input,
    cache_result,
    run_in_executor,
    measure_execution_time,
    sanitize_input
)

__all__ = [
    'ToolExecutor',
    'ToolExecutionError',
    'InputValidationError',
    'OperationError',
    'SecurityError',
    'TimeoutError',
    'ExecutorConfig',
    'ExecutorMetrics',
    'get_executor',
    'validate_input',
    'cache_result',
    'run_in_executor',
    'measure_execution_time',
    'sanitize_input'
]
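
Because this __init__ re-exports every public name from the implementation module, downstream code can import from the package rather than the module; a sketch of the equivalence:

# The package re-exports these names, so both import forms resolve to the same objects.
from aiecs.tools.tool_executor import ToolExecutor, get_executor
from aiecs.tools.tool_executor.tool_executor import ToolExecutor as SameClass

assert ToolExecutor is SameClass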