aiecs 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of aiecs might be problematic.
- aiecs/__init__.py +1 -1
- aiecs/config/config.py +2 -0
- aiecs/domain/__init__.py +95 -0
- aiecs/domain/community/__init__.py +159 -0
- aiecs/domain/community/agent_adapter.py +516 -0
- aiecs/domain/community/analytics.py +465 -0
- aiecs/domain/community/collaborative_workflow.py +99 -7
- aiecs/domain/community/communication_hub.py +649 -0
- aiecs/domain/community/community_builder.py +322 -0
- aiecs/domain/community/community_integration.py +365 -12
- aiecs/domain/community/community_manager.py +481 -5
- aiecs/domain/community/decision_engine.py +459 -13
- aiecs/domain/community/exceptions.py +238 -0
- aiecs/domain/community/models/__init__.py +36 -0
- aiecs/domain/community/resource_manager.py +1 -1
- aiecs/domain/community/shared_context_manager.py +621 -0
- aiecs/domain/context/context_engine.py +37 -33
- aiecs/main.py +2 -2
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
- aiecs/scripts/aid/__init__.py +15 -0
- aiecs/scripts/aid/version_manager.py +224 -0
- aiecs/scripts/dependance_check/download_nlp_data.py +1 -0
- aiecs/tools/__init__.py +23 -23
- aiecs/tools/docs/__init__.py +5 -2
- aiecs/tools/docs/ai_document_orchestrator.py +39 -26
- aiecs/tools/docs/ai_document_writer_orchestrator.py +61 -38
- aiecs/tools/docs/content_insertion_tool.py +48 -28
- aiecs/tools/docs/document_creator_tool.py +47 -29
- aiecs/tools/docs/document_layout_tool.py +35 -20
- aiecs/tools/docs/document_parser_tool.py +56 -36
- aiecs/tools/docs/document_writer_tool.py +115 -62
- aiecs/tools/schema_generator.py +56 -56
- aiecs/tools/statistics/__init__.py +82 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +581 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +473 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +629 -0
- aiecs/tools/statistics/data_loader_tool.py +518 -0
- aiecs/tools/statistics/data_profiler_tool.py +599 -0
- aiecs/tools/statistics/data_transformer_tool.py +531 -0
- aiecs/tools/statistics/data_visualizer_tool.py +460 -0
- aiecs/tools/statistics/model_trainer_tool.py +470 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +426 -0
- aiecs/tools/task_tools/chart_tool.py +2 -1
- aiecs/tools/task_tools/image_tool.py +43 -43
- aiecs/tools/task_tools/office_tool.py +39 -36
- aiecs/tools/task_tools/pandas_tool.py +37 -33
- aiecs/tools/task_tools/report_tool.py +67 -56
- aiecs/tools/task_tools/research_tool.py +32 -31
- aiecs/tools/task_tools/scraper_tool.py +53 -46
- aiecs/tools/task_tools/search_tool.py +1123 -0
- aiecs/tools/task_tools/stats_tool.py +20 -15
- {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/METADATA +5 -1
- {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/RECORD +57 -36
- {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/entry_points.txt +1 -0
- aiecs/tools/task_tools/search_api.py +0 -7
- {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/WHEEL +0 -0
- {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/top_level.txt +0 -0
aiecs/tools/statistics/statistical_analyzer_tool.py
@@ -0,0 +1,426 @@
+"""
+Statistical Analyzer Tool - Advanced statistical analysis and hypothesis testing
+
+This tool provides comprehensive statistical analysis with:
+- Descriptive and inferential statistics
+- Hypothesis testing (t-test, ANOVA, chi-square)
+- Regression analysis
+- Time series analysis
+- Correlation and causality analysis
+"""
+
+import logging
+from typing import Dict, Any, List, Optional, Union
+from enum import Enum
+
+import pandas as pd
+import numpy as np
+from scipy import stats as scipy_stats
+from pydantic import BaseModel, Field, ValidationError, ConfigDict
+
+from aiecs.tools.base_tool import BaseTool
+from aiecs.tools import register_tool
+
+
+class AnalysisType(str, Enum):
+    """Types of statistical analyses"""
+    DESCRIPTIVE = "descriptive"
+    T_TEST = "t_test"
+    ANOVA = "anova"
+    CHI_SQUARE = "chi_square"
+    LINEAR_REGRESSION = "linear_regression"
+    LOGISTIC_REGRESSION = "logistic_regression"
+    CORRELATION = "correlation"
+    TIME_SERIES = "time_series"
+
+
+
+
+class StatisticalAnalyzerError(Exception):
+    """Base exception for StatisticalAnalyzer errors"""
+    pass
+
+
+class AnalysisError(StatisticalAnalyzerError):
+    """Raised when analysis fails"""
+    pass
+
+
+@register_tool('statistical_analyzer')
+class StatisticalAnalyzerTool(BaseTool):
+    """
+    Advanced statistical analysis tool that can:
+    1. Perform hypothesis testing
+    2. Conduct regression analysis
+    3. Analyze time series
+    4. Perform correlation and causal analysis
+
+    Integrates with stats_tool for core statistical operations.
+    """
+
+    # Configuration schema
+    class Config(BaseModel):
+        """Configuration for the statistical analyzer tool"""
+        model_config = ConfigDict(env_prefix="STATISTICAL_ANALYZER_")
+
+        significance_level: float = Field(
+            default=0.05,
+            description="Significance level for hypothesis testing"
+        )
+        confidence_level: float = Field(
+            default=0.95,
+            description="Confidence level for statistical intervals"
+        )
+        enable_effect_size: bool = Field(
+            default=True,
+            description="Whether to calculate effect sizes in analyses"
+        )
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """Initialize StatisticalAnalyzerTool with settings"""
+        super().__init__(config)
+
+        # Parse configuration
+        self.config = self.Config(**(config or {}))
+
+        self.logger = logging.getLogger(__name__)
+        if not self.logger.handlers:
+            handler = logging.StreamHandler()
+            handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
+            self.logger.addHandler(handler)
+        self.logger.setLevel(logging.INFO)
+
+        self._init_external_tools()
+
+    def _init_external_tools(self):
+        """Initialize external task tools"""
+        self.external_tools = {}
+
+        try:
+            from aiecs.tools.task_tools.stats_tool import StatsTool
+            self.external_tools['stats'] = StatsTool()
+            self.logger.info("StatsTool initialized successfully")
+        except ImportError:
+            self.logger.warning("StatsTool not available")
+            self.external_tools['stats'] = None
+
+    # Schema definitions
+    class AnalyzeSchema(BaseModel):
+        """Schema for analyze operation"""
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to analyze")
+        analysis_type: AnalysisType = Field(description="Type of analysis to perform")
+        variables: Dict[str, Any] = Field(description="Variables specification")
+        params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters")
+
+    class TestHypothesisSchema(BaseModel):
+        """Schema for test_hypothesis operation"""
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data for hypothesis testing")
+        test_type: str = Field(description="Type of test: t_test, anova, chi_square")
+        variables: Dict[str, Any] = Field(description="Variables for testing")
+
+    class PerformRegressionSchema(BaseModel):
+        """Schema for perform_regression operation"""
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data for regression")
+        dependent_var: str = Field(description="Dependent variable")
+        independent_vars: List[str] = Field(description="Independent variables")
+        regression_type: str = Field(default="linear", description="Type: linear or logistic")
+
+    class AnalyzeCorrelationSchema(BaseModel):
+        """Schema for analyze_correlation operation"""
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data for correlation analysis")
+        variables: Optional[List[str]] = Field(default=None, description="Variables to analyze")
+        method: str = Field(default="pearson", description="Correlation method")
+
+    def analyze(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        analysis_type: AnalysisType,
+        variables: Dict[str, Any],
+        params: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        """
+        Perform statistical analysis.
+
+        Args:
+            data: Data to analyze
+            analysis_type: Type of analysis
+            variables: Variables specification (dependent, independent, etc.)
+            params: Additional parameters
+
+        Returns:
+            Dict containing analysis results with statistics, p-values, interpretations
+        """
+        try:
+            df = self._to_dataframe(data)
+            params = params or {}
+
+            if analysis_type == AnalysisType.DESCRIPTIVE:
+                result = self._descriptive_analysis(df, variables)
+            elif analysis_type == AnalysisType.T_TEST:
+                result = self._t_test_analysis(df, variables, params)
+            elif analysis_type == AnalysisType.ANOVA:
+                result = self._anova_analysis(df, variables, params)
+            elif analysis_type == AnalysisType.CHI_SQUARE:
+                result = self._chi_square_analysis(df, variables, params)
+            elif analysis_type == AnalysisType.LINEAR_REGRESSION:
+                result = self._linear_regression_analysis(df, variables, params)
+            elif analysis_type == AnalysisType.CORRELATION:
+                result = self._correlation_analysis(df, variables, params)
+            else:
+                raise AnalysisError(f"Unsupported analysis type: {analysis_type}")
+
+            result['analysis_type'] = analysis_type.value
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Error in analysis: {e}")
+            raise AnalysisError(f"Analysis failed: {e}")
+
+    def test_hypothesis(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        test_type: str,
+        variables: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """Perform hypothesis testing"""
+        try:
+            df = self._to_dataframe(data)
+
+            if test_type == "t_test":
+                return self._t_test_analysis(df, variables, {})
+            elif test_type == "anova":
+                return self._anova_analysis(df, variables, {})
+            elif test_type == "chi_square":
+                return self._chi_square_analysis(df, variables, {})
+            else:
+                raise AnalysisError(f"Unsupported test type: {test_type}")
+
+        except Exception as e:
+            self.logger.error(f"Error in hypothesis testing: {e}")
+            raise AnalysisError(f"Hypothesis testing failed: {e}")
+
+    def perform_regression(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        dependent_var: str,
+        independent_vars: List[str],
+        regression_type: str = "linear"
+    ) -> Dict[str, Any]:
+        """Perform regression analysis"""
+        try:
+            df = self._to_dataframe(data)
+            variables = {
+                'dependent': dependent_var,
+                'independent': independent_vars
+            }
+
+            if regression_type == "linear":
+                return self._linear_regression_analysis(df, variables, {})
+            else:
+                raise AnalysisError(f"Unsupported regression type: {regression_type}")
+
+        except Exception as e:
+            self.logger.error(f"Error in regression: {e}")
+            raise AnalysisError(f"Regression failed: {e}")
+
+    def analyze_correlation(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        variables: Optional[List[str]] = None,
+        method: str = "pearson"
+    ) -> Dict[str, Any]:
+        """Perform correlation analysis"""
+        try:
+            df = self._to_dataframe(data)
+            var_dict = {'variables': variables} if variables else {}
+            return self._correlation_analysis(df, var_dict, {'method': method})
+
+        except Exception as e:
+            self.logger.error(f"Error in correlation analysis: {e}")
+            raise AnalysisError(f"Correlation analysis failed: {e}")
+
+    # Internal analysis methods
+
+    def _to_dataframe(self, data: Union[Dict, List, pd.DataFrame]) -> pd.DataFrame:
+        """Convert data to DataFrame"""
+        if isinstance(data, pd.DataFrame):
+            return data
+        elif isinstance(data, list):
+            return pd.DataFrame(data)
+        elif isinstance(data, dict):
+            return pd.DataFrame([data])
+        else:
+            raise AnalysisError(f"Unsupported data type: {type(data)}")
+
+    def _descriptive_analysis(self, df: pd.DataFrame, variables: Dict[str, Any]) -> Dict[str, Any]:
+        """Perform descriptive statistics analysis"""
+        cols = variables.get('columns', df.select_dtypes(include=[np.number]).columns.tolist())
+
+        results = {}
+        for col in cols:
+            if col in df.columns:
+                series = df[col].dropna()
+                results[col] = {
+                    'count': int(len(series)),
+                    'mean': float(series.mean()),
+                    'std': float(series.std()),
+                    'min': float(series.min()),
+                    'q25': float(series.quantile(0.25)),
+                    'median': float(series.median()),
+                    'q75': float(series.quantile(0.75)),
+                    'max': float(series.max()),
+                    'skewness': float(series.skew()),
+                    'kurtosis': float(series.kurt())
+                }
+
+        return {
+            'results': results,
+            'interpretation': 'Descriptive statistics computed successfully'
+        }
+
+    def _t_test_analysis(self, df: pd.DataFrame, variables: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
+        """Perform t-test"""
+        var1_name = variables.get('var1')
+        var2_name = variables.get('var2')
+
+        if not var1_name or not var2_name:
+            raise AnalysisError("T-test requires var1 and var2")
+
+        var1 = df[var1_name].dropna()
+        var2 = df[var2_name].dropna()
+
+        statistic, pvalue = scipy_stats.ttest_ind(var1, var2)
+
+        return {
+            'test_type': 't_test',
+            'statistic': float(statistic),
+            'p_value': float(pvalue),
+            'significant': pvalue < self.config.significance_level,
+            'interpretation': f"{'Significant' if pvalue < self.config.significance_level else 'Not significant'} difference at α={self.config.significance_level}",
+            'variables': [var1_name, var2_name]
+        }
+
+    def _anova_analysis(self, df: pd.DataFrame, variables: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
+        """Perform ANOVA"""
+        groups = variables.get('groups', [])
+
+        if len(groups) < 2:
+            raise AnalysisError("ANOVA requires at least 2 groups")
+
+        group_data = [df[group].dropna() for group in groups if group in df.columns]
+
+        if len(group_data) < 2:
+            raise AnalysisError("Insufficient valid groups for ANOVA")
+
+        statistic, pvalue = scipy_stats.f_oneway(*group_data)
+
+        return {
+            'test_type': 'anova',
+            'statistic': float(statistic),
+            'p_value': float(pvalue),
+            'significant': pvalue < self.config.significance_level,
+            'interpretation': f"{'Significant' if pvalue < self.config.significance_level else 'Not significant'} difference between groups",
+            'groups': groups
+        }
+
+    def _chi_square_analysis(self, df: pd.DataFrame, variables: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
+        """Perform chi-square test"""
+        var1_name = variables.get('var1')
+        var2_name = variables.get('var2')
+
+        if not var1_name or not var2_name:
+            raise AnalysisError("Chi-square test requires var1 and var2")
+
+        contingency_table = pd.crosstab(df[var1_name], df[var2_name])
+        statistic, pvalue, dof, expected = scipy_stats.chi2_contingency(contingency_table)
+
+        return {
+            'test_type': 'chi_square',
+            'statistic': float(statistic),
+            'p_value': float(pvalue),
+            'degrees_of_freedom': int(dof),
+            'significant': pvalue < self.config.significance_level,
+            'interpretation': f"{'Significant' if pvalue < self.config.significance_level else 'Not significant'} association",
+            'variables': [var1_name, var2_name]
+        }
+
+    def _linear_regression_analysis(self, df: pd.DataFrame, variables: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
+        """Perform linear regression"""
+        from sklearn.linear_model import LinearRegression
+        from sklearn.metrics import r2_score, mean_squared_error
+
+        dependent = variables.get('dependent')
+        independent = variables.get('independent', [])
+
+        if not dependent or not independent:
+            raise AnalysisError("Regression requires dependent and independent variables")
+
+        X = df[independent].dropna()
+        y = df[dependent].loc[X.index]
+
+        model = LinearRegression()
+        model.fit(X, y)
+
+        y_pred = model.predict(X)
+        r2 = r2_score(y, y_pred)
+        mse = mean_squared_error(y, y_pred)
+
+        coefficients = {var: float(coef) for var, coef in zip(independent, model.coef_)}
+
+        return {
+            'model_type': 'linear_regression',
+            'intercept': float(model.intercept_),
+            'coefficients': coefficients,
+            'r_squared': float(r2),
+            'mse': float(mse),
+            'rmse': float(np.sqrt(mse)),
+            'interpretation': f"Model explains {r2*100:.2f}% of variance",
+            'dependent_variable': dependent,
+            'independent_variables': independent
+        }
+
+    def _correlation_analysis(self, df: pd.DataFrame, variables: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
+        """Perform correlation analysis"""
+        method = params.get('method', 'pearson')
+        cols = variables.get('variables')
+
+        if cols:
+            numeric_df = df[cols].select_dtypes(include=[np.number])
+        else:
+            numeric_df = df.select_dtypes(include=[np.number])
+
+        if numeric_df.shape[1] < 2:
+            raise AnalysisError("Correlation requires at least 2 numeric variables")
+
+        corr_matrix = numeric_df.corr(method=method)
+
+        # Find significant correlations
+        significant_pairs = []
+        for i in range(len(corr_matrix.columns)):
+            for j in range(i+1, len(corr_matrix.columns)):
+                corr_value = corr_matrix.iloc[i, j]
+                if abs(corr_value) > 0.3:  # Threshold for noteworthy correlation
+                    significant_pairs.append({
+                        'var1': corr_matrix.columns[i],
+                        'var2': corr_matrix.columns[j],
+                        'correlation': float(corr_value),
+                        'strength': self._interpret_correlation(corr_value)
+                    })
+
+        return {
+            'method': method,
+            'correlation_matrix': corr_matrix.to_dict(),
+            'significant_correlations': significant_pairs,
+            'interpretation': f"Found {len(significant_pairs)} significant correlations"
+        }
+
+    def _interpret_correlation(self, corr: float) -> str:
+        """Interpret correlation strength"""
+        abs_corr = abs(corr)
+        if abs_corr < 0.3:
+            return "weak"
+        elif abs_corr < 0.7:
+            return "moderate"
+        else:
+            return "strong"
+
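For orientation, here is a minimal usage sketch of the new tool, based only on the signatures above. The sample records and column names (score_a, score_b) are hypothetical, and BaseTool is assumed to accept the plain config dict that StatisticalAnalyzerTool forwards to it.

    # Minimal usage sketch; sample data and column names are hypothetical.
    from aiecs.tools.statistics.statistical_analyzer_tool import (
        StatisticalAnalyzerTool,
        AnalysisType,
    )

    tool = StatisticalAnalyzerTool({"significance_level": 0.01})

    # A list of dicts is converted to a DataFrame by _to_dataframe().
    records = [
        {"score_a": 70.2, "score_b": 68.9},
        {"score_a": 74.1, "score_b": 71.3},
        {"score_a": 69.5, "score_b": 66.8},
        {"score_a": 72.4, "score_b": 69.9},
    ]

    result = tool.analyze(
        data=records,
        analysis_type=AnalysisType.T_TEST,
        variables={"var1": "score_a", "var2": "score_b"},
    )
    print(result["p_value"], result["significant"])

    # Correlation across all numeric columns, Spearman instead of Pearson.
    corr = tool.analyze_correlation(records, method="spearman")
    print(corr["interpretation"])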
aiecs/tools/task_tools/chart_tool.py
@@ -39,8 +39,9 @@ class ChartTool(BaseTool):
 
     # Configuration schema
     class Config(BaseModel):
-        model_config = ConfigDict()
         """Configuration for the chart tool"""
+        model_config = ConfigDict(env_prefix="CHART_TOOL_")
+
         export_dir: str = Field(
             default=os.path.join(tempfile.gettempdir(), 'chart_exports'),
             description="Directory to export files to"
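Two notes on the ChartTool change above. First, ConfigDict(env_prefix=...) on a plain pydantic BaseModel does not by itself read environment variables (that behavior belongs to pydantic-settings' BaseSettings), so the prefix presumably documents the intended CHART_TOOL_* variables for whatever loads them. Second, per-instance overrides still flow through the config dict; a sketch, assuming ChartTool parses config into Config the same way the other tools in this release do:

    # Hypothetical override; assumes ChartTool follows the shared Config pattern.
    from aiecs.tools.task_tools.chart_tool import ChartTool

    chart = ChartTool({"export_dir": "/tmp/my_chart_exports"})
    print(chart.config.export_dir)  # /tmp/my_chart_exports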
aiecs/tools/task_tools/image_tool.py
@@ -7,31 +7,16 @@ from typing import Dict, Any, List, Optional
 from dataclasses import dataclass
 from dataclasses import field
 
-from pydantic import BaseModel, ValidationError, field_validator, ConfigDict
-from pydantic_settings import BaseSettings
+from pydantic import BaseModel, ValidationError, field_validator, ConfigDict, Field
 from PIL import Image, ExifTags, ImageFilter
 from queue import Queue
 
 from aiecs.tools.base_tool import BaseTool
 from aiecs.tools import register_tool
 
-#
-class ImageSettings(BaseSettings):
-    """
-    Configuration for ImageTool.
-
-    Attributes:
-        max_file_size_mb (int): Maximum file size in megabytes.
-        allowed_extensions (List[str]): Allowed image file extensions.
-        tesseract_pool_size (int): Number of Tesseract processes for OCR.
-        env_prefix (str): Environment variable prefix for settings.
-    """
-    max_file_size_mb: int = 50
-    allowed_extensions: List[str] = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.gif']
-    tesseract_pool_size: int = 2
-    env_prefix: str = 'IMAGE_TOOL_'
-
-    model_config = ConfigDict(env_prefix='IMAGE_TOOL_')
+# Module-level default configuration for validators
+_DEFAULT_MAX_FILE_SIZE_MB = 50
+_DEFAULT_ALLOWED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.gif']
 
 # Exceptions
 class ImageToolError(Exception):
@@ -55,16 +40,15 @@ class BaseFileSchema(BaseModel):
     @classmethod
     def validate_file_path(cls, v: str) -> str:
         """Validate file path for existence, size, and extension."""
-        settings = ImageSettings()
         abs_path = os.path.abspath(os.path.normpath(v))
         ext = os.path.splitext(abs_path)[1].lower()
-        if ext not in settings.allowed_extensions:
-            raise SecurityError(f"Extension '{ext}' not allowed, expected {settings.allowed_extensions}")
+        if ext not in _DEFAULT_ALLOWED_EXTENSIONS:
+            raise SecurityError(f"Extension '{ext}' not allowed, expected {_DEFAULT_ALLOWED_EXTENSIONS}")
         if not os.path.isfile(abs_path):
             raise FileOperationError(f"File not found: {abs_path}")
         size_mb = os.path.getsize(abs_path) / (1024 * 1024)
-        if size_mb > settings.max_file_size_mb:
-            raise FileOperationError(f"File too large: {size_mb:.1f}MB, max {settings.max_file_size_mb}MB")
+        if size_mb > _DEFAULT_MAX_FILE_SIZE_MB:
+            raise FileOperationError(f"File too large: {size_mb:.1f}MB, max {_DEFAULT_MAX_FILE_SIZE_MB}MB")
         return abs_path
 
 # Schemas for operations
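One consequence of replacing ImageSettings with module-level defaults: the schema validators now check against fixed constants rather than a live settings instance, so per-instance overrides of allowed_extensions or max_file_size_mb no longer affect path validation. A small sketch of the new behavior, assuming BaseFileSchema and SecurityError are importable from the module; the path is hypothetical:

    # The classmethod can be exercised directly; '.webp' is outside the defaults.
    from aiecs.tools.task_tools.image_tool import BaseFileSchema, SecurityError

    try:
        BaseFileSchema.validate_file_path("/tmp/photo.webp")
    except SecurityError as exc:
        print(exc)  # Extension '.webp' not allowed, expected ['.jpg', ...]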
@@ -91,11 +75,10 @@ class ResizeSchema(BaseFileSchema):
     @classmethod
     def validate_output_path(cls, v: str) -> str:
         """Validate output path for existence and extension."""
-        settings = ImageSettings()
         abs_path = os.path.abspath(os.path.normpath(v))
         ext = os.path.splitext(abs_path)[1].lower()
-        if ext not in settings.allowed_extensions:
-            raise SecurityError(f"Output extension '{ext}' not allowed, expected {settings.allowed_extensions}")
+        if ext not in _DEFAULT_ALLOWED_EXTENSIONS:
+            raise SecurityError(f"Output extension '{ext}' not allowed, expected {_DEFAULT_ALLOWED_EXTENSIONS}")
         if os.path.exists(abs_path):
             raise FileOperationError(f"Output file already exists: {abs_path}")
         return abs_path
@@ -118,11 +101,10 @@ class FilterSchema(BaseFileSchema):
     @classmethod
     def validate_output_path(cls, v: str) -> str:
         """Validate output path for existence and extension."""
-        settings = ImageSettings()
         abs_path = os.path.abspath(os.path.normpath(v))
         ext = os.path.splitext(abs_path)[1].lower()
-        if ext not in settings.allowed_extensions:
-            raise SecurityError(f"Output extension '{ext}' not allowed, expected {settings.allowed_extensions}")
+        if ext not in _DEFAULT_ALLOWED_EXTENSIONS:
+            raise SecurityError(f"Output extension '{ext}' not allowed, expected {_DEFAULT_ALLOWED_EXTENSIONS}")
         if os.path.exists(abs_path):
             raise FileOperationError(f"Output file already exists: {abs_path}")
         return abs_path
@@ -183,38 +165,56 @@ class ImageTool(BaseTool):
 
     Inherits from BaseTool to leverage ToolExecutor for caching, concurrency, and error handling.
     """
-
+
+    # Configuration schema
+    class Config(BaseModel):
+        """Configuration for the image tool"""
+        model_config = ConfigDict(env_prefix="IMAGE_TOOL_")
+
+        max_file_size_mb: int = Field(
+            default=50,
+            description="Maximum file size in megabytes"
+        )
+        allowed_extensions: List[str] = Field(
+            default=['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.gif'],
+            description="Allowed image file extensions"
+        )
+        tesseract_pool_size: int = Field(
+            default=2,
+            description="Number of Tesseract processes for OCR"
+        )
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
         """
-        Initialize ImageTool with
+        Initialize ImageTool with configuration and resources.
 
         Args:
-            config (Dict, optional): Configuration overrides for
+            config (Dict, optional): Configuration overrides for ImageTool.
 
         Raises:
             ValueError: If config contains invalid settings.
         """
         super().__init__(config)
-
-
-
-
-        except ValidationError as e:
-            raise ValueError(f"Invalid configuration: {e}")
+
+        # Parse configuration
+        self.config = self.Config(**(config or {}))
+
         self.logger = logging.getLogger(__name__)
         if not self.logger.handlers:
             handler = logging.StreamHandler()
             handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
             self.logger.addHandler(handler)
         self.logger.setLevel(logging.INFO)
+
         # Initialize Tesseract manager
-        self._tesseract_manager = TesseractManager(self.
+        self._tesseract_manager = TesseractManager(self.config.tesseract_pool_size)
         self._tesseract_manager.initialize()
 
     def __del__(self):
         """Clean up Tesseract processes on destruction."""
         self._tesseract_manager.cleanup()
 
-    def
+    def update_config(self, config: Dict) -> None:
         """
         Update configuration settings dynamically.
 
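The nested Config above mirrors the fields of the removed ImageSettings. A construction sketch with illustrative values:

    # Illustrative values only; defaults apply to anything not overridden.
    from aiecs.tools.task_tools.image_tool import ImageTool

    img_tool = ImageTool({"max_file_size_mb": 100, "tesseract_pool_size": 4})
    print(img_tool.config.max_file_size_mb)    # 100
    print(img_tool.config.allowed_extensions)  # ['.jpg', '.jpeg', '.png', ...]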
@@ -225,11 +225,11 @@ class ImageTool(BaseTool):
             ValueError: If config contains invalid settings.
         """
         try:
-            self.
+            self.config = self.Config(**{**self.config.model_dump(), **config})
             # Reinitialize Tesseract if pool size changes
             if 'tesseract_pool_size' in config:
                 self._tesseract_manager.cleanup()
-                self._tesseract_manager = TesseractManager(self.
+                self._tesseract_manager = TesseractManager(self.config.tesseract_pool_size)
                 self._tesseract_manager.initialize()
         except ValidationError as e:
             raise ValueError(f"Invalid configuration: {e}")
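Finally, a sketch of the reworked update_config, which merges overrides into the existing model via model_dump() and rebuilds the Tesseract pool when its size changes:

    # Continuing the sketch above: pool size changes trigger reinitialization.
    img_tool.update_config({"tesseract_pool_size": 8})
    print(img_tool.config.tesseract_pool_size)  # 8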