aiecs 1.1.0-py3-none-any.whl → 1.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of aiecs might be problematic.

Files changed (58)
  1. aiecs/__init__.py +1 -1
  2. aiecs/config/config.py +2 -0
  3. aiecs/domain/__init__.py +95 -0
  4. aiecs/domain/community/__init__.py +159 -0
  5. aiecs/domain/community/agent_adapter.py +516 -0
  6. aiecs/domain/community/analytics.py +465 -0
  7. aiecs/domain/community/collaborative_workflow.py +99 -7
  8. aiecs/domain/community/communication_hub.py +649 -0
  9. aiecs/domain/community/community_builder.py +322 -0
  10. aiecs/domain/community/community_integration.py +365 -12
  11. aiecs/domain/community/community_manager.py +481 -5
  12. aiecs/domain/community/decision_engine.py +459 -13
  13. aiecs/domain/community/exceptions.py +238 -0
  14. aiecs/domain/community/models/__init__.py +36 -0
  15. aiecs/domain/community/resource_manager.py +1 -1
  16. aiecs/domain/community/shared_context_manager.py +621 -0
  17. aiecs/domain/context/context_engine.py +37 -33
  18. aiecs/main.py +2 -2
  19. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  20. aiecs/scripts/aid/__init__.py +15 -0
  21. aiecs/scripts/aid/version_manager.py +224 -0
  22. aiecs/scripts/dependance_check/download_nlp_data.py +1 -0
  23. aiecs/tools/__init__.py +23 -23
  24. aiecs/tools/docs/__init__.py +5 -2
  25. aiecs/tools/docs/ai_document_orchestrator.py +39 -26
  26. aiecs/tools/docs/ai_document_writer_orchestrator.py +61 -38
  27. aiecs/tools/docs/content_insertion_tool.py +48 -28
  28. aiecs/tools/docs/document_creator_tool.py +47 -29
  29. aiecs/tools/docs/document_layout_tool.py +35 -20
  30. aiecs/tools/docs/document_parser_tool.py +56 -36
  31. aiecs/tools/docs/document_writer_tool.py +115 -62
  32. aiecs/tools/schema_generator.py +56 -56
  33. aiecs/tools/statistics/__init__.py +82 -0
  34. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +581 -0
  35. aiecs/tools/statistics/ai_insight_generator_tool.py +473 -0
  36. aiecs/tools/statistics/ai_report_orchestrator_tool.py +629 -0
  37. aiecs/tools/statistics/data_loader_tool.py +518 -0
  38. aiecs/tools/statistics/data_profiler_tool.py +599 -0
  39. aiecs/tools/statistics/data_transformer_tool.py +531 -0
  40. aiecs/tools/statistics/data_visualizer_tool.py +460 -0
  41. aiecs/tools/statistics/model_trainer_tool.py +470 -0
  42. aiecs/tools/statistics/statistical_analyzer_tool.py +426 -0
  43. aiecs/tools/task_tools/chart_tool.py +2 -1
  44. aiecs/tools/task_tools/image_tool.py +43 -43
  45. aiecs/tools/task_tools/office_tool.py +39 -36
  46. aiecs/tools/task_tools/pandas_tool.py +37 -33
  47. aiecs/tools/task_tools/report_tool.py +67 -56
  48. aiecs/tools/task_tools/research_tool.py +32 -31
  49. aiecs/tools/task_tools/scraper_tool.py +53 -46
  50. aiecs/tools/task_tools/search_tool.py +1123 -0
  51. aiecs/tools/task_tools/stats_tool.py +20 -15
  52. {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/METADATA +5 -1
  53. {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/RECORD +57 -36
  54. {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/entry_points.txt +1 -0
  55. aiecs/tools/task_tools/search_api.py +0 -7
  56. {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/WHEEL +0 -0
  57. {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/licenses/LICENSE +0 -0
  58. {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/top_level.txt +0 -0
aiecs/tools/statistics/data_profiler_tool.py (new file)
@@ -0,0 +1,599 @@
+ """
+ Data Profiler Tool - Comprehensive data profiling and quality assessment
+
+ This tool provides advanced data profiling capabilities with:
+ - Statistical summaries and distributions
+ - Data quality issue detection
+ - Pattern and anomaly identification
+ - Preprocessing recommendations
+ - Column-level and dataset-level analysis
+ """
+
+ import logging
+ from typing import Dict, Any, List, Optional, Union
+ from enum import Enum
+
+ import pandas as pd
+ import numpy as np
+ from pydantic import BaseModel, Field, ValidationError, ConfigDict
+ from scipy import stats as scipy_stats
+
+ from aiecs.tools.base_tool import BaseTool
+ from aiecs.tools import register_tool
+
+
+ class ProfileLevel(str, Enum):
+     """Data profiling depth levels"""
+     BASIC = "basic"
+     STANDARD = "standard"
+     COMPREHENSIVE = "comprehensive"
+     DEEP = "deep"
+
+
+ class DataQualityCheck(str, Enum):
+     """Types of data quality checks"""
+     MISSING_VALUES = "missing_values"
+     DUPLICATES = "duplicates"
+     OUTLIERS = "outliers"
+     INCONSISTENCIES = "inconsistencies"
+     DATA_TYPES = "data_types"
+     DISTRIBUTIONS = "distributions"
+     CORRELATIONS = "correlations"
+
+
+
+
+ class DataProfilerError(Exception):
+     """Base exception for DataProfiler errors"""
+     pass
+
+
+ class ProfilingError(DataProfilerError):
+     """Raised when profiling operation fails"""
+     pass
+
+
+ @register_tool('data_profiler')
+ class DataProfilerTool(BaseTool):
+     """
+     Comprehensive data profiling tool that can:
+     1. Generate statistical summaries
+     2. Detect data quality issues
+     3. Identify patterns and anomalies
+     4. Recommend preprocessing steps
+
+     Integrates with stats_tool and pandas_tool for core operations.
+     """
+
+     # Configuration schema
+     class Config(BaseModel):
+         """Configuration for the data profiler tool"""
+         model_config = ConfigDict(env_prefix="DATA_PROFILER_")
+
+         default_profile_level: str = Field(
+             default="standard",
+             description="Default profiling depth level"
+         )
+         outlier_std_threshold: float = Field(
+             default=3.0,
+             description="Standard deviation threshold for outlier detection"
+         )
+         correlation_threshold: float = Field(
+             default=0.7,
+             description="Correlation threshold for identifying strong relationships"
+         )
+         missing_threshold: float = Field(
+             default=0.5,
+             description="Missing value threshold for quality assessment"
+         )
+         enable_visualizations: bool = Field(
+             default=True,
+             description="Whether to enable visualization generation"
+         )
+         max_unique_values_categorical: int = Field(
+             default=50,
+             description="Maximum unique values for categorical analysis"
+         )
+
+     def __init__(self, config: Optional[Dict[str, Any]] = None):
+         """
+         Initialize DataProfilerTool with settings.
+
+         Args:
+             config: Optional configuration overrides
+         """
+         super().__init__(config)
+
+         # Parse configuration
+         self.config = self.Config(**(config or {}))
+
+         self.logger = logging.getLogger(__name__)
+         if not self.logger.handlers:
+             handler = logging.StreamHandler()
+             handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
+             self.logger.addHandler(handler)
+         self.logger.setLevel(logging.INFO)
+
+         # Initialize external tools
+         self._init_external_tools()
+
+     def _init_external_tools(self):
+         """Initialize external task tools"""
+         self.external_tools = {}
+
+         # Initialize StatsTool for statistical operations
+         try:
+             from aiecs.tools.task_tools.stats_tool import StatsTool
+             self.external_tools['stats'] = StatsTool()
+             self.logger.info("StatsTool initialized successfully")
+         except ImportError:
+             self.logger.warning("StatsTool not available")
+             self.external_tools['stats'] = None
+
+         # Initialize PandasTool for data operations
+         try:
+             from aiecs.tools.task_tools.pandas_tool import PandasTool
+             self.external_tools['pandas'] = PandasTool()
+             self.logger.info("PandasTool initialized successfully")
+         except ImportError:
+             self.logger.warning("PandasTool not available")
+             self.external_tools['pandas'] = None
+
+     # Schema definitions
+     class ProfileDatasetSchema(BaseModel):
+         """Schema for profile_dataset operation"""
+         data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to profile")
+         level: ProfileLevel = Field(default=ProfileLevel.STANDARD, description="Profiling depth level")
+         checks: Optional[List[DataQualityCheck]] = Field(default=None, description="Specific quality checks to perform")
+         generate_visualizations: bool = Field(default=False, description="Generate visualization data")
+
+     class DetectQualityIssuesSchema(BaseModel):
+         """Schema for detect_quality_issues operation"""
+         data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to check")
+         checks: Optional[List[DataQualityCheck]] = Field(default=None, description="Specific checks to perform")
+
+     class RecommendPreprocessingSchema(BaseModel):
+         """Schema for recommend_preprocessing operation"""
+         data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to analyze")
+         target_column: Optional[str] = Field(default=None, description="Target column for ML tasks")
+
+     def profile_dataset(
+         self,
+         data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+         level: ProfileLevel = ProfileLevel.STANDARD,
+         checks: Optional[List[DataQualityCheck]] = None,
+         generate_visualizations: bool = False
+     ) -> Dict[str, Any]:
+         """
+         Generate comprehensive data profile.
+
+         Args:
+             data: Data to profile (dict, list of dicts, or DataFrame)
+             level: Profiling depth level
+             checks: Specific quality checks to perform (all if None)
+             generate_visualizations: Whether to generate visualization data
+
+         Returns:
+             Dict containing:
+             - summary: Dataset-level summary
+             - column_profiles: Column-level profiles
+             - quality_issues: Detected quality issues
+             - correlations: Correlation analysis
+             - recommendations: Preprocessing recommendations
+
+         Raises:
+             ProfilingError: If profiling fails
+         """
+         try:
+             # Convert to DataFrame if needed
+             df = self._to_dataframe(data)
+
+             self.logger.info(f"Profiling dataset with {len(df)} rows and {len(df.columns)} columns")
+
+             # Generate summary
+             summary = self._generate_summary(df)
+
+             # Generate column profiles
+             column_profiles = self._profile_columns(df, level)
+
+             # Detect quality issues
+             quality_issues = self._detect_quality_issues(df, checks)
+
+             # Correlation analysis (for comprehensive and deep levels)
+             correlations = {}
+             if level in [ProfileLevel.COMPREHENSIVE, ProfileLevel.DEEP]:
+                 correlations = self._analyze_correlations(df)
+
+             # Generate recommendations
+             recommendations = self._generate_recommendations(df, quality_issues, level)
+
+             # Generate visualization data if requested
+             visualization_data = {}
+             if generate_visualizations:
+                 visualization_data = self._generate_visualization_data(df)
+
+             result = {
+                 'summary': summary,
+                 'column_profiles': column_profiles,
+                 'quality_issues': quality_issues,
+                 'correlations': correlations,
+                 'recommendations': recommendations,
+                 'profile_level': level.value
+             }
+
+             if visualization_data:
+                 result['visualization_data'] = visualization_data
+
+             self.logger.info("Dataset profiling completed successfully")
+             return result
+
+         except Exception as e:
+             self.logger.error(f"Error profiling dataset: {e}")
+             raise ProfilingError(f"Failed to profile dataset: {e}")
+
+     def detect_quality_issues(
+         self,
+         data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+         checks: Optional[List[DataQualityCheck]] = None
+     ) -> Dict[str, Any]:
+         """
+         Detect data quality issues.
+
+         Args:
+             data: Data to check
+             checks: Specific checks to perform (all if None)
+
+         Returns:
+             Dict containing detected issues by category
+         """
+         try:
+             df = self._to_dataframe(data)
+             issues = self._detect_quality_issues(df, checks)
+
+             return {
+                 'issues': issues,
+                 'total_issues': sum(len(v) for v in issues.values()),
+                 'severity_counts': self._categorize_severity(issues)
+             }
+
+         except Exception as e:
+             self.logger.error(f"Error detecting quality issues: {e}")
+             raise ProfilingError(f"Failed to detect quality issues: {e}")
+
+     def recommend_preprocessing(
+         self,
+         data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+         target_column: Optional[str] = None
+     ) -> Dict[str, Any]:
+         """
+         Recommend preprocessing steps based on data analysis.
+
+         Args:
+             data: Data to analyze
+             target_column: Target column for ML tasks (if applicable)
+
+         Returns:
+             Dict containing recommended preprocessing steps
+         """
+         try:
+             df = self._to_dataframe(data)
+
+             # Detect quality issues
+             quality_issues = self._detect_quality_issues(df, None)
+
+             # Generate recommendations
+             recommendations = self._generate_recommendations(df, quality_issues, ProfileLevel.COMPREHENSIVE)
+
+             # Add task-specific recommendations
+             if target_column and target_column in df.columns:
+                 task_recommendations = self._generate_task_recommendations(df, target_column)
+                 recommendations.extend(task_recommendations)
+
+             # Prioritize recommendations
+             prioritized = self._prioritize_recommendations(recommendations)
+
+             return {
+                 'recommendations': prioritized,
+                 'total_steps': len(prioritized),
+                 'estimated_impact': 'medium'  # Placeholder for impact estimation
+             }
+
+         except Exception as e:
+             self.logger.error(f"Error generating recommendations: {e}")
+             raise ProfilingError(f"Failed to generate recommendations: {e}")
+
+     # Internal helper methods
+
+     def _to_dataframe(self, data: Union[Dict, List, pd.DataFrame]) -> pd.DataFrame:
+         """Convert data to DataFrame"""
+         if isinstance(data, pd.DataFrame):
+             return data
+         elif isinstance(data, list):
+             return pd.DataFrame(data)
+         elif isinstance(data, dict):
+             return pd.DataFrame([data])
+         else:
+             raise ProfilingError(f"Unsupported data type: {type(data)}")
+
+     def _generate_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
+         """Generate dataset-level summary"""
+         numeric_cols = df.select_dtypes(include=[np.number]).columns
+         categorical_cols = df.select_dtypes(include=['object', 'category']).columns
+
+         return {
+             'rows': len(df),
+             'columns': len(df.columns),
+             'numeric_columns': len(numeric_cols),
+             'categorical_columns': len(categorical_cols),
+             'memory_usage_mb': df.memory_usage(deep=True).sum() / (1024 * 1024),
+             'missing_cells': df.isnull().sum().sum(),
+             'missing_percentage': (df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100) if len(df) > 0 else 0,
+             'duplicate_rows': df.duplicated().sum(),
+             'duplicate_percentage': (df.duplicated().sum() / len(df) * 100) if len(df) > 0 else 0
+         }
+
+     def _profile_columns(self, df: pd.DataFrame, level: ProfileLevel) -> Dict[str, Dict[str, Any]]:
+         """Generate column-level profiles"""
+         profiles = {}
+
+         for col in df.columns:
+             profile = {
+                 'name': col,
+                 'dtype': str(df[col].dtype),
+                 'missing_count': df[col].isnull().sum(),
+                 'missing_percentage': (df[col].isnull().sum() / len(df) * 100) if len(df) > 0 else 0,
+                 'unique_count': df[col].nunique(),
+                 'unique_percentage': (df[col].nunique() / len(df) * 100) if len(df) > 0 else 0
+             }
+
+             # Add type-specific statistics
+             if df[col].dtype in ['int64', 'float64']:
+                 profile.update(self._profile_numeric_column(df[col], level))
+             else:
+                 profile.update(self._profile_categorical_column(df[col], level))
+
+             profiles[col] = profile
+
+         return profiles
+
+     def _profile_numeric_column(self, series: pd.Series, level: ProfileLevel) -> Dict[str, Any]:
+         """Profile numeric column"""
+         profile = {
+             'type': 'numeric',
+             'min': float(series.min()) if not series.empty else None,
+             'max': float(series.max()) if not series.empty else None,
+             'mean': float(series.mean()) if not series.empty else None,
+             'median': float(series.median()) if not series.empty else None,
+             'std': float(series.std()) if not series.empty else None
+         }
+
+         if level in [ProfileLevel.COMPREHENSIVE, ProfileLevel.DEEP]:
+             profile.update({
+                 'q25': float(series.quantile(0.25)) if not series.empty else None,
+                 'q75': float(series.quantile(0.75)) if not series.empty else None,
+                 'skewness': float(series.skew()) if not series.empty else None,
+                 'kurtosis': float(series.kurt()) if not series.empty else None
+             })
+
+         # Detect outliers
+         if not series.empty and series.std() > 0:
+             z_scores = np.abs((series - series.mean()) / series.std())
+             outlier_count = (z_scores > self.config.outlier_std_threshold).sum()
+             profile['outlier_count'] = int(outlier_count)
+             profile['outlier_percentage'] = float(outlier_count / len(series) * 100)
+
+         return profile
+
+     def _profile_categorical_column(self, series: pd.Series, level: ProfileLevel) -> Dict[str, Any]:
+         """Profile categorical column"""
+         value_counts = series.value_counts()
+
+         profile = {
+             'type': 'categorical',
+             'unique_values': int(series.nunique()),
+             'most_common': str(value_counts.index[0]) if not value_counts.empty else None,
+             'most_common_count': int(value_counts.iloc[0]) if not value_counts.empty else None
+         }
+
+         if level in [ProfileLevel.COMPREHENSIVE, ProfileLevel.DEEP]:
+             # Add top categories
+             top_n = min(10, len(value_counts))
+             profile['top_categories'] = {
+                 str(k): int(v) for k, v in value_counts.head(top_n).items()
+             }
+
+         return profile
+
+     def _detect_quality_issues(self, df: pd.DataFrame, checks: Optional[List[DataQualityCheck]]) -> Dict[str, List[Dict[str, Any]]]:
+         """Detect data quality issues"""
+         issues = {
+             'missing_values': [],
+             'duplicates': [],
+             'outliers': [],
+             'inconsistencies': [],
+             'data_types': [],
+             'distributions': [],
+             'correlations': []
+         }
+
+         # All checks by default
+         if checks is None:
+             checks = list(DataQualityCheck)
+
+         # Missing values check
+         if DataQualityCheck.MISSING_VALUES in checks:
+             for col in df.columns:
+                 missing_pct = (df[col].isnull().sum() / len(df) * 100) if len(df) > 0 else 0
+                 if missing_pct > 0:
+                     issues['missing_values'].append({
+                         'column': col,
+                         'missing_percentage': missing_pct,
+                         'severity': 'high' if missing_pct > self.config.missing_threshold * 100 else 'medium'
+                     })
+
+         # Duplicates check
+         if DataQualityCheck.DUPLICATES in checks:
+             dup_count = df.duplicated().sum()
+             if dup_count > 0:
+                 issues['duplicates'].append({
+                     'type': 'row_duplicates',
+                     'count': int(dup_count),
+                     'percentage': float(dup_count / len(df) * 100) if len(df) > 0 else 0,
+                     'severity': 'medium'
+                 })
+
+         # Outliers check
+         if DataQualityCheck.OUTLIERS in checks:
+             numeric_cols = df.select_dtypes(include=[np.number]).columns
+             for col in numeric_cols:
+                 if df[col].std() > 0:
+                     z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())
+                     outlier_count = (z_scores > self.config.outlier_std_threshold).sum()
+                     if outlier_count > 0:
+                         issues['outliers'].append({
+                             'column': col,
+                             'count': int(outlier_count),
+                             'percentage': float(outlier_count / len(df) * 100),
+                             'severity': 'low'
+                         })
+
+         return issues
+
+     def _analyze_correlations(self, df: pd.DataFrame) -> Dict[str, Any]:
+         """Analyze correlations between numeric columns"""
+         numeric_df = df.select_dtypes(include=[np.number])
+
+         if numeric_df.shape[1] < 2:
+             return {'message': 'Insufficient numeric columns for correlation analysis'}
+
+         corr_matrix = numeric_df.corr()
+
+         # Find high correlations
+         high_corr_pairs = []
+         for i in range(len(corr_matrix.columns)):
+             for j in range(i + 1, len(corr_matrix.columns)):
+                 corr_value = corr_matrix.iloc[i, j]
+                 if abs(corr_value) > self.config.correlation_threshold:
+                     high_corr_pairs.append({
+                         'column1': corr_matrix.columns[i],
+                         'column2': corr_matrix.columns[j],
+                         'correlation': float(corr_value)
+                     })
+
+         return {
+             'correlation_matrix': corr_matrix.to_dict(),
+             'high_correlations': high_corr_pairs,
+             'num_high_correlations': len(high_corr_pairs)
+         }
+
+     def _generate_recommendations(self, df: pd.DataFrame, quality_issues: Dict[str, List], level: ProfileLevel) -> List[Dict[str, Any]]:
+         """Generate preprocessing recommendations"""
+         recommendations = []
+
+         # Missing value recommendations
+         for issue in quality_issues.get('missing_values', []):
+             if issue['missing_percentage'] < 5:
+                 recommendations.append({
+                     'action': 'drop_missing_rows',
+                     'column': issue['column'],
+                     'reason': f"Low missing percentage ({issue['missing_percentage']:.2f}%)",
+                     'priority': 'medium'
+                 })
+             elif issue['missing_percentage'] < 50:
+                 recommendations.append({
+                     'action': 'impute_missing',
+                     'column': issue['column'],
+                     'method': 'mean' if df[issue['column']].dtype in ['int64', 'float64'] else 'mode',
+                     'reason': f"Moderate missing percentage ({issue['missing_percentage']:.2f}%)",
+                     'priority': 'high'
+                 })
+             else:
+                 recommendations.append({
+                     'action': 'consider_dropping_column',
+                     'column': issue['column'],
+                     'reason': f"High missing percentage ({issue['missing_percentage']:.2f}%)",
+                     'priority': 'high'
+                 })
+
+         # Duplicate recommendations
+         if quality_issues.get('duplicates'):
+             recommendations.append({
+                 'action': 'remove_duplicates',
+                 'reason': f"{quality_issues['duplicates'][0]['count']} duplicate rows found",
+                 'priority': 'high'
+             })
+
+         # Outlier recommendations
+         if quality_issues.get('outliers'):
+             for issue in quality_issues['outliers']:
+                 if issue['percentage'] > 5:
+                     recommendations.append({
+                         'action': 'handle_outliers',
+                         'column': issue['column'],
+                         'method': 'winsorize or cap',
+                         'reason': f"Significant outliers detected ({issue['percentage']:.2f}%)",
+                         'priority': 'medium'
+                     })
+
+         return recommendations
+
+     def _generate_task_recommendations(self, df: pd.DataFrame, target_column: str) -> List[Dict[str, Any]]:
+         """Generate task-specific recommendations"""
+         recommendations = []
+
+         # Check if target is numeric or categorical
+         if df[target_column].dtype in ['int64', 'float64']:
+             task_type = 'regression'
+         else:
+             task_type = 'classification'
+
+         recommendations.append({
+             'action': 'task_identified',
+             'task_type': task_type,
+             'target_column': target_column,
+             'reason': f"Based on target column type: {df[target_column].dtype}",
+             'priority': 'info'
+         })
+
+         return recommendations
+
+     def _prioritize_recommendations(self, recommendations: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         """Prioritize recommendations by importance"""
+         priority_order = {'high': 0, 'medium': 1, 'low': 2, 'info': 3}
+         return sorted(recommendations, key=lambda x: priority_order.get(x.get('priority', 'low'), 2))
+
+     def _categorize_severity(self, issues: Dict[str, List]) -> Dict[str, int]:
+         """Categorize issues by severity"""
+         severity_counts = {'high': 0, 'medium': 0, 'low': 0}
+
+         for issue_list in issues.values():
+             for issue in issue_list:
+                 severity = issue.get('severity', 'low')
+                 severity_counts[severity] = severity_counts.get(severity, 0) + 1
+
+         return severity_counts
+
+     def _generate_visualization_data(self, df: pd.DataFrame) -> Dict[str, Any]:
+         """Generate data for visualizations"""
+         viz_data = {}
+
+         # Numeric distributions
+         numeric_cols = df.select_dtypes(include=[np.number]).columns
+         if len(numeric_cols) > 0:
+             viz_data['numeric_distributions'] = {
+                 col: {
+                     'values': df[col].dropna().tolist()[:1000],  # Sample for performance
+                     'bins': 30
+                 } for col in numeric_cols[:5]  # Limit to first 5
+             }
+
+         # Categorical distributions
+         categorical_cols = df.select_dtypes(include=['object', 'category']).columns
+         if len(categorical_cols) > 0:
+             viz_data['categorical_distributions'] = {
+                 col: df[col].value_counts().head(10).to_dict()
+                 for col in categorical_cols[:5]  # Limit to first 5
+             }
+
+         return viz_data
+
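For orientation, here is a minimal usage sketch of the new data_profiler tool added in this release. The import path and method signatures follow the file shown above; the example DataFrame, the config override dict, and the direct-constructor call are assumptions for illustration, and the exact BaseTool invocation pattern in aiecs may differ.

import pandas as pd

from aiecs.tools.statistics.data_profiler_tool import DataProfilerTool, ProfileLevel

# Hypothetical example data: one numeric and one categorical column, both with gaps
df = pd.DataFrame({
    "age": [23, 35, None, 41, 35],
    "city": ["Oslo", "Lima", "Oslo", None, "Lima"],
})

# The constructor takes an optional dict of config overrides
# (the Config model above also declares a DATA_PROFILER_ env prefix)
profiler = DataProfilerTool({"outlier_std_threshold": 2.5})

# COMPREHENSIVE level adds quartiles, skewness/kurtosis, and correlation analysis
profile = profiler.profile_dataset(df, level=ProfileLevel.COMPREHENSIVE)
print(profile["summary"]["missing_percentage"])
print(profile["quality_issues"]["missing_values"])

# Preprocessing suggestions, optionally steered by a target column for ML tasks
plan = profiler.recommend_preprocessing(df, target_column="age")
print(plan["recommendations"])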