tech-hub-skills 1.2.0 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/{LICENSE → .claude/LICENSE} +21 -21
- package/.claude/README.md +291 -0
- package/.claude/bin/cli.js +266 -0
- package/{bin → .claude/bin}/copilot.js +182 -182
- package/{bin → .claude/bin}/postinstall.js +42 -42
- package/{tech_hub_skills/skills → .claude/commands}/README.md +336 -336
- package/{tech_hub_skills/skills → .claude/commands}/ai-engineer.md +104 -104
- package/{tech_hub_skills/skills → .claude/commands}/aws.md +143 -143
- package/{tech_hub_skills/skills → .claude/commands}/azure.md +149 -149
- package/{tech_hub_skills/skills → .claude/commands}/backend-developer.md +108 -108
- package/{tech_hub_skills/skills → .claude/commands}/code-review.md +399 -399
- package/{tech_hub_skills/skills → .claude/commands}/compliance-automation.md +747 -747
- package/{tech_hub_skills/skills → .claude/commands}/compliance-officer.md +108 -108
- package/{tech_hub_skills/skills → .claude/commands}/data-engineer.md +113 -113
- package/{tech_hub_skills/skills → .claude/commands}/data-governance.md +102 -102
- package/{tech_hub_skills/skills → .claude/commands}/data-scientist.md +123 -123
- package/{tech_hub_skills/skills → .claude/commands}/database-admin.md +109 -109
- package/{tech_hub_skills/skills → .claude/commands}/devops.md +160 -160
- package/{tech_hub_skills/skills → .claude/commands}/docker.md +160 -160
- package/{tech_hub_skills/skills → .claude/commands}/enterprise-dashboard.md +613 -613
- package/{tech_hub_skills/skills → .claude/commands}/finops.md +184 -184
- package/{tech_hub_skills/skills → .claude/commands}/frontend-developer.md +108 -108
- package/{tech_hub_skills/skills → .claude/commands}/gcp.md +143 -143
- package/{tech_hub_skills/skills → .claude/commands}/ml-engineer.md +115 -115
- package/{tech_hub_skills/skills → .claude/commands}/mlops.md +187 -187
- package/{tech_hub_skills/skills → .claude/commands}/network-engineer.md +109 -109
- package/{tech_hub_skills/skills → .claude/commands}/optimization-advisor.md +329 -329
- package/{tech_hub_skills/skills → .claude/commands}/orchestrator.md +623 -623
- package/{tech_hub_skills/skills → .claude/commands}/platform-engineer.md +102 -102
- package/{tech_hub_skills/skills → .claude/commands}/process-automation.md +226 -226
- package/{tech_hub_skills/skills → .claude/commands}/process-changelog.md +184 -184
- package/{tech_hub_skills/skills → .claude/commands}/process-documentation.md +484 -484
- package/{tech_hub_skills/skills → .claude/commands}/process-kanban.md +324 -324
- package/{tech_hub_skills/skills → .claude/commands}/process-versioning.md +214 -214
- package/{tech_hub_skills/skills → .claude/commands}/product-designer.md +104 -104
- package/{tech_hub_skills/skills → .claude/commands}/project-starter.md +443 -443
- package/{tech_hub_skills/skills → .claude/commands}/qa-engineer.md +109 -109
- package/{tech_hub_skills/skills → .claude/commands}/security-architect.md +135 -135
- package/{tech_hub_skills/skills → .claude/commands}/sre.md +109 -109
- package/{tech_hub_skills/skills → .claude/commands}/system-design.md +126 -126
- package/{tech_hub_skills/skills → .claude/commands}/technical-writer.md +101 -101
- package/.claude/package.json +46 -0
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -0
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -0
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -0
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
- package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -0
- package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -0
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
- package/{tech_hub_skills → .claude}/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/02-data-factory/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/03-synapse-analytics/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/04-databricks/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/05-functions/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/06-kubernetes-service/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/07-openai-service/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/08-machine-learning/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/09-storage-adls/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/10-networking/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/11-sql-cosmos/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/12-event-hubs/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/code-review/skills/01-automated-code-review/README.md +394 -394
- package/{tech_hub_skills → .claude}/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
- package/{tech_hub_skills → .claude}/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
- package/{tech_hub_skills → .claude}/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
- package/{tech_hub_skills → .claude}/roles/code-review/skills/05-review-analytics/README.md +540 -540
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
- package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -0
- package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -0
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/03-data-quality/README.md +579 -579
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/01-data-catalog/README.md +112 -112
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/02-data-lineage/README.md +129 -129
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/04-access-control/README.md +39 -39
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/05-master-data-management/README.md +40 -40
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
- package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -0
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/07-experimentation/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/02-container-orchestration/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/04-gitops/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/05-environment-management/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/06-automated-testing/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/07-release-management/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/09-devsecops/README.md +265 -265
- package/{tech_hub_skills → .claude}/roles/finops/skills/01-cost-visibility/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/02-resource-tagging/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/03-budget-management/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/04-reserved-instances/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/05-spot-optimization/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/06-storage-tiering/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/08-chargeback/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/03-model-training/README.md +704 -704
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
- package/{tech_hub_skills → .claude}/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/03-model-registry/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/04-feature-store/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/05-model-deployment/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/06-model-observability/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/07-data-versioning/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/08-ab-testing/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/09-automated-retraining/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/02-user-research/README.md +382 -382
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/04-ux-design/README.md +496 -496
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/01-pii-detection/README.md +319 -319
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/04-iam/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/05-application-security/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/06-secrets-management/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
- package/{tech_hub_skills → .claude}/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/03-scalability/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/04-high-availability/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/06-api-design/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/07-observability-architecture/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
- package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/README.md +521 -521
- package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -0
- package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -0
- package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -0
- package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -0
- package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -0
- package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -0
- package/.claude/skills/README.md +336 -0
- package/.claude/skills/ai-engineer.md +104 -0
- package/.claude/skills/aws.md +143 -0
- package/.claude/skills/azure.md +149 -0
- package/.claude/skills/backend-developer.md +108 -0
- package/.claude/skills/code-review.md +399 -0
- package/.claude/skills/compliance-automation.md +747 -0
- package/.claude/skills/compliance-officer.md +108 -0
- package/.claude/skills/data-engineer.md +113 -0
- package/.claude/skills/data-governance.md +102 -0
- package/.claude/skills/data-scientist.md +123 -0
- package/.claude/skills/database-admin.md +109 -0
- package/.claude/skills/devops.md +160 -0
- package/.claude/skills/docker.md +160 -0
- package/.claude/skills/enterprise-dashboard.md +613 -0
- package/.claude/skills/finops.md +184 -0
- package/.claude/skills/frontend-developer.md +108 -0
- package/.claude/skills/gcp.md +143 -0
- package/.claude/skills/ml-engineer.md +115 -0
- package/.claude/skills/mlops.md +187 -0
- package/.claude/skills/network-engineer.md +109 -0
- package/.claude/skills/optimization-advisor.md +329 -0
- package/.claude/skills/orchestrator.md +623 -0
- package/.claude/skills/platform-engineer.md +102 -0
- package/.claude/skills/process-automation.md +226 -0
- package/.claude/skills/process-changelog.md +184 -0
- package/.claude/skills/process-documentation.md +484 -0
- package/.claude/skills/process-kanban.md +324 -0
- package/.claude/skills/process-versioning.md +214 -0
- package/.claude/skills/product-designer.md +104 -0
- package/.claude/skills/project-starter.md +443 -0
- package/.claude/skills/qa-engineer.md +109 -0
- package/.claude/skills/security-architect.md +135 -0
- package/.claude/skills/sre.md +109 -0
- package/.claude/skills/system-design.md +126 -0
- package/.claude/skills/technical-writer.md +101 -0
- package/.gitattributes +2 -0
- package/GITHUB_COPILOT.md +106 -0
- package/README.md +192 -291
- package/package.json +16 -46
- package/bin/cli.js +0 -241
|
@@ -0,0 +1,446 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Automated Exploratory Data Analysis (EDA) Generator
|
|
3
|
+
Generate comprehensive EDA reports with minimal code.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import numpy as np
|
|
8
|
+
from typing import Dict, List, Optional, Any, Tuple
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
import matplotlib.pyplot as plt
|
|
12
|
+
import seaborn as sns
|
|
13
|
+
from scipy import stats
|
|
14
|
+
import warnings
|
|
15
|
+
|
|
16
|
+
# NOTE: this silences every warning category for the whole process, not just
# for this module -- anything that imports the module inherits the filter.
warnings.filterwarnings('ignore')

# Plot defaults applied at import time: seaborn grid theme and a wide figure.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class DataProfileSummary:
    """Headline size and quality figures describing a single DataFrame."""

    # --- dimensions ---
    n_rows: int
    n_columns: int

    # --- column counts per dtype family ---
    n_numeric: int
    n_categorical: int
    n_datetime: int

    # --- data-quality figures ---
    missing_cells: int          # number of NaN/None cells in the frame
    missing_percentage: float   # missing cells as a percentage of all cells
    duplicate_rows: int         # rows flagged by DataFrame.duplicated()

    # --- footprint ---
    memory_usage_mb: float      # deep memory usage, in units of 1024**2 bytes
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class EDAGenerator:
    """Automated EDA report generator.

    Provides independent DataFrame analyses (profile, missing data, numeric
    and categorical summaries, outlier detection, correlations, one-line
    insights) and ``generate_report`` which combines them into a plain-text
    report.
    """

    def __init__(self, figsize: Tuple[int, int] = (12, 6)):
        """
        Initialize EDA generator.

        Args:
            figsize: Default figure size for plots
        """
        self.figsize = figsize
        self.report_sections = []

    def generate_profile(self, df: pd.DataFrame) -> "DataProfileSummary":
        """
        Generate data profile summary.

        Args:
            df: Input DataFrame

        Returns:
            DataProfileSummary object whose numeric fields are built-in
            ``int``/``float`` values (not NumPy scalars).
        """
        # Count column types
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        # 'datetime64' alone matches only naive datetimes; 'datetimetz' adds
        # the timezone-aware columns so they are counted as well.
        datetime_cols = df.select_dtypes(include=['datetime64', 'datetimetz']).columns

        # Calculate statistics
        n_rows, n_columns = df.shape
        missing_cells = int(df.isna().sum().sum())
        total_cells = n_rows * n_columns
        # Guard against an empty frame (0 rows or 0 columns).
        missing_pct = (missing_cells / total_cells * 100) if total_cells > 0 else 0.0

        return DataProfileSummary(
            n_rows=n_rows,
            n_columns=n_columns,
            n_numeric=len(numeric_cols),
            n_categorical=len(categorical_cols),
            n_datetime=len(datetime_cols),
            missing_cells=missing_cells,
            missing_percentage=round(float(missing_pct), 2),
            duplicate_rows=int(df.duplicated().sum()),
            memory_usage_mb=round(float(df.memory_usage(deep=True).sum()) / 1024**2, 2),
        )

    def analyze_missing_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Analyze missing data patterns.

        Args:
            df: Input DataFrame

        Returns:
            DataFrame with one row per column that has at least one missing
            value (columns: Column, Missing_Count, Missing_Percentage,
            Data_Type), sorted by Missing_Percentage descending.
        """
        # Count once and reuse for both the absolute and the relative figure.
        missing_counts = df.isna().sum()

        missing_stats = pd.DataFrame({
            'Column': df.columns,
            'Missing_Count': missing_counts.values,
            'Missing_Percentage': (missing_counts / len(df) * 100).values,
            'Data_Type': df.dtypes.values,
        })

        has_missing = missing_stats['Missing_Count'] > 0
        return missing_stats[has_missing].sort_values(
            'Missing_Percentage', ascending=False
        )

    def analyze_numeric_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Comprehensive analysis of numeric columns.

        Args:
            df: Input DataFrame

        Returns:
            DataFrame with one row of descriptive statistics per numeric
            column (values rounded to 3 decimals). Columns that contain only
            missing values are skipped; the result is an empty DataFrame when
            there is nothing to describe.
        """
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) == 0:
            return pd.DataFrame()

        stats_list = []

        for col in numeric_cols:
            col_data = df[col].dropna()

            if len(col_data) == 0:
                continue

            # Compute shared intermediates once per column.
            q25 = col_data.quantile(0.25)
            q75 = col_data.quantile(0.75)
            n_zeros = (col_data == 0).sum()

            stats_list.append({
                'Column': col,
                'Count': len(col_data),
                'Mean': col_data.mean(),
                'Median': col_data.median(),
                'Std': col_data.std(),
                'Min': col_data.min(),
                'Max': col_data.max(),
                'Q25': q25,
                'Q75': q75,
                'IQR': q75 - q25,
                'Skewness': col_data.skew(),
                'Kurtosis': col_data.kurtosis(),
                'Zeros': n_zeros,
                'Zeros_Pct': n_zeros / len(col_data) * 100,
            })

        return pd.DataFrame(stats_list).round(3)

    def analyze_categorical_columns(self, df: pd.DataFrame, max_categories: int = 20) -> Dict[str, pd.DataFrame]:
        """
        Analyze categorical columns.

        Args:
            df: Input DataFrame
            max_categories: Columns with more distinct values than this are
                left out of the result

        Returns:
            Mapping of column name to a frequency table (Value, Count,
            Percentage). Percentage is relative to the total number of rows,
            including rows where the column is missing.
        """
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns

        results = {}

        for col in categorical_cols:
            value_counts = df[col].value_counts()

            if len(value_counts) > max_categories:
                continue

            results[col] = pd.DataFrame({
                'Value': value_counts.index,
                'Count': value_counts.values,
                'Percentage': (value_counts / len(df) * 100).values,
            }).round(2)

        return results

    def detect_outliers(self, df: pd.DataFrame, method: str = 'iqr', threshold: float = 1.5) -> Dict[str, Dict[str, Any]]:
        """
        Detect outliers in numeric columns.

        Args:
            df: Input DataFrame
            method: Detection method ('iqr' or 'zscore')
            threshold: Threshold value (1.5 for IQR, 3 for z-score). The
                default of 1.5 is used for both methods, so pass an explicit
                value when selecting 'zscore'.

        Returns:
            Dictionary with outlier information per column. Only columns with
            at least one outlier are present. ``count`` is a built-in ``int``;
            ``lower_bound``/``upper_bound`` are ``None`` for 'zscore'. An
            unrecognised ``method`` yields an empty dictionary.
        """
        outliers = {}

        if method not in ('iqr', 'zscore'):
            # Same outcome as skipping every column one by one.
            return outliers

        numeric_cols = df.select_dtypes(include=[np.number]).columns

        for col in numeric_cols:
            col_data = df[col].dropna()

            if len(col_data) == 0:
                continue

            lower_bound = None
            upper_bound = None

            if method == 'iqr':
                q1 = col_data.quantile(0.25)
                q3 = col_data.quantile(0.75)
                iqr = q3 - q1
                lower_bound = q1 - threshold * iqr
                upper_bound = q3 + threshold * iqr
                outlier_mask = (col_data < lower_bound) | (col_data > upper_bound)
            else:
                # A constant column gives NaN z-scores, which compare False
                # against the threshold and therefore report no outliers.
                z_scores = np.abs(stats.zscore(col_data))
                outlier_mask = z_scores > threshold

            # Cast to a plain int so the result is JSON-serialisable.
            outlier_count = int(outlier_mask.sum())

            if outlier_count > 0:
                outliers[col] = {
                    'count': outlier_count,
                    'percentage': round(outlier_count / len(col_data) * 100, 2),
                    'method': method,
                    'threshold': threshold,
                    'lower_bound': lower_bound,
                    'upper_bound': upper_bound,
                }

        return outliers

    def calculate_correlations(self, df: pd.DataFrame, method: str = 'pearson', threshold: float = 0.7) -> pd.DataFrame:
        """
        Calculate correlations between numeric columns.

        Args:
            df: Input DataFrame
            method: Correlation method ('pearson', 'spearman', 'kendall')
            threshold: Only show correlations whose absolute value is at
                least this threshold

        Returns:
            DataFrame with high correlations (Variable_1, Variable_2,
            Correlation, Abs_Correlation), strongest first; an empty
            DataFrame when fewer than two numeric columns exist or no pair
            qualifies.
        """
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) < 2:
            return pd.DataFrame()

        corr_matrix = df[numeric_cols].corr(method=method)
        names = corr_matrix.columns

        # Walk the upper triangle only, so each pair is reported once and the
        # diagonal is skipped. NaN correlations fail the comparison and are
        # dropped.
        high_corr = []
        for i, first in enumerate(names):
            for j in range(i + 1, len(names)):
                corr_value = corr_matrix.iloc[i, j]
                if abs(corr_value) >= threshold:
                    high_corr.append({
                        'Variable_1': first,
                        'Variable_2': names[j],
                        'Correlation': round(corr_value, 3),
                        'Abs_Correlation': round(abs(corr_value), 3),
                    })

        if not high_corr:
            return pd.DataFrame()

        return pd.DataFrame(high_corr).sort_values('Abs_Correlation', ascending=False)

    def generate_insights(self, df: pd.DataFrame) -> List[str]:
        """
        Generate automated insights from the data.

        Args:
            df: Input DataFrame

        Returns:
            List of one-line, human-readable findings (size, missing data,
            duplicates, skew, zero-heavy columns, outliers, correlations,
            high-cardinality categoricals).
        """
        insights = []

        # Data size insight
        profile = self.generate_profile(df)
        insights.append(f"📊 Dataset contains {profile.n_rows:,} rows and {profile.n_columns} columns")

        # Missing data insights
        if profile.missing_percentage > 10:
            insights.append(f"⚠️ High missing data: {profile.missing_percentage:.1f}% of cells are missing")
        elif profile.missing_percentage > 0:
            insights.append(f"ℹ️ Missing data: {profile.missing_percentage:.1f}% of cells are missing")

        # Duplicate insights (n_rows is necessarily > 0 when duplicates exist)
        if profile.duplicate_rows > 0:
            dup_pct = profile.duplicate_rows / profile.n_rows * 100
            insights.append(f"🔄 Found {profile.duplicate_rows:,} duplicate rows ({dup_pct:.1f}%)")

        # Numeric columns insights
        numeric_stats = self.analyze_numeric_columns(df)
        if not numeric_stats.empty:
            # Check for skewed distributions
            highly_skewed = numeric_stats[numeric_stats['Skewness'].abs() > 2]
            if not highly_skewed.empty:
                insights.append(f"📈 {len(highly_skewed)} highly skewed numeric columns detected")

            # Check for columns with many zeros
            zero_heavy = numeric_stats[numeric_stats['Zeros_Pct'] > 50]
            if not zero_heavy.empty:
                insights.append(f"0️⃣ {len(zero_heavy)} columns have >50% zeros")

        # Outlier insights
        outliers = self.detect_outliers(df)
        if outliers:
            total_outliers = sum(info['count'] for info in outliers.values())
            insights.append(f"🎯 Detected {total_outliers:,} outliers across {len(outliers)} columns")

        # Correlation insights
        high_corr = self.calculate_correlations(df, threshold=0.8)
        if not high_corr.empty:
            insights.append(f"🔗 Found {len(high_corr)} high correlations (>0.8)")

        # Categorical insights
        cat_cols = df.select_dtypes(include=['object', 'category']).columns
        high_cardinality = [col for col in cat_cols if df[col].nunique() > 50]
        if high_cardinality:
            insights.append(f"🏷️ {len(high_cardinality)} categorical columns with high cardinality (>50 unique values)")

        return insights

    def generate_report(
        self,
        df: pd.DataFrame,
        title: str = "Exploratory Data Analysis Report",
        output_file: Optional[str] = None
    ) -> str:
        """
        Generate comprehensive EDA report.

        Args:
            df: Input DataFrame
            title: Report title
            output_file: Optional file path to save report. Missing parent
                directories are created and the file is written as UTF-8.

        Returns:
            Report as string
        """
        rule = "-" * 80
        banner = "=" * 80
        report = []

        # Header
        report.append(banner)
        report.append(title.center(80))
        report.append(banner)
        report.append("")

        # 1. Data Profile
        report.append("## 1. DATA PROFILE")
        report.append(rule)
        profile = self.generate_profile(df)
        for key, value in vars(profile).items():
            report.append(f"  {key.replace('_', ' ').title()}: {value}")
        report.append("")

        # 2. Automated Insights
        report.append("## 2. KEY INSIGHTS")
        report.append(rule)
        for insight in self.generate_insights(df):
            report.append(f"  {insight}")
        report.append("")

        # 3. Missing Data Analysis
        missing = self.analyze_missing_data(df)
        if not missing.empty:
            report.append("## 3. MISSING DATA")
            report.append(rule)
            report.append(missing.to_string(index=False))
            report.append("")

        # 4. Numeric Columns
        numeric_stats = self.analyze_numeric_columns(df)
        if not numeric_stats.empty:
            report.append("## 4. NUMERIC COLUMNS STATISTICS")
            report.append(rule)
            report.append(numeric_stats.to_string(index=False))
            report.append("")

        # 5. Outliers
        outliers = self.detect_outliers(df)
        if outliers:
            report.append("## 5. OUTLIER DETECTION")
            report.append(rule)
            # Loop variable deliberately not called `stats`, to avoid
            # shadowing the scipy.stats module name used elsewhere.
            for col, info in outliers.items():
                report.append(f"  {col}: {info['count']} outliers ({info['percentage']:.2f}%)")
            report.append("")

        # 6. Correlations
        high_corr = self.calculate_correlations(df, threshold=0.7)
        if not high_corr.empty:
            report.append("## 6. HIGH CORRELATIONS (>0.7)")
            report.append(rule)
            report.append(high_corr.to_string(index=False))
            report.append("")

        # 7. Categorical Analysis
        cat_analysis = self.analyze_categorical_columns(df)
        if cat_analysis:
            report.append("## 7. CATEGORICAL COLUMNS")
            report.append(rule)
            for col, freq_table in cat_analysis.items():
                report.append(f"\n  {col}:")
                report.append(freq_table.to_string(index=False, max_rows=10))
            report.append("")

        report_text = "\n".join(report)

        # Save to file if specified
        if output_file:
            target = Path(output_file)
            target.parent.mkdir(parents=True, exist_ok=True)
            # The report contains emoji; without an explicit encoding the
            # locale default (e.g. cp1252) can raise UnicodeEncodeError.
            with open(target, 'w', encoding='utf-8') as f:
                f.write(report_text)
            print(f"✅ Report saved to {output_file}")

        return report_text
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
# Example usage
|
|
391
|
+
if __name__ == "__main__":
    # Demo: build a synthetic marketing-campaign dataset, run the full report
    # and print a couple of the individual analyses.
    np.random.seed(42)  # fixed seed so the demo output is reproducible

    n_samples = 1000

    sample_data = pd.DataFrame({
        'campaign_id': [f'C{i:04d}' for i in range(n_samples)],
        'campaign_type': np.random.choice(['Email', 'Social', 'Search', 'Display'], n_samples),
        'budget': np.random.exponential(5000, n_samples),
        'impressions': np.random.poisson(10000, n_samples),
        'clicks': np.random.poisson(250, n_samples),
        'conversions': np.random.poisson(15, n_samples),
        'revenue': np.random.exponential(2000, n_samples),
        'industry': np.random.choice(['Tech', 'Finance', 'Healthcare', 'Retail', 'Manufacturing'], n_samples),
        'region': np.random.choice(['North', 'South', 'East', 'West'], n_samples),
        # Lower-case 'h' is the hourly alias; upper-case 'H' is deprecated
        # in pandas >= 2.2.
        'start_date': pd.date_range('2025-01-01', periods=n_samples, freq='h'),
    })

    # Add some missing values
    sample_data.loc[np.random.choice(n_samples, 50, replace=False), 'conversions'] = np.nan
    sample_data.loc[np.random.choice(n_samples, 30, replace=False), 'revenue'] = np.nan

    # Calculate derived metrics
    sample_data['ctr'] = sample_data['clicks'] / sample_data['impressions'] * 100
    sample_data['conversion_rate'] = sample_data['conversions'] / sample_data['clicks'] * 100
    sample_data['roas'] = sample_data['revenue'] / sample_data['budget']
    sample_data['cpc'] = sample_data['budget'] / sample_data['clicks']

    print("=" * 80)
    print("AUTOMATED EDA DEMO")
    print("=" * 80)

    # Generate EDA (also writes the report to the current working directory)
    eda = EDAGenerator()

    report = eda.generate_report(
        df=sample_data,
        title="Marketing Campaign Performance Analysis",
        output_file="campaign_eda_report.txt"
    )

    print(report)

    print("\n" + "=" * 80)
    print("DETAILED STATISTICS")
    print("=" * 80)

    # Show numeric statistics
    print("\n📊 Numeric Columns:")
    print(eda.analyze_numeric_columns(sample_data))

    # Show correlations
    print("\n🔗 Correlations:")
    corr = eda.calculate_correlations(sample_data, threshold=0.5)
    print(corr if not corr.empty else "No high correlations found")
|