tech-hub-skills 1.2.0 → 1.5.1

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (198)
  1. package/.claude/README.md +291 -0
  2. package/.claude/bin/cli.js +266 -0
  3. package/.claude/package.json +46 -0
  4. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -0
  5. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -0
  6. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -0
  7. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -0
  8. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -0
  9. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -0
  10. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -0
  11. package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -0
  12. package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -0
  13. package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -0
  14. package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -0
  15. package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -0
  16. package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -0
  17. package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -0
  18. package/.claude/skills/README.md +336 -0
  19. package/.claude/skills/ai-engineer.md +104 -0
  20. package/.claude/skills/aws.md +143 -0
  21. package/.claude/skills/azure.md +149 -0
  22. package/.claude/skills/backend-developer.md +108 -0
  23. package/.claude/skills/code-review.md +399 -0
  24. package/.claude/skills/compliance-automation.md +747 -0
  25. package/.claude/skills/compliance-officer.md +108 -0
  26. package/.claude/skills/data-engineer.md +113 -0
  27. package/.claude/skills/data-governance.md +102 -0
  28. package/.claude/skills/data-scientist.md +123 -0
  29. package/.claude/skills/database-admin.md +109 -0
  30. package/.claude/skills/devops.md +160 -0
  31. package/.claude/skills/docker.md +160 -0
  32. package/.claude/skills/enterprise-dashboard.md +613 -0
  33. package/.claude/skills/finops.md +184 -0
  34. package/.claude/skills/frontend-developer.md +108 -0
  35. package/.claude/skills/gcp.md +143 -0
  36. package/.claude/skills/ml-engineer.md +115 -0
  37. package/.claude/skills/mlops.md +187 -0
  38. package/.claude/skills/network-engineer.md +109 -0
  39. package/.claude/skills/optimization-advisor.md +329 -0
  40. package/.claude/skills/orchestrator.md +623 -0
  41. package/.claude/skills/platform-engineer.md +102 -0
  42. package/.claude/skills/process-automation.md +226 -0
  43. package/.claude/skills/process-changelog.md +184 -0
  44. package/.claude/skills/process-documentation.md +484 -0
  45. package/.claude/skills/process-kanban.md +324 -0
  46. package/.claude/skills/process-versioning.md +214 -0
  47. package/.claude/skills/product-designer.md +104 -0
  48. package/.claude/skills/project-starter.md +443 -0
  49. package/.claude/skills/qa-engineer.md +109 -0
  50. package/.claude/skills/security-architect.md +135 -0
  51. package/.claude/skills/sre.md +109 -0
  52. package/.claude/skills/system-design.md +126 -0
  53. package/.claude/skills/technical-writer.md +101 -0
  54. package/.gitattributes +2 -0
  55. package/GITHUB_COPILOT.md +106 -0
  56. package/README.md +117 -224
  57. package/package.json +4 -42
  58. package/bin/cli.js +0 -241
  59. /package/{LICENSE → .claude/LICENSE} +0 -0
  60. /package/{bin → .claude/bin}/copilot.js +0 -0
  61. /package/{bin → .claude/bin}/postinstall.js +0 -0
  62. /package/{tech_hub_skills/skills → .claude/commands}/README.md +0 -0
  63. /package/{tech_hub_skills/skills → .claude/commands}/ai-engineer.md +0 -0
  64. /package/{tech_hub_skills/skills → .claude/commands}/aws.md +0 -0
  65. /package/{tech_hub_skills/skills → .claude/commands}/azure.md +0 -0
  66. /package/{tech_hub_skills/skills → .claude/commands}/backend-developer.md +0 -0
  67. /package/{tech_hub_skills/skills → .claude/commands}/code-review.md +0 -0
  68. /package/{tech_hub_skills/skills → .claude/commands}/compliance-automation.md +0 -0
  69. /package/{tech_hub_skills/skills → .claude/commands}/compliance-officer.md +0 -0
  70. /package/{tech_hub_skills/skills → .claude/commands}/data-engineer.md +0 -0
  71. /package/{tech_hub_skills/skills → .claude/commands}/data-governance.md +0 -0
  72. /package/{tech_hub_skills/skills → .claude/commands}/data-scientist.md +0 -0
  73. /package/{tech_hub_skills/skills → .claude/commands}/database-admin.md +0 -0
  74. /package/{tech_hub_skills/skills → .claude/commands}/devops.md +0 -0
  75. /package/{tech_hub_skills/skills → .claude/commands}/docker.md +0 -0
  76. /package/{tech_hub_skills/skills → .claude/commands}/enterprise-dashboard.md +0 -0
  77. /package/{tech_hub_skills/skills → .claude/commands}/finops.md +0 -0
  78. /package/{tech_hub_skills/skills → .claude/commands}/frontend-developer.md +0 -0
  79. /package/{tech_hub_skills/skills → .claude/commands}/gcp.md +0 -0
  80. /package/{tech_hub_skills/skills → .claude/commands}/ml-engineer.md +0 -0
  81. /package/{tech_hub_skills/skills → .claude/commands}/mlops.md +0 -0
  82. /package/{tech_hub_skills/skills → .claude/commands}/network-engineer.md +0 -0
  83. /package/{tech_hub_skills/skills → .claude/commands}/optimization-advisor.md +0 -0
  84. /package/{tech_hub_skills/skills → .claude/commands}/orchestrator.md +0 -0
  85. /package/{tech_hub_skills/skills → .claude/commands}/platform-engineer.md +0 -0
  86. /package/{tech_hub_skills/skills → .claude/commands}/process-automation.md +0 -0
  87. /package/{tech_hub_skills/skills → .claude/commands}/process-changelog.md +0 -0
  88. /package/{tech_hub_skills/skills → .claude/commands}/process-documentation.md +0 -0
  89. /package/{tech_hub_skills/skills → .claude/commands}/process-kanban.md +0 -0
  90. /package/{tech_hub_skills/skills → .claude/commands}/process-versioning.md +0 -0
  91. /package/{tech_hub_skills/skills → .claude/commands}/product-designer.md +0 -0
  92. /package/{tech_hub_skills/skills → .claude/commands}/project-starter.md +0 -0
  93. /package/{tech_hub_skills/skills → .claude/commands}/qa-engineer.md +0 -0
  94. /package/{tech_hub_skills/skills → .claude/commands}/security-architect.md +0 -0
  95. /package/{tech_hub_skills/skills → .claude/commands}/sre.md +0 -0
  96. /package/{tech_hub_skills/skills → .claude/commands}/system-design.md +0 -0
  97. /package/{tech_hub_skills/skills → .claude/commands}/technical-writer.md +0 -0
  98. /package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/01-prompt-engineering/README.md +0 -0
  99. /package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/02-rag-pipeline/README.md +0 -0
  100. /package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/03-agent-orchestration/README.md +0 -0
  101. /package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/04-llm-guardrails/README.md +0 -0
  102. /package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/05-vector-embeddings/README.md +0 -0
  103. /package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/06-llm-evaluation/README.md +0 -0
  104. /package/{tech_hub_skills → .claude}/roles/azure/skills/01-infrastructure-fundamentals/README.md +0 -0
  105. /package/{tech_hub_skills → .claude}/roles/azure/skills/02-data-factory/README.md +0 -0
  106. /package/{tech_hub_skills → .claude}/roles/azure/skills/03-synapse-analytics/README.md +0 -0
  107. /package/{tech_hub_skills → .claude}/roles/azure/skills/04-databricks/README.md +0 -0
  108. /package/{tech_hub_skills → .claude}/roles/azure/skills/05-functions/README.md +0 -0
  109. /package/{tech_hub_skills → .claude}/roles/azure/skills/06-kubernetes-service/README.md +0 -0
  110. /package/{tech_hub_skills → .claude}/roles/azure/skills/07-openai-service/README.md +0 -0
  111. /package/{tech_hub_skills → .claude}/roles/azure/skills/08-machine-learning/README.md +0 -0
  112. /package/{tech_hub_skills → .claude}/roles/azure/skills/09-storage-adls/README.md +0 -0
  113. /package/{tech_hub_skills → .claude}/roles/azure/skills/10-networking/README.md +0 -0
  114. /package/{tech_hub_skills → .claude}/roles/azure/skills/11-sql-cosmos/README.md +0 -0
  115. /package/{tech_hub_skills → .claude}/roles/azure/skills/12-event-hubs/README.md +0 -0
  116. /package/{tech_hub_skills → .claude}/roles/code-review/skills/01-automated-code-review/README.md +0 -0
  117. /package/{tech_hub_skills → .claude}/roles/code-review/skills/02-pr-review-workflow/README.md +0 -0
  118. /package/{tech_hub_skills → .claude}/roles/code-review/skills/03-code-quality-gates/README.md +0 -0
  119. /package/{tech_hub_skills → .claude}/roles/code-review/skills/04-reviewer-assignment/README.md +0 -0
  120. /package/{tech_hub_skills → .claude}/roles/code-review/skills/05-review-analytics/README.md +0 -0
  121. /package/{tech_hub_skills → .claude}/roles/data-engineer/skills/01-lakehouse-architecture/README.md +0 -0
  122. /package/{tech_hub_skills → .claude}/roles/data-engineer/skills/02-etl-pipeline/README.md +0 -0
  123. /package/{tech_hub_skills → .claude}/roles/data-engineer/skills/03-data-quality/README.md +0 -0
  124. /package/{tech_hub_skills → .claude}/roles/data-engineer/skills/04-streaming-pipelines/README.md +0 -0
  125. /package/{tech_hub_skills → .claude}/roles/data-engineer/skills/05-performance-optimization/README.md +0 -0
  126. /package/{tech_hub_skills → .claude}/roles/data-governance/skills/01-data-catalog/README.md +0 -0
  127. /package/{tech_hub_skills → .claude}/roles/data-governance/skills/02-data-lineage/README.md +0 -0
  128. /package/{tech_hub_skills → .claude}/roles/data-governance/skills/03-data-quality-framework/README.md +0 -0
  129. /package/{tech_hub_skills → .claude}/roles/data-governance/skills/04-access-control/README.md +0 -0
  130. /package/{tech_hub_skills → .claude}/roles/data-governance/skills/05-master-data-management/README.md +0 -0
  131. /package/{tech_hub_skills → .claude}/roles/data-governance/skills/06-compliance-privacy/README.md +0 -0
  132. /package/{tech_hub_skills → .claude}/roles/data-scientist/skills/01-eda-automation/README.md +0 -0
  133. /package/{tech_hub_skills → .claude}/roles/data-scientist/skills/02-statistical-modeling/README.md +0 -0
  134. /package/{tech_hub_skills → .claude}/roles/data-scientist/skills/03-feature-engineering/README.md +0 -0
  135. /package/{tech_hub_skills → .claude}/roles/data-scientist/skills/04-predictive-modeling/README.md +0 -0
  136. /package/{tech_hub_skills → .claude}/roles/data-scientist/skills/05-customer-analytics/README.md +0 -0
  137. /package/{tech_hub_skills → .claude}/roles/data-scientist/skills/06-campaign-analysis/README.md +0 -0
  138. /package/{tech_hub_skills → .claude}/roles/data-scientist/skills/07-experimentation/README.md +0 -0
  139. /package/{tech_hub_skills → .claude}/roles/data-scientist/skills/08-data-visualization/README.md +0 -0
  140. /package/{tech_hub_skills → .claude}/roles/devops/skills/01-cicd-pipeline/README.md +0 -0
  141. /package/{tech_hub_skills → .claude}/roles/devops/skills/02-container-orchestration/README.md +0 -0
  142. /package/{tech_hub_skills → .claude}/roles/devops/skills/03-infrastructure-as-code/README.md +0 -0
  143. /package/{tech_hub_skills → .claude}/roles/devops/skills/04-gitops/README.md +0 -0
  144. /package/{tech_hub_skills → .claude}/roles/devops/skills/05-environment-management/README.md +0 -0
  145. /package/{tech_hub_skills → .claude}/roles/devops/skills/06-automated-testing/README.md +0 -0
  146. /package/{tech_hub_skills → .claude}/roles/devops/skills/07-release-management/README.md +0 -0
  147. /package/{tech_hub_skills → .claude}/roles/devops/skills/08-monitoring-alerting/README.md +0 -0
  148. /package/{tech_hub_skills → .claude}/roles/devops/skills/09-devsecops/README.md +0 -0
  149. /package/{tech_hub_skills → .claude}/roles/finops/skills/01-cost-visibility/README.md +0 -0
  150. /package/{tech_hub_skills → .claude}/roles/finops/skills/02-resource-tagging/README.md +0 -0
  151. /package/{tech_hub_skills → .claude}/roles/finops/skills/03-budget-management/README.md +0 -0
  152. /package/{tech_hub_skills → .claude}/roles/finops/skills/04-reserved-instances/README.md +0 -0
  153. /package/{tech_hub_skills → .claude}/roles/finops/skills/05-spot-optimization/README.md +0 -0
  154. /package/{tech_hub_skills → .claude}/roles/finops/skills/06-storage-tiering/README.md +0 -0
  155. /package/{tech_hub_skills → .claude}/roles/finops/skills/07-compute-rightsizing/README.md +0 -0
  156. /package/{tech_hub_skills → .claude}/roles/finops/skills/08-chargeback/README.md +0 -0
  157. /package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/01-mlops-pipeline/README.md +0 -0
  158. /package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/02-feature-engineering/README.md +0 -0
  159. /package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/03-model-training/README.md +0 -0
  160. /package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/04-model-serving/README.md +0 -0
  161. /package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/05-model-monitoring/README.md +0 -0
  162. /package/{tech_hub_skills → .claude}/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +0 -0
  163. /package/{tech_hub_skills → .claude}/roles/mlops/skills/02-experiment-tracking/README.md +0 -0
  164. /package/{tech_hub_skills → .claude}/roles/mlops/skills/03-model-registry/README.md +0 -0
  165. /package/{tech_hub_skills → .claude}/roles/mlops/skills/04-feature-store/README.md +0 -0
  166. /package/{tech_hub_skills → .claude}/roles/mlops/skills/05-model-deployment/README.md +0 -0
  167. /package/{tech_hub_skills → .claude}/roles/mlops/skills/06-model-observability/README.md +0 -0
  168. /package/{tech_hub_skills → .claude}/roles/mlops/skills/07-data-versioning/README.md +0 -0
  169. /package/{tech_hub_skills → .claude}/roles/mlops/skills/08-ab-testing/README.md +0 -0
  170. /package/{tech_hub_skills → .claude}/roles/mlops/skills/09-automated-retraining/README.md +0 -0
  171. /package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/01-internal-developer-platform/README.md +0 -0
  172. /package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +0 -0
  173. /package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/03-slo-sli-management/README.md +0 -0
  174. /package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/04-developer-experience/README.md +0 -0
  175. /package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/05-incident-management/README.md +0 -0
  176. /package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/06-capacity-management/README.md +0 -0
  177. /package/{tech_hub_skills → .claude}/roles/product-designer/skills/01-requirements-discovery/README.md +0 -0
  178. /package/{tech_hub_skills → .claude}/roles/product-designer/skills/02-user-research/README.md +0 -0
  179. /package/{tech_hub_skills → .claude}/roles/product-designer/skills/03-brainstorming-ideation/README.md +0 -0
  180. /package/{tech_hub_skills → .claude}/roles/product-designer/skills/04-ux-design/README.md +0 -0
  181. /package/{tech_hub_skills → .claude}/roles/product-designer/skills/05-product-market-fit/README.md +0 -0
  182. /package/{tech_hub_skills → .claude}/roles/product-designer/skills/06-stakeholder-management/README.md +0 -0
  183. /package/{tech_hub_skills → .claude}/roles/security-architect/skills/01-pii-detection/README.md +0 -0
  184. /package/{tech_hub_skills → .claude}/roles/security-architect/skills/02-threat-modeling/README.md +0 -0
  185. /package/{tech_hub_skills → .claude}/roles/security-architect/skills/03-infrastructure-security/README.md +0 -0
  186. /package/{tech_hub_skills → .claude}/roles/security-architect/skills/04-iam/README.md +0 -0
  187. /package/{tech_hub_skills → .claude}/roles/security-architect/skills/05-application-security/README.md +0 -0
  188. /package/{tech_hub_skills → .claude}/roles/security-architect/skills/06-secrets-management/README.md +0 -0
  189. /package/{tech_hub_skills → .claude}/roles/security-architect/skills/07-security-monitoring/README.md +0 -0
  190. /package/{tech_hub_skills → .claude}/roles/system-design/skills/01-architecture-patterns/README.md +0 -0
  191. /package/{tech_hub_skills → .claude}/roles/system-design/skills/02-requirements-engineering/README.md +0 -0
  192. /package/{tech_hub_skills → .claude}/roles/system-design/skills/03-scalability/README.md +0 -0
  193. /package/{tech_hub_skills → .claude}/roles/system-design/skills/04-high-availability/README.md +0 -0
  194. /package/{tech_hub_skills → .claude}/roles/system-design/skills/05-cost-optimization-design/README.md +0 -0
  195. /package/{tech_hub_skills → .claude}/roles/system-design/skills/06-api-design/README.md +0 -0
  196. /package/{tech_hub_skills → .claude}/roles/system-design/skills/07-observability-architecture/README.md +0 -0
  197. /package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +0 -0
  198. /package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/README.md +0 -0
@@ -0,0 +1,446 @@
1
+ """
2
+ Automated Exploratory Data Analysis (EDA) Generator
3
+ Generate comprehensive EDA reports with minimal code.
4
+ """
5
+
6
+ import pandas as pd
7
+ import numpy as np
8
+ from typing import Dict, List, Optional, Any, Tuple
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ import matplotlib.pyplot as plt
12
+ import seaborn as sns
13
+ from scipy import stats
14
+ import warnings
15
+
16
# Install a blanket "ignore" filter: from import time onward every warning
# category is silenced for the whole process, not just for this module.
warnings.simplefilter('ignore')

# Plot defaults shared by anything drawn after this module is imported:
# seaborn's "whitegrid" style and a (12, 6) default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})
21
+
22
+
23
@dataclass
class DataProfileSummary:
    """Summary statistics for a dataset, as built by ``EDAGenerator.generate_profile``."""

    n_rows: int  # number of rows in the DataFrame
    n_columns: int  # number of columns in the DataFrame
    n_numeric: int  # columns selected by the np.number dtype
    n_categorical: int  # columns with 'object' or 'category' dtype
    n_datetime: int  # datetime columns, timezone-naive and timezone-aware
    missing_cells: int  # total number of missing (NaN/None/NaT) cells
    missing_percentage: float  # missing cells as a percentage of all cells, 2 decimals
    duplicate_rows: int  # rows flagged by DataFrame.duplicated()
    memory_usage_mb: float  # deep memory usage in MiB, 2 decimals


class EDAGenerator:
    """Automated EDA report generator.

    Each ``analyze_*`` / ``detect_*`` / ``calculate_*`` method takes a
    DataFrame and returns plain pandas or Python structures;
    ``generate_report`` combines them into a single text report.
    """

    # Detection strategies accepted by ``detect_outliers``.
    _OUTLIER_METHODS = ('iqr', 'zscore')

    def __init__(self, figsize: Tuple[int, int] = (12, 6)):
        """
        Initialize EDA generator.

        Args:
            figsize: Default figure size for plots
        """
        self.figsize = figsize
        self.report_sections = []

    def generate_profile(self, df: pd.DataFrame) -> DataProfileSummary:
        """
        Generate data profile summary.

        Args:
            df: Input DataFrame

        Returns:
            DataProfileSummary object whose fields are built-in ``int`` /
            ``float`` values (not numpy scalars).
        """
        # Count column types
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        # 'datetime64' alone skips timezone-aware columns; 'datetimetz'
        # selects those as well.
        datetime_cols = df.select_dtypes(include=['datetime64', 'datetimetz']).columns

        # Calculate statistics
        missing_cells = int(df.isna().sum().sum())
        total_cells = df.shape[0] * df.shape[1]
        # Guard the division so an empty frame reports 0% missing.
        missing_pct = (missing_cells / total_cells * 100) if total_cells > 0 else 0.0

        return DataProfileSummary(
            n_rows=len(df),
            n_columns=len(df.columns),
            n_numeric=len(numeric_cols),
            n_categorical=len(categorical_cols),
            n_datetime=len(datetime_cols),
            missing_cells=missing_cells,
            missing_percentage=round(float(missing_pct), 2),
            duplicate_rows=int(df.duplicated().sum()),
            memory_usage_mb=round(float(df.memory_usage(deep=True).sum()) / 1024**2, 2),
        )

    def analyze_missing_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Analyze missing data patterns.

        Args:
            df: Input DataFrame

        Returns:
            One row per column that has at least one missing value, with
            columns ``Column``, ``Missing_Count``, ``Missing_Percentage`` and
            ``Data_Type``, sorted by ``Missing_Percentage`` descending.
        """
        missing_counts = df.isna().sum()

        missing_stats = pd.DataFrame({
            'Column': df.columns,
            'Missing_Count': missing_counts.values,
            'Missing_Percentage': (missing_counts / len(df) * 100).values,
            'Data_Type': df.dtypes.values,
        })

        has_missing = missing_stats['Missing_Count'] > 0
        return missing_stats[has_missing].sort_values(
            'Missing_Percentage', ascending=False
        )

    def analyze_numeric_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Comprehensive analysis of numeric columns.

        Args:
            df: Input DataFrame

        Returns:
            One row per numeric column that has at least one non-missing
            value, rounded to 3 decimals. An empty DataFrame when there are
            no numeric columns.
        """
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) == 0:
            return pd.DataFrame()

        stats_list = []

        for col in numeric_cols:
            col_data = df[col].dropna()

            # A column that is entirely missing has nothing to summarise.
            if len(col_data) == 0:
                continue

            # Computed once and reused for Q25/Q75/IQR and Zeros/Zeros_Pct.
            q25 = col_data.quantile(0.25)
            q75 = col_data.quantile(0.75)
            zero_count = (col_data == 0).sum()

            stats_list.append({
                'Column': col,
                'Count': len(col_data),
                'Mean': col_data.mean(),
                'Median': col_data.median(),
                'Std': col_data.std(),
                'Min': col_data.min(),
                'Max': col_data.max(),
                'Q25': q25,
                'Q75': q75,
                'IQR': q75 - q25,
                'Skewness': col_data.skew(),
                'Kurtosis': col_data.kurtosis(),
                'Zeros': zero_count,
                'Zeros_Pct': zero_count / len(col_data) * 100,
            })

        return pd.DataFrame(stats_list).round(3)

    def analyze_categorical_columns(self, df: pd.DataFrame, max_categories: int = 20) -> Dict[str, pd.DataFrame]:
        """
        Analyze categorical columns.

        Args:
            df: Input DataFrame
            max_categories: Columns with more distinct values than this are
                left out of the result

        Returns:
            Mapping of column name to a frequency table with columns
            ``Value``, ``Count`` and ``Percentage``. Percentages are relative
            to the total row count, so rows with a missing value count in
            the denominator.
        """
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns

        results = {}

        for col in categorical_cols:
            value_counts = df[col].value_counts()

            if len(value_counts) > max_categories:
                continue

            results[col] = pd.DataFrame({
                'Value': value_counts.index,
                'Count': value_counts.values,
                'Percentage': (value_counts / len(df) * 100).values,
            }).round(2)

        return results

    def detect_outliers(self, df: pd.DataFrame, method: str = 'iqr', threshold: float = 1.5) -> Dict[str, Dict[str, Any]]:
        """
        Detect outliers in numeric columns.

        Args:
            df: Input DataFrame
            method: Detection method ('iqr' or 'zscore')
            threshold: Threshold value (1.5 for IQR, 3 for z-score)

        Returns:
            Dictionary with outlier information per column. Only columns
            with at least one outlier are present. ``lower_bound`` and
            ``upper_bound`` are ``None`` for the z-score method.

        Raises:
            ValueError: If ``method`` is not 'iqr' or 'zscore'.
        """
        # Reject unknown methods up front; otherwise a typo would look
        # exactly like "no outliers found".
        if method not in self._OUTLIER_METHODS:
            raise ValueError(
                f"Unknown outlier detection method {method!r}; "
                f"expected one of {self._OUTLIER_METHODS}"
            )

        numeric_cols = df.select_dtypes(include=[np.number]).columns
        outliers = {}

        for col in numeric_cols:
            col_data = df[col].dropna()

            if len(col_data) == 0:
                continue

            lower_bound = None
            upper_bound = None

            if method == 'iqr':
                q1 = col_data.quantile(0.25)
                q3 = col_data.quantile(0.75)
                iqr = q3 - q1
                lower_bound = q1 - threshold * iqr
                upper_bound = q3 + threshold * iqr

                outlier_mask = (col_data < lower_bound) | (col_data > upper_bound)
            else:
                z_scores = np.abs(stats.zscore(col_data))
                outlier_mask = z_scores > threshold

            outlier_count = int(outlier_mask.sum())

            if outlier_count > 0:
                outliers[col] = {
                    'count': outlier_count,
                    'percentage': round(outlier_count / len(col_data) * 100, 2),
                    'method': method,
                    'threshold': threshold,
                    'lower_bound': lower_bound,
                    'upper_bound': upper_bound,
                }

        return outliers

    def calculate_correlations(self, df: pd.DataFrame, method: str = 'pearson', threshold: float = 0.7) -> pd.DataFrame:
        """
        Calculate correlations between numeric columns.

        Args:
            df: Input DataFrame
            method: Correlation method ('pearson', 'spearman', 'kendall')
            threshold: Only show pairs whose absolute correlation is at
                least this value

        Returns:
            DataFrame with one row per qualifying column pair, sorted by
            ``Abs_Correlation`` descending. Empty when fewer than two
            numeric columns exist or no pair qualifies.
        """
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) < 2:
            return pd.DataFrame()

        corr_matrix = df[numeric_cols].corr(method=method)
        columns = list(corr_matrix.columns)

        # Walk the upper triangle only, so each unordered pair appears once
        # and the diagonal is skipped.
        high_corr = []
        for i, first in enumerate(columns):
            for j in range(i + 1, len(columns)):
                corr_value = corr_matrix.iloc[i, j]
                # A NaN correlation fails this comparison and is left out.
                if abs(corr_value) >= threshold:
                    high_corr.append({
                        'Variable_1': first,
                        'Variable_2': columns[j],
                        'Correlation': round(corr_value, 3),
                        'Abs_Correlation': round(abs(corr_value), 3),
                    })

        if not high_corr:
            return pd.DataFrame()

        return pd.DataFrame(high_corr).sort_values('Abs_Correlation', ascending=False)

    def generate_insights(self, df: pd.DataFrame) -> List[str]:
        """
        Generate automated insights from the data.

        Args:
            df: Input DataFrame

        Returns:
            Human-readable one-line findings: dataset size first, then
            missing data, duplicates, skew, zero-heavy columns, outliers,
            correlations and high-cardinality categoricals where they apply.
        """
        insights = []

        # Data size insight
        profile = self.generate_profile(df)
        insights.append(f"📊 Dataset contains {profile.n_rows:,} rows and {profile.n_columns} columns")

        # Missing data insights
        if profile.missing_percentage > 10:
            insights.append(f"⚠️ High missing data: {profile.missing_percentage:.1f}% of cells are missing")
        elif profile.missing_percentage > 0:
            insights.append(f"ℹ️ Missing data: {profile.missing_percentage:.1f}% of cells are missing")

        # Duplicate insights (duplicate_rows > 0 implies n_rows > 0)
        if profile.duplicate_rows > 0:
            dup_pct = profile.duplicate_rows / profile.n_rows * 100
            insights.append(f"🔄 Found {profile.duplicate_rows:,} duplicate rows ({dup_pct:.1f}%)")

        # Numeric columns insights
        numeric_stats = self.analyze_numeric_columns(df)
        if not numeric_stats.empty:
            # Check for skewed distributions
            highly_skewed = numeric_stats[numeric_stats['Skewness'].abs() > 2]
            if not highly_skewed.empty:
                insights.append(f"📈 {len(highly_skewed)} highly skewed numeric columns detected")

            # Check for columns with many zeros
            zero_heavy = numeric_stats[numeric_stats['Zeros_Pct'] > 50]
            if not zero_heavy.empty:
                insights.append(f"0️⃣ {len(zero_heavy)} columns have >50% zeros")

        # Outlier insights
        outliers = self.detect_outliers(df)
        if outliers:
            total_outliers = sum(o['count'] for o in outliers.values())
            insights.append(f"🎯 Detected {total_outliers:,} outliers across {len(outliers)} columns")

        # Correlation insights
        high_corr = self.calculate_correlations(df, threshold=0.8)
        if not high_corr.empty:
            insights.append(f"🔗 Found {len(high_corr)} high correlations (>0.8)")

        # Categorical insights
        cat_cols = df.select_dtypes(include=['object', 'category']).columns
        if len(cat_cols) > 0:
            high_cardinality = [col for col in cat_cols if df[col].nunique() > 50]
            if high_cardinality:
                insights.append(f"🏷️ {len(high_cardinality)} categorical columns with high cardinality (>50 unique values)")

        return insights

    def generate_report(
        self,
        df: pd.DataFrame,
        title: str = "Exploratory Data Analysis Report",
        output_file: Optional[str] = None
    ) -> str:
        """
        Generate comprehensive EDA report.

        Sections 3-7 are emitted only when they have content, so section
        numbers in the output may skip.

        Args:
            df: Input DataFrame
            title: Report title
            output_file: Optional file path to save report; missing parent
                directories are created and the file is written as UTF-8

        Returns:
            Report as string
        """
        rule = "-" * 80
        banner = "=" * 80
        report = []

        # Header
        report.append(banner)
        report.append(title.center(80))
        report.append(banner)
        report.append("")

        # 1. Data Profile
        report.append("## 1. DATA PROFILE")
        report.append(rule)
        profile = self.generate_profile(df)
        for key, value in vars(profile).items():
            report.append(f" {key.replace('_', ' ').title()}: {value}")
        report.append("")

        # 2. Automated Insights
        report.append("## 2. KEY INSIGHTS")
        report.append(rule)
        for insight in self.generate_insights(df):
            report.append(f" {insight}")
        report.append("")

        # 3. Missing Data Analysis
        missing = self.analyze_missing_data(df)
        if not missing.empty:
            report.append("## 3. MISSING DATA")
            report.append(rule)
            report.append(missing.to_string(index=False))
            report.append("")

        # 4. Numeric Columns
        numeric_stats = self.analyze_numeric_columns(df)
        if not numeric_stats.empty:
            report.append("## 4. NUMERIC COLUMNS STATISTICS")
            report.append(rule)
            report.append(numeric_stats.to_string(index=False))
            report.append("")

        # 5. Outliers
        outliers = self.detect_outliers(df)
        if outliers:
            report.append("## 5. OUTLIER DETECTION")
            report.append(rule)
            # Named `info`, not `stats`, so the scipy.stats module imported
            # at file level is not shadowed inside this method.
            for col, info in outliers.items():
                report.append(f" {col}: {info['count']} outliers ({info['percentage']:.2f}%)")
            report.append("")

        # 6. Correlations
        high_corr = self.calculate_correlations(df, threshold=0.7)
        if not high_corr.empty:
            report.append("## 6. HIGH CORRELATIONS (>0.7)")
            report.append(rule)
            report.append(high_corr.to_string(index=False))
            report.append("")

        # 7. Categorical Analysis
        cat_analysis = self.analyze_categorical_columns(df)
        if cat_analysis:
            report.append("## 7. CATEGORICAL COLUMNS")
            report.append(rule)
            for col, table in cat_analysis.items():
                report.append(f"\n {col}:")
                report.append(table.to_string(index=False, max_rows=10))
            report.append("")

        report_text = "\n".join(report)

        # Save to file if specified
        if output_file:
            Path(output_file).parent.mkdir(parents=True, exist_ok=True)
            # The report contains emoji, so the encoding is pinned instead of
            # relying on the platform default codec.
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(report_text)
            print(f"✅ Report saved to {output_file}")

        return report_text
388
+
389
+
390
+ # Example usage
391
if __name__ == "__main__":
    # Demo: build a synthetic marketing-campaign dataset, run the full EDA
    # report on it, and print a couple of the underlying tables.

    # Fixed seed so the demo output is reproducible. The order of the
    # np.random calls below determines the generated values.
    np.random.seed(42)

    n_samples = 1000

    sample_data = pd.DataFrame({
        'campaign_id': [f'C{i:04d}' for i in range(n_samples)],
        'campaign_type': np.random.choice(['Email', 'Social', 'Search', 'Display'], n_samples),
        'budget': np.random.exponential(5000, n_samples),
        'impressions': np.random.poisson(10000, n_samples),
        'clicks': np.random.poisson(250, n_samples),
        # Stored as float from the start: NaN is injected below, and relying
        # on an implicit int -> float upcast during assignment is deprecated
        # in recent pandas releases.
        'conversions': np.random.poisson(15, n_samples).astype(float),
        'revenue': np.random.exponential(2000, n_samples),
        'industry': np.random.choice(['Tech', 'Finance', 'Healthcare', 'Retail', 'Manufacturing'], n_samples),
        'region': np.random.choice(['North', 'South', 'East', 'West'], n_samples),
        # An explicit Timedelta gives hourly spacing without depending on the
        # 'H' / 'h' offset-alias spelling, which differs between pandas versions.
        'start_date': pd.date_range('2025-01-01', periods=n_samples, freq=pd.Timedelta(hours=1)),
    })

    # Add some missing values
    sample_data.loc[np.random.choice(n_samples, 50, replace=False), 'conversions'] = np.nan
    sample_data.loc[np.random.choice(n_samples, 30, replace=False), 'revenue'] = np.nan

    # Calculate derived metrics
    sample_data['ctr'] = sample_data['clicks'] / sample_data['impressions'] * 100
    sample_data['conversion_rate'] = sample_data['conversions'] / sample_data['clicks'] * 100
    sample_data['roas'] = sample_data['revenue'] / sample_data['budget']
    sample_data['cpc'] = sample_data['budget'] / sample_data['clicks']

    print("=" * 80)
    print("AUTOMATED EDA DEMO")
    print("=" * 80)

    # Generate EDA; the report is also written to the current working directory.
    eda = EDAGenerator()

    report = eda.generate_report(
        df=sample_data,
        title="Marketing Campaign Performance Analysis",
        output_file="campaign_eda_report.txt"
    )

    print(report)

    print("\n" + "=" * 80)
    print("DETAILED STATISTICS")
    print("=" * 80)

    # Show numeric statistics
    print("\n📊 Numeric Columns:")
    print(eda.analyze_numeric_columns(sample_data))

    # Show correlations (lower threshold than the report's 0.7)
    print("\n🔗 Correlations:")
    corr = eda.calculate_correlations(sample_data, threshold=0.5)
    print(corr if not corr.empty else "No high correlations found")