tech-hub-skills 1.2.0 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. package/{LICENSE → .claude/LICENSE} +21 -21
  2. package/.claude/README.md +291 -0
  3. package/.claude/bin/cli.js +266 -0
  4. package/{bin → .claude/bin}/copilot.js +182 -182
  5. package/{bin → .claude/bin}/postinstall.js +42 -42
  6. package/{tech_hub_skills/skills → .claude/commands}/README.md +336 -336
  7. package/{tech_hub_skills/skills → .claude/commands}/ai-engineer.md +104 -104
  8. package/{tech_hub_skills/skills → .claude/commands}/aws.md +143 -143
  9. package/{tech_hub_skills/skills → .claude/commands}/azure.md +149 -149
  10. package/{tech_hub_skills/skills → .claude/commands}/backend-developer.md +108 -108
  11. package/{tech_hub_skills/skills → .claude/commands}/code-review.md +399 -399
  12. package/{tech_hub_skills/skills → .claude/commands}/compliance-automation.md +747 -747
  13. package/{tech_hub_skills/skills → .claude/commands}/compliance-officer.md +108 -108
  14. package/{tech_hub_skills/skills → .claude/commands}/data-engineer.md +113 -113
  15. package/{tech_hub_skills/skills → .claude/commands}/data-governance.md +102 -102
  16. package/{tech_hub_skills/skills → .claude/commands}/data-scientist.md +123 -123
  17. package/{tech_hub_skills/skills → .claude/commands}/database-admin.md +109 -109
  18. package/{tech_hub_skills/skills → .claude/commands}/devops.md +160 -160
  19. package/{tech_hub_skills/skills → .claude/commands}/docker.md +160 -160
  20. package/{tech_hub_skills/skills → .claude/commands}/enterprise-dashboard.md +613 -613
  21. package/{tech_hub_skills/skills → .claude/commands}/finops.md +184 -184
  22. package/{tech_hub_skills/skills → .claude/commands}/frontend-developer.md +108 -108
  23. package/{tech_hub_skills/skills → .claude/commands}/gcp.md +143 -143
  24. package/{tech_hub_skills/skills → .claude/commands}/ml-engineer.md +115 -115
  25. package/{tech_hub_skills/skills → .claude/commands}/mlops.md +187 -187
  26. package/{tech_hub_skills/skills → .claude/commands}/network-engineer.md +109 -109
  27. package/{tech_hub_skills/skills → .claude/commands}/optimization-advisor.md +329 -329
  28. package/{tech_hub_skills/skills → .claude/commands}/orchestrator.md +623 -623
  29. package/{tech_hub_skills/skills → .claude/commands}/platform-engineer.md +102 -102
  30. package/{tech_hub_skills/skills → .claude/commands}/process-automation.md +226 -226
  31. package/{tech_hub_skills/skills → .claude/commands}/process-changelog.md +184 -184
  32. package/{tech_hub_skills/skills → .claude/commands}/process-documentation.md +484 -484
  33. package/{tech_hub_skills/skills → .claude/commands}/process-kanban.md +324 -324
  34. package/{tech_hub_skills/skills → .claude/commands}/process-versioning.md +214 -214
  35. package/{tech_hub_skills/skills → .claude/commands}/product-designer.md +104 -104
  36. package/{tech_hub_skills/skills → .claude/commands}/project-starter.md +443 -443
  37. package/{tech_hub_skills/skills → .claude/commands}/qa-engineer.md +109 -109
  38. package/{tech_hub_skills/skills → .claude/commands}/security-architect.md +135 -135
  39. package/{tech_hub_skills/skills → .claude/commands}/sre.md +109 -109
  40. package/{tech_hub_skills/skills → .claude/commands}/system-design.md +126 -126
  41. package/{tech_hub_skills/skills → .claude/commands}/technical-writer.md +101 -101
  42. package/.claude/package.json +46 -0
  43. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
  44. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -0
  45. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -0
  46. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -0
  47. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
  48. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -0
  49. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -0
  50. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
  51. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
  52. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
  53. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
  54. package/{tech_hub_skills → .claude}/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
  55. package/{tech_hub_skills → .claude}/roles/azure/skills/02-data-factory/README.md +264 -264
  56. package/{tech_hub_skills → .claude}/roles/azure/skills/03-synapse-analytics/README.md +264 -264
  57. package/{tech_hub_skills → .claude}/roles/azure/skills/04-databricks/README.md +264 -264
  58. package/{tech_hub_skills → .claude}/roles/azure/skills/05-functions/README.md +264 -264
  59. package/{tech_hub_skills → .claude}/roles/azure/skills/06-kubernetes-service/README.md +264 -264
  60. package/{tech_hub_skills → .claude}/roles/azure/skills/07-openai-service/README.md +264 -264
  61. package/{tech_hub_skills → .claude}/roles/azure/skills/08-machine-learning/README.md +264 -264
  62. package/{tech_hub_skills → .claude}/roles/azure/skills/09-storage-adls/README.md +264 -264
  63. package/{tech_hub_skills → .claude}/roles/azure/skills/10-networking/README.md +264 -264
  64. package/{tech_hub_skills → .claude}/roles/azure/skills/11-sql-cosmos/README.md +264 -264
  65. package/{tech_hub_skills → .claude}/roles/azure/skills/12-event-hubs/README.md +264 -264
  66. package/{tech_hub_skills → .claude}/roles/code-review/skills/01-automated-code-review/README.md +394 -394
  67. package/{tech_hub_skills → .claude}/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
  68. package/{tech_hub_skills → .claude}/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
  69. package/{tech_hub_skills → .claude}/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
  70. package/{tech_hub_skills → .claude}/roles/code-review/skills/05-review-analytics/README.md +540 -540
  71. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
  72. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -0
  73. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -0
  74. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
  75. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/03-data-quality/README.md +579 -579
  76. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
  77. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
  78. package/{tech_hub_skills → .claude}/roles/data-governance/skills/01-data-catalog/README.md +112 -112
  79. package/{tech_hub_skills → .claude}/roles/data-governance/skills/02-data-lineage/README.md +129 -129
  80. package/{tech_hub_skills → .claude}/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
  81. package/{tech_hub_skills → .claude}/roles/data-governance/skills/04-access-control/README.md +39 -39
  82. package/{tech_hub_skills → .claude}/roles/data-governance/skills/05-master-data-management/README.md +40 -40
  83. package/{tech_hub_skills → .claude}/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
  84. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
  85. package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -0
  86. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
  87. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
  88. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
  89. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
  90. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
  91. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/07-experimentation/README.md +264 -264
  92. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
  93. package/{tech_hub_skills → .claude}/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
  94. package/{tech_hub_skills → .claude}/roles/devops/skills/02-container-orchestration/README.md +264 -264
  95. package/{tech_hub_skills → .claude}/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
  96. package/{tech_hub_skills → .claude}/roles/devops/skills/04-gitops/README.md +264 -264
  97. package/{tech_hub_skills → .claude}/roles/devops/skills/05-environment-management/README.md +264 -264
  98. package/{tech_hub_skills → .claude}/roles/devops/skills/06-automated-testing/README.md +264 -264
  99. package/{tech_hub_skills → .claude}/roles/devops/skills/07-release-management/README.md +264 -264
  100. package/{tech_hub_skills → .claude}/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
  101. package/{tech_hub_skills → .claude}/roles/devops/skills/09-devsecops/README.md +265 -265
  102. package/{tech_hub_skills → .claude}/roles/finops/skills/01-cost-visibility/README.md +264 -264
  103. package/{tech_hub_skills → .claude}/roles/finops/skills/02-resource-tagging/README.md +264 -264
  104. package/{tech_hub_skills → .claude}/roles/finops/skills/03-budget-management/README.md +264 -264
  105. package/{tech_hub_skills → .claude}/roles/finops/skills/04-reserved-instances/README.md +264 -264
  106. package/{tech_hub_skills → .claude}/roles/finops/skills/05-spot-optimization/README.md +264 -264
  107. package/{tech_hub_skills → .claude}/roles/finops/skills/06-storage-tiering/README.md +264 -264
  108. package/{tech_hub_skills → .claude}/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
  109. package/{tech_hub_skills → .claude}/roles/finops/skills/08-chargeback/README.md +264 -264
  110. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
  111. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
  112. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/03-model-training/README.md +704 -704
  113. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
  114. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
  115. package/{tech_hub_skills → .claude}/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
  116. package/{tech_hub_skills → .claude}/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
  117. package/{tech_hub_skills → .claude}/roles/mlops/skills/03-model-registry/README.md +264 -264
  118. package/{tech_hub_skills → .claude}/roles/mlops/skills/04-feature-store/README.md +264 -264
  119. package/{tech_hub_skills → .claude}/roles/mlops/skills/05-model-deployment/README.md +264 -264
  120. package/{tech_hub_skills → .claude}/roles/mlops/skills/06-model-observability/README.md +264 -264
  121. package/{tech_hub_skills → .claude}/roles/mlops/skills/07-data-versioning/README.md +264 -264
  122. package/{tech_hub_skills → .claude}/roles/mlops/skills/08-ab-testing/README.md +264 -264
  123. package/{tech_hub_skills → .claude}/roles/mlops/skills/09-automated-retraining/README.md +264 -264
  124. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
  125. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
  126. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
  127. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
  128. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
  129. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
  130. package/{tech_hub_skills → .claude}/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
  131. package/{tech_hub_skills → .claude}/roles/product-designer/skills/02-user-research/README.md +382 -382
  132. package/{tech_hub_skills → .claude}/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
  133. package/{tech_hub_skills → .claude}/roles/product-designer/skills/04-ux-design/README.md +496 -496
  134. package/{tech_hub_skills → .claude}/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
  135. package/{tech_hub_skills → .claude}/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
  136. package/{tech_hub_skills → .claude}/roles/security-architect/skills/01-pii-detection/README.md +319 -319
  137. package/{tech_hub_skills → .claude}/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
  138. package/{tech_hub_skills → .claude}/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
  139. package/{tech_hub_skills → .claude}/roles/security-architect/skills/04-iam/README.md +264 -264
  140. package/{tech_hub_skills → .claude}/roles/security-architect/skills/05-application-security/README.md +264 -264
  141. package/{tech_hub_skills → .claude}/roles/security-architect/skills/06-secrets-management/README.md +264 -264
  142. package/{tech_hub_skills → .claude}/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
  143. package/{tech_hub_skills → .claude}/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
  144. package/{tech_hub_skills → .claude}/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
  145. package/{tech_hub_skills → .claude}/roles/system-design/skills/03-scalability/README.md +264 -264
  146. package/{tech_hub_skills → .claude}/roles/system-design/skills/04-high-availability/README.md +264 -264
  147. package/{tech_hub_skills → .claude}/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
  148. package/{tech_hub_skills → .claude}/roles/system-design/skills/06-api-design/README.md +264 -264
  149. package/{tech_hub_skills → .claude}/roles/system-design/skills/07-observability-architecture/README.md +264 -264
  150. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
  151. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/README.md +521 -521
  152. package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -0
  153. package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -0
  154. package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -0
  155. package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -0
  156. package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -0
  157. package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -0
  158. package/.claude/skills/README.md +336 -0
  159. package/.claude/skills/ai-engineer.md +104 -0
  160. package/.claude/skills/aws.md +143 -0
  161. package/.claude/skills/azure.md +149 -0
  162. package/.claude/skills/backend-developer.md +108 -0
  163. package/.claude/skills/code-review.md +399 -0
  164. package/.claude/skills/compliance-automation.md +747 -0
  165. package/.claude/skills/compliance-officer.md +108 -0
  166. package/.claude/skills/data-engineer.md +113 -0
  167. package/.claude/skills/data-governance.md +102 -0
  168. package/.claude/skills/data-scientist.md +123 -0
  169. package/.claude/skills/database-admin.md +109 -0
  170. package/.claude/skills/devops.md +160 -0
  171. package/.claude/skills/docker.md +160 -0
  172. package/.claude/skills/enterprise-dashboard.md +613 -0
  173. package/.claude/skills/finops.md +184 -0
  174. package/.claude/skills/frontend-developer.md +108 -0
  175. package/.claude/skills/gcp.md +143 -0
  176. package/.claude/skills/ml-engineer.md +115 -0
  177. package/.claude/skills/mlops.md +187 -0
  178. package/.claude/skills/network-engineer.md +109 -0
  179. package/.claude/skills/optimization-advisor.md +329 -0
  180. package/.claude/skills/orchestrator.md +623 -0
  181. package/.claude/skills/platform-engineer.md +102 -0
  182. package/.claude/skills/process-automation.md +226 -0
  183. package/.claude/skills/process-changelog.md +184 -0
  184. package/.claude/skills/process-documentation.md +484 -0
  185. package/.claude/skills/process-kanban.md +324 -0
  186. package/.claude/skills/process-versioning.md +214 -0
  187. package/.claude/skills/product-designer.md +104 -0
  188. package/.claude/skills/project-starter.md +443 -0
  189. package/.claude/skills/qa-engineer.md +109 -0
  190. package/.claude/skills/security-architect.md +135 -0
  191. package/.claude/skills/sre.md +109 -0
  192. package/.claude/skills/system-design.md +126 -0
  193. package/.claude/skills/technical-writer.md +101 -0
  194. package/.gitattributes +2 -0
  195. package/GITHUB_COPILOT.md +106 -0
  196. package/README.md +192 -291
  197. package/package.json +16 -46
  198. package/bin/cli.js +0 -241
@@ -0,0 +1,446 @@
1
+ """
2
+ Automated Exploratory Data Analysis (EDA) Generator
3
+ Generate comprehensive EDA reports with minimal code.
4
+ """
5
+
6
+ import pandas as pd
7
+ import numpy as np
8
+ from typing import Dict, List, Optional, Any, Tuple
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ import matplotlib.pyplot as plt
12
+ import seaborn as sns
13
+ from scipy import stats
14
+ import warnings
15
+
16
# NOTE(review): this disables every warning for the whole process as an
# import-time side effect of this module.
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn "whitegrid" theme and a 12x6 default figure size.
sns.set_style(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
21
+
22
+
23
@dataclass
class DataProfileSummary:
    """Dataset-level profile figures produced by ``EDAGenerator.generate_profile``."""

    # Shape of the dataset.
    n_rows: int
    n_columns: int

    # Number of columns per detected dtype family.
    n_numeric: int
    n_categorical: int
    n_datetime: int

    # Missing values: absolute cell count and share of all cells (percent).
    missing_cells: int
    missing_percentage: float

    # Number of rows flagged as duplicates.
    duplicate_rows: int

    # Deep memory footprint of the DataFrame, in mebibytes.
    memory_usage_mb: float
35
+
36
+
37
class EDAGenerator:
    """Automated EDA report generator.

    The ``analyze_*``, ``detect_outliers`` and ``calculate_correlations``
    methods each inspect a DataFrame and return their findings;
    ``generate_insights`` and ``generate_report`` combine those findings
    into human-readable text.
    """

    # Outlier detection strategies understood by ``detect_outliers``.
    _OUTLIER_METHODS = ('iqr', 'zscore')

    def __init__(self, figsize: Tuple[int, int] = (12, 6)):
        """
        Initialize EDA generator.

        Args:
            figsize: Default figure size for plots
        """
        self.figsize = figsize
        self.report_sections = []

    def generate_profile(self, df: pd.DataFrame) -> "DataProfileSummary":
        """
        Generate data profile summary.

        Args:
            df: Input DataFrame

        Returns:
            DataProfileSummary object holding plain Python ints/floats
        """
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        # 'datetime64' alone only matches naive timestamps; 'datetimetz' adds
        # the timezone-aware columns so they are counted as well.
        datetime_cols = df.select_dtypes(include=['datetime64', 'datetimetz']).columns

        missing_cells = int(df.isna().sum().sum())
        total_cells = df.shape[0] * df.shape[1]
        # Guard against 0/0 for a DataFrame without rows or without columns.
        missing_pct = (missing_cells / total_cells * 100) if total_cells > 0 else 0.0

        # Cast NumPy scalars so the fields match the declared int/float types.
        return DataProfileSummary(
            n_rows=len(df),
            n_columns=len(df.columns),
            n_numeric=len(numeric_cols),
            n_categorical=len(categorical_cols),
            n_datetime=len(datetime_cols),
            missing_cells=missing_cells,
            missing_percentage=round(float(missing_pct), 2),
            duplicate_rows=int(df.duplicated().sum()),
            memory_usage_mb=round(float(df.memory_usage(deep=True).sum()) / 1024**2, 2),
        )

    def analyze_missing_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Analyze missing data patterns.

        Args:
            df: Input DataFrame

        Returns:
            One row per column that has at least one missing value, with the
            columns ``Column``, ``Missing_Count``, ``Missing_Percentage`` and
            ``Data_Type``, sorted by ``Missing_Percentage`` descending.
        """
        missing_counts = df.isna().sum()

        missing_stats = pd.DataFrame({
            'Column': df.columns,
            'Missing_Count': missing_counts.values,
            'Missing_Percentage': (missing_counts / len(df) * 100).values,
            'Data_Type': df.dtypes.values,
        })

        has_missing = missing_stats['Missing_Count'] > 0
        return missing_stats[has_missing].sort_values(
            'Missing_Percentage', ascending=False
        )

    def analyze_numeric_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Comprehensive analysis of numeric columns.

        Args:
            df: Input DataFrame

        Returns:
            One row of descriptive statistics per numeric column, rounded to
            three decimals. Columns without any non-missing value are
            skipped; an empty DataFrame is returned when there are no numeric
            columns.
        """
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) == 0:
            return pd.DataFrame()

        stats_list = []

        for col in numeric_cols:
            col_data = df[col].dropna()

            if len(col_data) == 0:
                continue

            # Compute the quartiles and the zero count once and reuse them.
            q25 = col_data.quantile(0.25)
            q75 = col_data.quantile(0.75)
            zero_count = (col_data == 0).sum()

            stats_list.append({
                'Column': col,
                'Count': len(col_data),
                'Mean': col_data.mean(),
                'Median': col_data.median(),
                'Std': col_data.std(),
                'Min': col_data.min(),
                'Max': col_data.max(),
                'Q25': q25,
                'Q75': q75,
                'IQR': q75 - q25,
                'Skewness': col_data.skew(),
                'Kurtosis': col_data.kurtosis(),
                'Zeros': zero_count,
                'Zeros_Pct': zero_count / len(col_data) * 100,
            })

        return pd.DataFrame(stats_list).round(3)

    def analyze_categorical_columns(self, df: pd.DataFrame, max_categories: int = 20) -> Dict[str, pd.DataFrame]:
        """
        Analyze categorical columns.

        Args:
            df: Input DataFrame
            max_categories: Columns with more distinct values than this are
                left out of the result

        Returns:
            Mapping of column name to a frequency table with the columns
            ``Value``, ``Count`` and ``Percentage``. Percentages are relative
            to the total number of rows in ``df``.
        """
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns

        results = {}

        for col in categorical_cols:
            value_counts = df[col].value_counts()

            if len(value_counts) > max_categories:
                continue

            results[col] = pd.DataFrame({
                'Value': value_counts.index,
                'Count': value_counts.values,
                'Percentage': (value_counts / len(df) * 100).values,
            }).round(2)

        return results

    def detect_outliers(self, df: pd.DataFrame, method: str = 'iqr', threshold: float = 1.5) -> Dict[str, Dict[str, Any]]:
        """
        Detect outliers in numeric columns.

        Args:
            df: Input DataFrame
            method: Detection method ('iqr' or 'zscore')
            threshold: Threshold value (1.5 for IQR, 3 for z-score)

        Returns:
            Dictionary with outlier information per column; only columns with
            at least one outlier are included. ``lower_bound`` and
            ``upper_bound`` are ``None`` for the z-score method.

        Raises:
            ValueError: If ``method`` is not 'iqr' or 'zscore'.
        """
        # Fail loudly: silently returning {} for a misspelled method would be
        # indistinguishable from "no outliers found".
        if method not in self._OUTLIER_METHODS:
            raise ValueError(
                f"Unknown outlier detection method {method!r}; "
                f"expected one of {self._OUTLIER_METHODS}"
            )

        numeric_cols = df.select_dtypes(include=[np.number]).columns
        outliers = {}

        for col in numeric_cols:
            col_data = df[col].dropna()

            if len(col_data) == 0:
                continue

            lower_bound = None
            upper_bound = None

            if method == 'iqr':
                q1 = col_data.quantile(0.25)
                q3 = col_data.quantile(0.75)
                iqr = q3 - q1
                lower_bound = float(q1 - threshold * iqr)
                upper_bound = float(q3 + threshold * iqr)
                outlier_mask = (col_data < lower_bound) | (col_data > upper_bound)
            else:  # 'zscore'
                z_scores = np.abs(stats.zscore(col_data))
                outlier_mask = z_scores > threshold

            outlier_count = int(outlier_mask.sum())
            if outlier_count == 0:
                continue

            outliers[col] = {
                'count': outlier_count,
                'percentage': round(outlier_count / len(col_data) * 100, 2),
                'method': method,
                'threshold': threshold,
                'lower_bound': lower_bound,
                'upper_bound': upper_bound,
            }

        return outliers

    def calculate_correlations(self, df: pd.DataFrame, method: str = 'pearson', threshold: float = 0.7) -> pd.DataFrame:
        """
        Calculate correlations between numeric columns.

        Args:
            df: Input DataFrame
            method: Correlation method ('pearson', 'spearman', 'kendall')
            threshold: Only pairs whose absolute correlation is at least this
                value are reported

        Returns:
            DataFrame with one row per strongly correlated column pair,
            sorted by absolute correlation (descending); an empty DataFrame
            when there are fewer than two numeric columns or no pair
            qualifies.
        """
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) < 2:
            return pd.DataFrame()

        corr_matrix = df[numeric_cols].corr(method=method)
        names = corr_matrix.columns

        # Walk the upper triangle only so every pair is reported once.
        high_corr = []
        for i in range(len(names)):
            for j in range(i + 1, len(names)):
                corr_value = corr_matrix.iloc[i, j]
                # NaN correlations compare False and are therefore skipped.
                if abs(corr_value) >= threshold:
                    high_corr.append({
                        'Variable_1': names[i],
                        'Variable_2': names[j],
                        'Correlation': round(corr_value, 3),
                        'Abs_Correlation': round(abs(corr_value), 3),
                    })

        if not high_corr:
            return pd.DataFrame()

        return pd.DataFrame(high_corr).sort_values('Abs_Correlation', ascending=False)

    def generate_insights(self, df: pd.DataFrame) -> List[str]:
        """
        Generate automated insights from the data.

        Args:
            df: Input DataFrame

        Returns:
            List of one-line findings (size, missing data, duplicates, skew,
            zero-heavy columns, outliers, correlations, high cardinality).
        """
        insights = []

        # Data size insight
        profile = self.generate_profile(df)
        insights.append(f"📊 Dataset contains {profile.n_rows:,} rows and {profile.n_columns} columns")

        # Missing data insights
        if profile.missing_percentage > 10:
            insights.append(f"⚠️ High missing data: {profile.missing_percentage:.1f}% of cells are missing")
        elif profile.missing_percentage > 0:
            insights.append(f"ℹ️ Missing data: {profile.missing_percentage:.1f}% of cells are missing")

        # Duplicate insights (duplicate_rows > 0 implies n_rows > 0)
        if profile.duplicate_rows > 0:
            dup_pct = profile.duplicate_rows / profile.n_rows * 100
            insights.append(f"🔄 Found {profile.duplicate_rows:,} duplicate rows ({dup_pct:.1f}%)")

        # Numeric columns insights
        numeric_stats = self.analyze_numeric_columns(df)
        if not numeric_stats.empty:
            # Check for skewed distributions
            highly_skewed = numeric_stats[numeric_stats['Skewness'].abs() > 2]
            if not highly_skewed.empty:
                insights.append(f"📈 {len(highly_skewed)} highly skewed numeric columns detected")

            # Check for columns with many zeros
            zero_heavy = numeric_stats[numeric_stats['Zeros_Pct'] > 50]
            if not zero_heavy.empty:
                insights.append(f"0️⃣ {len(zero_heavy)} columns have >50% zeros")

        # Outlier insights
        outliers = self.detect_outliers(df)
        if outliers:
            total_outliers = sum(info['count'] for info in outliers.values())
            insights.append(f"🎯 Detected {total_outliers:,} outliers across {len(outliers)} columns")

        # Correlation insights
        high_corr = self.calculate_correlations(df, threshold=0.8)
        if not high_corr.empty:
            insights.append(f"🔗 Found {len(high_corr)} high correlations (>0.8)")

        # Categorical insights
        cat_cols = df.select_dtypes(include=['object', 'category']).columns
        high_cardinality = [col for col in cat_cols if df[col].nunique() > 50]
        if high_cardinality:
            insights.append(f"🏷️ {len(high_cardinality)} categorical columns with high cardinality (>50 unique values)")

        return insights

    @staticmethod
    def _add_heading(report: List[str], heading: str) -> None:
        """Append a section heading followed by an 80-character rule."""
        report.append(heading)
        report.append("-" * 80)

    def generate_report(
        self,
        df: pd.DataFrame,
        title: str = "Exploratory Data Analysis Report",
        output_file: Optional[str] = None
    ) -> str:
        """
        Generate comprehensive EDA report.

        Args:
            df: Input DataFrame
            title: Report title
            output_file: Optional file path to save report; missing parent
                directories are created and the file is written as UTF-8

        Returns:
            Report as string
        """
        report = []

        # Header
        report.append("=" * 80)
        report.append(title.center(80))
        report.append("=" * 80)
        report.append("")

        # 1. Data Profile
        self._add_heading(report, "## 1. DATA PROFILE")
        profile = self.generate_profile(df)
        for key, value in vars(profile).items():
            report.append(f" {key.replace('_', ' ').title()}: {value}")
        report.append("")

        # 2. Automated Insights
        self._add_heading(report, "## 2. KEY INSIGHTS")
        for insight in self.generate_insights(df):
            report.append(f" {insight}")
        report.append("")

        # 3. Missing Data Analysis
        missing = self.analyze_missing_data(df)
        if not missing.empty:
            self._add_heading(report, "## 3. MISSING DATA")
            report.append(missing.to_string(index=False))
            report.append("")

        # 4. Numeric Columns
        numeric_stats = self.analyze_numeric_columns(df)
        if not numeric_stats.empty:
            self._add_heading(report, "## 4. NUMERIC COLUMNS STATISTICS")
            report.append(numeric_stats.to_string(index=False))
            report.append("")

        # 5. Outliers
        outliers = self.detect_outliers(df)
        if outliers:
            self._add_heading(report, "## 5. OUTLIER DETECTION")
            # Loop variable deliberately not called `stats`: that name is the
            # scipy module imported at file level.
            for col, info in outliers.items():
                report.append(f" {col}: {info['count']} outliers ({info['percentage']:.2f}%)")
            report.append("")

        # 6. Correlations
        high_corr = self.calculate_correlations(df, threshold=0.7)
        if not high_corr.empty:
            self._add_heading(report, "## 6. HIGH CORRELATIONS (>0.7)")
            report.append(high_corr.to_string(index=False))
            report.append("")

        # 7. Categorical Analysis
        cat_analysis = self.analyze_categorical_columns(df)
        if cat_analysis:
            self._add_heading(report, "## 7. CATEGORICAL COLUMNS")
            for col, freq_table in cat_analysis.items():
                report.append(f"\n {col}:")
                report.append(freq_table.to_string(index=False, max_rows=10))
            report.append("")

        report_text = "\n".join(report)

        # Save to file if specified
        if output_file:
            Path(output_file).parent.mkdir(parents=True, exist_ok=True)
            # The report contains emoji; an explicit UTF-8 encoding avoids a
            # UnicodeEncodeError on platforms whose default encoding cannot
            # represent them.
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(report_text)
            print(f"✅ Report saved to {output_file}")

        return report_text
388
+
389
+
390
+ # Example usage
391
def make_sample_campaigns(n_samples: int = 1000, seed: int = 42) -> pd.DataFrame:
    """
    Build a synthetic marketing-campaign dataset for the demo.

    Args:
        n_samples: Number of campaign rows to generate (expected to be positive)
        seed: Seed for NumPy's global random generator, for reproducibility

    Returns:
        DataFrame with raw campaign columns, injected missing values
        (5% of ``conversions``, 3% of ``revenue``) and the derived metrics
        ``ctr``, ``conversion_rate``, ``roas`` and ``cpc``.
    """
    np.random.seed(seed)

    sample_data = pd.DataFrame({
        'campaign_id': [f'C{i:04d}' for i in range(n_samples)],
        'campaign_type': np.random.choice(['Email', 'Social', 'Search', 'Display'], n_samples),
        'budget': np.random.exponential(5000, n_samples),
        'impressions': np.random.poisson(10000, n_samples),
        'clicks': np.random.poisson(250, n_samples),
        # Float from the start: NaN is written into this column below, and
        # assigning NaN into an integer column is an incompatible-dtype
        # assignment that newer pandas versions warn about.
        'conversions': np.random.poisson(15, n_samples).astype(float),
        'revenue': np.random.exponential(2000, n_samples),
        'industry': np.random.choice(['Tech', 'Finance', 'Healthcare', 'Retail', 'Manufacturing'], n_samples),
        'region': np.random.choice(['North', 'South', 'East', 'West'], n_samples),
        # A Timedelta step avoids the 'H' frequency alias, which recent
        # pandas releases deprecate in favour of 'h'.
        'start_date': pd.date_range('2025-01-01', periods=n_samples, freq=pd.Timedelta(hours=1)),
    })

    # Add some missing values (50 and 30 rows at the default size of 1000)
    n_missing_conversions = n_samples // 20
    n_missing_revenue = n_samples * 3 // 100
    sample_data.loc[np.random.choice(n_samples, n_missing_conversions, replace=False), 'conversions'] = np.nan
    sample_data.loc[np.random.choice(n_samples, n_missing_revenue, replace=False), 'revenue'] = np.nan

    # Calculate derived metrics
    sample_data['ctr'] = sample_data['clicks'] / sample_data['impressions'] * 100
    sample_data['conversion_rate'] = sample_data['conversions'] / sample_data['clicks'] * 100
    sample_data['roas'] = sample_data['revenue'] / sample_data['budget']
    sample_data['cpc'] = sample_data['budget'] / sample_data['clicks']

    return sample_data


def main() -> None:
    """Run the EDA demo: build sample data, write the report and print details."""
    sample_data = make_sample_campaigns()

    print("=" * 80)
    print("AUTOMATED EDA DEMO")
    print("=" * 80)

    # Generate EDA
    eda = EDAGenerator()

    report = eda.generate_report(
        df=sample_data,
        title="Marketing Campaign Performance Analysis",
        output_file="campaign_eda_report.txt"
    )

    print(report)

    print("\n" + "=" * 80)
    print("DETAILED STATISTICS")
    print("=" * 80)

    # Show numeric statistics
    print("\n📊 Numeric Columns:")
    print(eda.analyze_numeric_columns(sample_data))

    # Show correlations
    print("\n🔗 Correlations:")
    corr = eda.calculate_correlations(sample_data, threshold=0.5)
    print(corr if not corr.empty else "No high correlations found")


if __name__ == "__main__":
    main()