tech-hub-skills 1.2.0 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. package/{LICENSE → .claude/LICENSE} +21 -21
  2. package/.claude/README.md +291 -0
  3. package/.claude/bin/cli.js +266 -0
  4. package/{bin → .claude/bin}/copilot.js +182 -182
  5. package/{bin → .claude/bin}/postinstall.js +42 -42
  6. package/{tech_hub_skills/skills → .claude/commands}/README.md +336 -336
  7. package/{tech_hub_skills/skills → .claude/commands}/ai-engineer.md +104 -104
  8. package/{tech_hub_skills/skills → .claude/commands}/aws.md +143 -143
  9. package/{tech_hub_skills/skills → .claude/commands}/azure.md +149 -149
  10. package/{tech_hub_skills/skills → .claude/commands}/backend-developer.md +108 -108
  11. package/{tech_hub_skills/skills → .claude/commands}/code-review.md +399 -399
  12. package/{tech_hub_skills/skills → .claude/commands}/compliance-automation.md +747 -747
  13. package/{tech_hub_skills/skills → .claude/commands}/compliance-officer.md +108 -108
  14. package/{tech_hub_skills/skills → .claude/commands}/data-engineer.md +113 -113
  15. package/{tech_hub_skills/skills → .claude/commands}/data-governance.md +102 -102
  16. package/{tech_hub_skills/skills → .claude/commands}/data-scientist.md +123 -123
  17. package/{tech_hub_skills/skills → .claude/commands}/database-admin.md +109 -109
  18. package/{tech_hub_skills/skills → .claude/commands}/devops.md +160 -160
  19. package/{tech_hub_skills/skills → .claude/commands}/docker.md +160 -160
  20. package/{tech_hub_skills/skills → .claude/commands}/enterprise-dashboard.md +613 -613
  21. package/{tech_hub_skills/skills → .claude/commands}/finops.md +184 -184
  22. package/{tech_hub_skills/skills → .claude/commands}/frontend-developer.md +108 -108
  23. package/{tech_hub_skills/skills → .claude/commands}/gcp.md +143 -143
  24. package/{tech_hub_skills/skills → .claude/commands}/ml-engineer.md +115 -115
  25. package/{tech_hub_skills/skills → .claude/commands}/mlops.md +187 -187
  26. package/{tech_hub_skills/skills → .claude/commands}/network-engineer.md +109 -109
  27. package/{tech_hub_skills/skills → .claude/commands}/optimization-advisor.md +329 -329
  28. package/{tech_hub_skills/skills → .claude/commands}/orchestrator.md +623 -623
  29. package/{tech_hub_skills/skills → .claude/commands}/platform-engineer.md +102 -102
  30. package/{tech_hub_skills/skills → .claude/commands}/process-automation.md +226 -226
  31. package/{tech_hub_skills/skills → .claude/commands}/process-changelog.md +184 -184
  32. package/{tech_hub_skills/skills → .claude/commands}/process-documentation.md +484 -484
  33. package/{tech_hub_skills/skills → .claude/commands}/process-kanban.md +324 -324
  34. package/{tech_hub_skills/skills → .claude/commands}/process-versioning.md +214 -214
  35. package/{tech_hub_skills/skills → .claude/commands}/product-designer.md +104 -104
  36. package/{tech_hub_skills/skills → .claude/commands}/project-starter.md +443 -443
  37. package/{tech_hub_skills/skills → .claude/commands}/qa-engineer.md +109 -109
  38. package/{tech_hub_skills/skills → .claude/commands}/security-architect.md +135 -135
  39. package/{tech_hub_skills/skills → .claude/commands}/sre.md +109 -109
  40. package/{tech_hub_skills/skills → .claude/commands}/system-design.md +126 -126
  41. package/{tech_hub_skills/skills → .claude/commands}/technical-writer.md +101 -101
  42. package/.claude/package.json +46 -0
  43. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
  44. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -0
  45. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -0
  46. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -0
  47. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
  48. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -0
  49. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -0
  50. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
  51. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
  52. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
  53. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
  54. package/{tech_hub_skills → .claude}/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
  55. package/{tech_hub_skills → .claude}/roles/azure/skills/02-data-factory/README.md +264 -264
  56. package/{tech_hub_skills → .claude}/roles/azure/skills/03-synapse-analytics/README.md +264 -264
  57. package/{tech_hub_skills → .claude}/roles/azure/skills/04-databricks/README.md +264 -264
  58. package/{tech_hub_skills → .claude}/roles/azure/skills/05-functions/README.md +264 -264
  59. package/{tech_hub_skills → .claude}/roles/azure/skills/06-kubernetes-service/README.md +264 -264
  60. package/{tech_hub_skills → .claude}/roles/azure/skills/07-openai-service/README.md +264 -264
  61. package/{tech_hub_skills → .claude}/roles/azure/skills/08-machine-learning/README.md +264 -264
  62. package/{tech_hub_skills → .claude}/roles/azure/skills/09-storage-adls/README.md +264 -264
  63. package/{tech_hub_skills → .claude}/roles/azure/skills/10-networking/README.md +264 -264
  64. package/{tech_hub_skills → .claude}/roles/azure/skills/11-sql-cosmos/README.md +264 -264
  65. package/{tech_hub_skills → .claude}/roles/azure/skills/12-event-hubs/README.md +264 -264
  66. package/{tech_hub_skills → .claude}/roles/code-review/skills/01-automated-code-review/README.md +394 -394
  67. package/{tech_hub_skills → .claude}/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
  68. package/{tech_hub_skills → .claude}/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
  69. package/{tech_hub_skills → .claude}/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
  70. package/{tech_hub_skills → .claude}/roles/code-review/skills/05-review-analytics/README.md +540 -540
  71. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
  72. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -0
  73. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -0
  74. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
  75. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/03-data-quality/README.md +579 -579
  76. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
  77. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
  78. package/{tech_hub_skills → .claude}/roles/data-governance/skills/01-data-catalog/README.md +112 -112
  79. package/{tech_hub_skills → .claude}/roles/data-governance/skills/02-data-lineage/README.md +129 -129
  80. package/{tech_hub_skills → .claude}/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
  81. package/{tech_hub_skills → .claude}/roles/data-governance/skills/04-access-control/README.md +39 -39
  82. package/{tech_hub_skills → .claude}/roles/data-governance/skills/05-master-data-management/README.md +40 -40
  83. package/{tech_hub_skills → .claude}/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
  84. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
  85. package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -0
  86. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
  87. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
  88. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
  89. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
  90. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
  91. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/07-experimentation/README.md +264 -264
  92. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
  93. package/{tech_hub_skills → .claude}/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
  94. package/{tech_hub_skills → .claude}/roles/devops/skills/02-container-orchestration/README.md +264 -264
  95. package/{tech_hub_skills → .claude}/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
  96. package/{tech_hub_skills → .claude}/roles/devops/skills/04-gitops/README.md +264 -264
  97. package/{tech_hub_skills → .claude}/roles/devops/skills/05-environment-management/README.md +264 -264
  98. package/{tech_hub_skills → .claude}/roles/devops/skills/06-automated-testing/README.md +264 -264
  99. package/{tech_hub_skills → .claude}/roles/devops/skills/07-release-management/README.md +264 -264
  100. package/{tech_hub_skills → .claude}/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
  101. package/{tech_hub_skills → .claude}/roles/devops/skills/09-devsecops/README.md +265 -265
  102. package/{tech_hub_skills → .claude}/roles/finops/skills/01-cost-visibility/README.md +264 -264
  103. package/{tech_hub_skills → .claude}/roles/finops/skills/02-resource-tagging/README.md +264 -264
  104. package/{tech_hub_skills → .claude}/roles/finops/skills/03-budget-management/README.md +264 -264
  105. package/{tech_hub_skills → .claude}/roles/finops/skills/04-reserved-instances/README.md +264 -264
  106. package/{tech_hub_skills → .claude}/roles/finops/skills/05-spot-optimization/README.md +264 -264
  107. package/{tech_hub_skills → .claude}/roles/finops/skills/06-storage-tiering/README.md +264 -264
  108. package/{tech_hub_skills → .claude}/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
  109. package/{tech_hub_skills → .claude}/roles/finops/skills/08-chargeback/README.md +264 -264
  110. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
  111. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
  112. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/03-model-training/README.md +704 -704
  113. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
  114. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
  115. package/{tech_hub_skills → .claude}/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
  116. package/{tech_hub_skills → .claude}/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
  117. package/{tech_hub_skills → .claude}/roles/mlops/skills/03-model-registry/README.md +264 -264
  118. package/{tech_hub_skills → .claude}/roles/mlops/skills/04-feature-store/README.md +264 -264
  119. package/{tech_hub_skills → .claude}/roles/mlops/skills/05-model-deployment/README.md +264 -264
  120. package/{tech_hub_skills → .claude}/roles/mlops/skills/06-model-observability/README.md +264 -264
  121. package/{tech_hub_skills → .claude}/roles/mlops/skills/07-data-versioning/README.md +264 -264
  122. package/{tech_hub_skills → .claude}/roles/mlops/skills/08-ab-testing/README.md +264 -264
  123. package/{tech_hub_skills → .claude}/roles/mlops/skills/09-automated-retraining/README.md +264 -264
  124. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
  125. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
  126. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
  127. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
  128. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
  129. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
  130. package/{tech_hub_skills → .claude}/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
  131. package/{tech_hub_skills → .claude}/roles/product-designer/skills/02-user-research/README.md +382 -382
  132. package/{tech_hub_skills → .claude}/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
  133. package/{tech_hub_skills → .claude}/roles/product-designer/skills/04-ux-design/README.md +496 -496
  134. package/{tech_hub_skills → .claude}/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
  135. package/{tech_hub_skills → .claude}/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
  136. package/{tech_hub_skills → .claude}/roles/security-architect/skills/01-pii-detection/README.md +319 -319
  137. package/{tech_hub_skills → .claude}/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
  138. package/{tech_hub_skills → .claude}/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
  139. package/{tech_hub_skills → .claude}/roles/security-architect/skills/04-iam/README.md +264 -264
  140. package/{tech_hub_skills → .claude}/roles/security-architect/skills/05-application-security/README.md +264 -264
  141. package/{tech_hub_skills → .claude}/roles/security-architect/skills/06-secrets-management/README.md +264 -264
  142. package/{tech_hub_skills → .claude}/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
  143. package/{tech_hub_skills → .claude}/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
  144. package/{tech_hub_skills → .claude}/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
  145. package/{tech_hub_skills → .claude}/roles/system-design/skills/03-scalability/README.md +264 -264
  146. package/{tech_hub_skills → .claude}/roles/system-design/skills/04-high-availability/README.md +264 -264
  147. package/{tech_hub_skills → .claude}/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
  148. package/{tech_hub_skills → .claude}/roles/system-design/skills/06-api-design/README.md +264 -264
  149. package/{tech_hub_skills → .claude}/roles/system-design/skills/07-observability-architecture/README.md +264 -264
  150. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
  151. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/README.md +521 -521
  152. package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -0
  153. package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -0
  154. package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -0
  155. package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -0
  156. package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -0
  157. package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -0
  158. package/.claude/skills/README.md +336 -0
  159. package/.claude/skills/ai-engineer.md +104 -0
  160. package/.claude/skills/aws.md +143 -0
  161. package/.claude/skills/azure.md +149 -0
  162. package/.claude/skills/backend-developer.md +108 -0
  163. package/.claude/skills/code-review.md +399 -0
  164. package/.claude/skills/compliance-automation.md +747 -0
  165. package/.claude/skills/compliance-officer.md +108 -0
  166. package/.claude/skills/data-engineer.md +113 -0
  167. package/.claude/skills/data-governance.md +102 -0
  168. package/.claude/skills/data-scientist.md +123 -0
  169. package/.claude/skills/database-admin.md +109 -0
  170. package/.claude/skills/devops.md +160 -0
  171. package/.claude/skills/docker.md +160 -0
  172. package/.claude/skills/enterprise-dashboard.md +613 -0
  173. package/.claude/skills/finops.md +184 -0
  174. package/.claude/skills/frontend-developer.md +108 -0
  175. package/.claude/skills/gcp.md +143 -0
  176. package/.claude/skills/ml-engineer.md +115 -0
  177. package/.claude/skills/mlops.md +187 -0
  178. package/.claude/skills/network-engineer.md +109 -0
  179. package/.claude/skills/optimization-advisor.md +329 -0
  180. package/.claude/skills/orchestrator.md +623 -0
  181. package/.claude/skills/platform-engineer.md +102 -0
  182. package/.claude/skills/process-automation.md +226 -0
  183. package/.claude/skills/process-changelog.md +184 -0
  184. package/.claude/skills/process-documentation.md +484 -0
  185. package/.claude/skills/process-kanban.md +324 -0
  186. package/.claude/skills/process-versioning.md +214 -0
  187. package/.claude/skills/product-designer.md +104 -0
  188. package/.claude/skills/project-starter.md +443 -0
  189. package/.claude/skills/qa-engineer.md +109 -0
  190. package/.claude/skills/security-architect.md +135 -0
  191. package/.claude/skills/sre.md +109 -0
  192. package/.claude/skills/system-design.md +126 -0
  193. package/.claude/skills/technical-writer.md +101 -0
  194. package/.gitattributes +2 -0
  195. package/GITHUB_COPILOT.md +106 -0
  196. package/README.md +192 -291
  197. package/package.json +16 -46
  198. package/bin/cli.js +0 -241
@@ -1,230 +1,230 @@
1
- # Skill 1: Automated Exploratory Data Analysis (EDA)
2
-
3
- ## 🎯 Overview
4
- Automated EDA with statistical profiling, visualization, and insight generation.
5
-
6
- ## 🔗 Connections
7
- - **Data Engineer**: Provides feedback on data quality issues (de-01, de-03)
8
- - **ML Engineer**: Identifies promising features for modeling (ml-01, ml-02)
9
- - **MLOps**: Experiment tracking for EDA findings (mo-01)
10
- - **AI Engineer**: Generates insights for LLM context (ai-02, ai-03)
11
- - **Security Architect**: PII detection in datasets (sa-01)
12
- - **FinOps**: Cost-effective analytics compute (fo-06)
13
- - **DevOps**: Automated reporting pipelines (do-01)
14
-
15
- ## 🛠️ Tools Included
16
-
17
- ### 1. `eda_generator.py`
18
- Automated EDA report generation with ydata-profiling.
19
-
20
- ### 2. `statistical_analyzer.py`
21
- Statistical tests, distributions, and correlations.
22
-
23
- ### 3. `visualization_suite.py`
24
- Interactive visualizations with Plotly.
25
-
26
- ### 4. `insight_extractor.py`
27
- Automated insight extraction and anomaly detection.
28
-
29
- ### 5. `eda_queries.sql`
30
- SQL templates for common analytical queries.
31
-
32
- ## 📊 Key Outputs
33
- - Automated profiling reports (HTML)
34
- - Statistical summaries
35
- - Correlation matrices
36
- - Distribution plots
37
- - Anomaly detection alerts
38
-
39
- ## 🚀 Quick Start
40
-
41
- ```python
42
- from eda_generator import EDAGenerator
43
-
44
- # Initialize
45
- eda = EDAGenerator()
46
-
47
- # Load data
48
- df = pd.read_csv("customer_data.csv")
49
-
50
- # Generate comprehensive report
51
- report = eda.generate_report(
52
- df=df,
53
- title="Customer Data Analysis",
54
- output_file="eda_report.html"
55
- )
56
-
57
- # Extract key insights
58
- insights = eda.extract_insights(df)
59
- print(insights)
60
- ```
61
-
62
- ## 📚 Best Practices
63
-
64
- ### Data Quality & Security (Cross-Role Integration)
65
-
66
- 1. **PII Detection Before Analysis**
67
- - Scan datasets for PII before profiling
68
- - Mask sensitive data in reports and visualizations
69
- - Track data lineage for compliance
70
- - Reference: Security Architect sa-01 (PII Detection)
71
-
72
- 2. **Data Quality Validation**
73
- - Validate schema before EDA
74
- - Check completeness, accuracy, consistency
75
- - Alert Data Engineering team on quality issues
76
- - Reference: Data Engineer de-03 (Data Quality)
77
-
78
- 3. **Automated Quality Feedback Loop**
79
- - Generate data quality scorecards
80
- - Feed insights back to data pipelines
81
- - Track quality improvements over time
82
- - Reference: Data Engineer de-01, de-03
83
-
84
- ### Cost Optimization (FinOps Integration)
85
-
86
- 4. **Optimize Compute for Analysis**
87
- - Use appropriate instance sizes for EDA workloads
88
- - Auto-shutdown notebooks when idle
89
- - Sample large datasets intelligently
90
- - Monitor analysis costs per project
91
- - Reference: FinOps fo-06 (Compute Optimization)
92
-
93
- 5. **Efficient Data Sampling**
94
- - Use stratified sampling for large datasets
95
- - Profile samples before full dataset analysis
96
- - Cache intermediate results
97
- - Minimize data movement and storage
98
- - Reference: FinOps fo-05, Data Engineer de-01
99
-
100
- ### MLOps Integration
101
-
102
- 6. **Track EDA Experiments**
103
- - Log EDA findings in MLflow/Azure ML
104
- - Version datasets used for analysis
105
- - Document feature engineering insights
106
- - Link EDA to downstream model experiments
107
- - Reference: MLOps mo-01 (Experiment Tracking)
108
-
109
- 7. **Feature Discovery Documentation**
110
- - Document promising features for ML
111
- - Track feature importance from EDA
112
- - Share insights with ML Engineering team
113
- - Maintain feature catalog
114
- - Reference: ML Engineer ml-02 (Feature Engineering)
115
-
116
- ### Automation & Deployment (DevOps Integration)
117
-
118
- 8. **Automated EDA Pipelines**
119
- - Schedule regular EDA reports for key datasets
120
- - Automate anomaly detection and alerting
121
- - Deploy EDA as part of data pipeline monitoring
122
- - Version control EDA scripts
123
- - Reference: DevOps do-01 (CI/CD), do-08 (Monitoring)
124
-
125
- 9. **Reproducible Analysis**
126
- - Use containerized environments
127
- - Pin package versions
128
- - Document analysis dependencies
129
- - Enable one-click report regeneration
130
- - Reference: DevOps do-03 (Containerization)
131
-
132
- ### AI Integration
133
-
134
- 10. **LLM-Powered Insights**
135
- - Use LLMs to generate narrative insights
136
- - Automate insight extraction from distributions
137
- - Create natural language data summaries
138
- - Reference: AI Engineer ai-01, ai-07
139
-
140
- ## 💰 Cost Optimization Examples
141
-
142
- ### Compute Cost Tracking
143
- ```python
144
- from eda_generator import EDAGenerator
145
- from finops_tracker import AnalyticsCostTracker
146
-
147
- cost_tracker = AnalyticsCostTracker()
148
-
149
- # Track EDA compute costs
150
- @cost_tracker.track_analysis_cost
151
- def run_eda(dataset_path: str):
152
- eda = EDAGenerator()
153
- df = pd.read_csv(dataset_path)
154
-
155
- # Smart sampling for large datasets
156
- if len(df) > 1_000_000:
157
- df = df.sample(n=100_000, random_state=42) # Cost savings
158
-
159
- report = eda.generate_report(df)
160
- return report
161
-
162
- # Cost report
163
- report = cost_tracker.monthly_report()
164
- print(f"Total EDA costs: ${report.total_cost:.2f}")
165
- print(f"Cost per analysis: ${report.avg_cost:.2f}")
166
- ```
167
-
168
- ## 🔒 Security Best Practices
169
-
170
- ### PII Masking in Reports
171
- ```python
172
- from pii_detector import PIIDetector
173
- from eda_generator import EDAGenerator
174
-
175
- detector = PIIDetector()
176
- eda = EDAGenerator()
177
-
178
- def secure_eda(df: pd.DataFrame):
179
- # Detect PII columns
180
- pii_columns = []
181
- for col in df.columns:
182
- sample = df[col].astype(str).sample(min(100, len(df)))
183
- if detector.contains_pii(sample.tolist()):
184
- pii_columns.append(col)
185
-
186
- # Mask PII before EDA
187
- df_masked = df.copy()
188
- for col in pii_columns:
189
- df_masked[col] = "***MASKED***"
190
-
191
- # Generate report on masked data
192
- report = eda.generate_report(
193
- df_masked,
194
- title="Customer Data Analysis (PII Masked)"
195
- )
196
-
197
- return report, pii_columns
198
- ```
199
-
200
- ## 🔄 Integration Workflow
201
-
202
- ### End-to-End EDA Pipeline
203
- ```
204
- 1. Data Ingestion (de-01)
205
-
206
- 2. PII Detection (sa-01)
207
-
208
- 3. Data Quality Check (de-03)
209
-
210
- 4. Automated EDA (ds-01)
211
-
212
- 5. Track Findings (mo-01)
213
-
214
- 6. Feature Discovery (ml-02)
215
-
216
- 7. Generate Insights (ai-07)
217
-
218
- 8. Share Report (Automated)
219
-
220
- 9. Monitor Costs (fo-06)
221
- ```
222
-
223
- ## 🎯 Quick Wins
224
-
225
- 1. **Automate PII detection** - Prevent compliance violations in reports
226
- 2. **Set up cost tracking** - Monitor analysis compute spending
227
- 3. **Enable auto-shutdown** - Stop idle notebooks to save costs
228
- 4. **Sample large datasets** - Faster EDA at lower cost
229
- 5. **Track EDA experiments** - Link insights to model performance
230
- 6. **Automate report generation** - Schedule weekly data profiling
1
+ # Skill 1: Automated Exploratory Data Analysis (EDA)
2
+
3
+ ## 🎯 Overview
4
+ Automated EDA with statistical profiling, visualization, and insight generation.
5
+
6
+ ## 🔗 Connections
7
+ - **Data Engineer**: Provides feedback on data quality issues (de-01, de-03)
8
+ - **ML Engineer**: Identifies promising features for modeling (ml-01, ml-02)
9
+ - **MLOps**: Experiment tracking for EDA findings (mo-01)
10
+ - **AI Engineer**: Generates insights for LLM context (ai-02, ai-03)
11
+ - **Security Architect**: PII detection in datasets (sa-01)
12
+ - **FinOps**: Cost-effective analytics compute (fo-06)
13
+ - **DevOps**: Automated reporting pipelines (do-01)
14
+
15
+ ## 🛠️ Tools Included
16
+
17
+ ### 1. `eda_generator.py`
18
+ Automated EDA report generation with ydata-profiling.
19
+
20
+ ### 2. `statistical_analyzer.py`
21
+ Statistical tests, distributions, and correlations.
22
+
23
+ ### 3. `visualization_suite.py`
24
+ Interactive visualizations with Plotly.
25
+
26
+ ### 4. `insight_extractor.py`
27
+ Automated insight extraction and anomaly detection.
28
+
29
+ ### 5. `eda_queries.sql`
30
+ SQL templates for common analytical queries.
31
+
32
+ ## 📊 Key Outputs
33
+ - Automated profiling reports (HTML)
34
+ - Statistical summaries
35
+ - Correlation matrices
36
+ - Distribution plots
37
+ - Anomaly detection alerts
38
+
39
+ ## 🚀 Quick Start
40
+
41
+ ```python
42
+ from eda_generator import EDAGenerator
43
+
44
+ # Initialize
45
+ eda = EDAGenerator()
46
+
47
+ # Load data
48
+ df = pd.read_csv("customer_data.csv")
49
+
50
+ # Generate comprehensive report
51
+ report = eda.generate_report(
52
+ df=df,
53
+ title="Customer Data Analysis",
54
+ output_file="eda_report.html"
55
+ )
56
+
57
+ # Extract key insights
58
+ insights = eda.extract_insights(df)
59
+ print(insights)
60
+ ```
61
+
62
+ ## 📚 Best Practices
63
+
64
+ ### Data Quality & Security (Cross-Role Integration)
65
+
66
+ 1. **PII Detection Before Analysis**
67
+ - Scan datasets for PII before profiling
68
+ - Mask sensitive data in reports and visualizations
69
+ - Track data lineage for compliance
70
+ - Reference: Security Architect sa-01 (PII Detection)
71
+
72
+ 2. **Data Quality Validation**
73
+ - Validate schema before EDA
74
+ - Check completeness, accuracy, consistency
75
+ - Alert Data Engineering team on quality issues
76
+ - Reference: Data Engineer de-03 (Data Quality)
77
+
78
+ 3. **Automated Quality Feedback Loop**
79
+ - Generate data quality scorecards
80
+ - Feed insights back to data pipelines
81
+ - Track quality improvements over time
82
+ - Reference: Data Engineer de-01, de-03
83
+
84
+ ### Cost Optimization (FinOps Integration)
85
+
86
+ 4. **Optimize Compute for Analysis**
87
+ - Use appropriate instance sizes for EDA workloads
88
+ - Auto-shutdown notebooks when idle
89
+ - Sample large datasets intelligently
90
+ - Monitor analysis costs per project
91
+ - Reference: FinOps fo-07 (Compute Rightsizing)
92
+
93
+ 5. **Efficient Data Sampling**
94
+ - Use stratified sampling for large datasets
95
+ - Profile samples before full dataset analysis
96
+ - Cache intermediate results
97
+ - Minimize data movement and storage
98
+ - Reference: FinOps fo-05, Data Engineer de-01
99
+
100
+ ### MLOps Integration
101
+
102
+ 6. **Track EDA Experiments**
103
+ - Log EDA findings in MLflow/Azure ML
104
+ - Version datasets used for analysis
105
+ - Document feature engineering insights
106
+ - Link EDA to downstream model experiments
107
+ - Reference: MLOps mo-02 (Experiment Tracking)
108
+
109
+ 7. **Feature Discovery Documentation**
110
+ - Document promising features for ML
111
+ - Track feature importance from EDA
112
+ - Share insights with ML Engineering team
113
+ - Maintain feature catalog
114
+ - Reference: ML Engineer ml-02 (Feature Engineering)
115
+
116
+ ### Automation & Deployment (DevOps Integration)
117
+
118
+ 8. **Automated EDA Pipelines**
119
+ - Schedule regular EDA reports for key datasets
120
+ - Automate anomaly detection and alerting
121
+ - Deploy EDA as part of data pipeline monitoring
122
+ - Version control EDA scripts
123
+ - Reference: DevOps do-01 (CI/CD), do-08 (Monitoring)
124
+
125
+ 9. **Reproducible Analysis**
126
+ - Use containerized environments
127
+ - Pin package versions
128
+ - Document analysis dependencies
129
+ - Enable one-click report regeneration
130
+ - Reference: DevOps do-02 (Container Orchestration)
131
+
132
+ ### AI Integration
133
+
134
+ 10. **LLM-Powered Insights**
135
+ - Use LLMs to generate narrative insights
136
+ - Automate insight extraction from distributions
137
+ - Create natural language data summaries
138
+ - Reference: AI Engineer ai-01, ai-07
139
+
140
+ ## 💰 Cost Optimization Examples
141
+
142
+ ### Compute Cost Tracking
143
+ ```python
144
+ from eda_generator import EDAGenerator
145
+ from finops_tracker import AnalyticsCostTracker
146
+
147
+ cost_tracker = AnalyticsCostTracker()
148
+
149
+ # Track EDA compute costs
150
+ @cost_tracker.track_analysis_cost
151
+ def run_eda(dataset_path: str):
152
+ eda = EDAGenerator()
153
+ df = pd.read_csv(dataset_path)
154
+
155
+ # Smart sampling for large datasets
156
+ if len(df) > 1_000_000:
157
+ df = df.sample(n=100_000, random_state=42) # Cost savings
158
+
159
+ report = eda.generate_report(df)
160
+ return report
161
+
162
+ # Cost report
163
+ report = cost_tracker.monthly_report()
164
+ print(f"Total EDA costs: ${report.total_cost:.2f}")
165
+ print(f"Cost per analysis: ${report.avg_cost:.2f}")
166
+ ```
167
+
168
+ ## 🔒 Security Best Practices
169
+
170
+ ### PII Masking in Reports
171
+ ```python
172
+ from pii_detector import PIIDetector
173
+ from eda_generator import EDAGenerator
174
+
175
+ detector = PIIDetector()
176
+ eda = EDAGenerator()
177
+
178
+ def secure_eda(df: pd.DataFrame):
179
+ # Detect PII columns
180
+ pii_columns = []
181
+ for col in df.columns:
182
+ sample = df[col].astype(str).sample(min(100, len(df)))
183
+ if detector.contains_pii(sample.tolist()):
184
+ pii_columns.append(col)
185
+
186
+ # Mask PII before EDA
187
+ df_masked = df.copy()
188
+ for col in pii_columns:
189
+ df_masked[col] = "***MASKED***"
190
+
191
+ # Generate report on masked data
192
+ report = eda.generate_report(
193
+ df_masked,
194
+ title="Customer Data Analysis (PII Masked)"
195
+ )
196
+
197
+ return report, pii_columns
198
+ ```
199
+
200
+ ## 🔄 Integration Workflow
201
+
202
+ ### End-to-End EDA Pipeline
203
+ ```
204
+ 1. Data Ingestion (de-01)
205
+
206
+ 2. PII Detection (sa-01)
207
+
208
+ 3. Data Quality Check (de-03)
209
+
210
+ 4. Automated EDA (ds-01)
211
+
212
+ 5. Track Findings (mo-01)
213
+
214
+ 6. Feature Discovery (ml-02)
215
+
216
+ 7. Generate Insights (ai-07)
217
+
218
+ 8. Share Report (Automated)
219
+
220
+ 9. Monitor Costs (fo-06)
221
+ ```
222
+
223
+ ## 🎯 Quick Wins
224
+
225
+ 1. **Automate PII detection** - Prevent compliance violations in reports
226
+ 2. **Set up cost tracking** - Monitor analysis compute spending
227
+ 3. **Enable auto-shutdown** - Stop idle notebooks to save costs
228
+ 4. **Sample large datasets** - Faster EDA at lower cost
229
+ 5. **Track EDA experiments** - Link insights to model performance
230
+ 6. **Automate report generation** - Schedule weekly data profiling