tech-hub-skills 1.2.0 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198) hide show
  1. package/{LICENSE → .claude/LICENSE} +21 -21
  2. package/.claude/README.md +291 -0
  3. package/.claude/bin/cli.js +266 -0
  4. package/{bin → .claude/bin}/copilot.js +182 -182
  5. package/{bin → .claude/bin}/postinstall.js +42 -42
  6. package/{tech_hub_skills/skills → .claude/commands}/README.md +336 -336
  7. package/{tech_hub_skills/skills → .claude/commands}/ai-engineer.md +104 -104
  8. package/{tech_hub_skills/skills → .claude/commands}/aws.md +143 -143
  9. package/{tech_hub_skills/skills → .claude/commands}/azure.md +149 -149
  10. package/{tech_hub_skills/skills → .claude/commands}/backend-developer.md +108 -108
  11. package/{tech_hub_skills/skills → .claude/commands}/code-review.md +399 -399
  12. package/{tech_hub_skills/skills → .claude/commands}/compliance-automation.md +747 -747
  13. package/{tech_hub_skills/skills → .claude/commands}/compliance-officer.md +108 -108
  14. package/{tech_hub_skills/skills → .claude/commands}/data-engineer.md +113 -113
  15. package/{tech_hub_skills/skills → .claude/commands}/data-governance.md +102 -102
  16. package/{tech_hub_skills/skills → .claude/commands}/data-scientist.md +123 -123
  17. package/{tech_hub_skills/skills → .claude/commands}/database-admin.md +109 -109
  18. package/{tech_hub_skills/skills → .claude/commands}/devops.md +160 -160
  19. package/{tech_hub_skills/skills → .claude/commands}/docker.md +160 -160
  20. package/{tech_hub_skills/skills → .claude/commands}/enterprise-dashboard.md +613 -613
  21. package/{tech_hub_skills/skills → .claude/commands}/finops.md +184 -184
  22. package/{tech_hub_skills/skills → .claude/commands}/frontend-developer.md +108 -108
  23. package/{tech_hub_skills/skills → .claude/commands}/gcp.md +143 -143
  24. package/{tech_hub_skills/skills → .claude/commands}/ml-engineer.md +115 -115
  25. package/{tech_hub_skills/skills → .claude/commands}/mlops.md +187 -187
  26. package/{tech_hub_skills/skills → .claude/commands}/network-engineer.md +109 -109
  27. package/{tech_hub_skills/skills → .claude/commands}/optimization-advisor.md +329 -329
  28. package/{tech_hub_skills/skills → .claude/commands}/orchestrator.md +623 -623
  29. package/{tech_hub_skills/skills → .claude/commands}/platform-engineer.md +102 -102
  30. package/{tech_hub_skills/skills → .claude/commands}/process-automation.md +226 -226
  31. package/{tech_hub_skills/skills → .claude/commands}/process-changelog.md +184 -184
  32. package/{tech_hub_skills/skills → .claude/commands}/process-documentation.md +484 -484
  33. package/{tech_hub_skills/skills → .claude/commands}/process-kanban.md +324 -324
  34. package/{tech_hub_skills/skills → .claude/commands}/process-versioning.md +214 -214
  35. package/{tech_hub_skills/skills → .claude/commands}/product-designer.md +104 -104
  36. package/{tech_hub_skills/skills → .claude/commands}/project-starter.md +443 -443
  37. package/{tech_hub_skills/skills → .claude/commands}/qa-engineer.md +109 -109
  38. package/{tech_hub_skills/skills → .claude/commands}/security-architect.md +135 -135
  39. package/{tech_hub_skills/skills → .claude/commands}/sre.md +109 -109
  40. package/{tech_hub_skills/skills → .claude/commands}/system-design.md +126 -126
  41. package/{tech_hub_skills/skills → .claude/commands}/technical-writer.md +101 -101
  42. package/.claude/package.json +46 -0
  43. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
  44. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -0
  45. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -0
  46. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -0
  47. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
  48. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -0
  49. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -0
  50. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
  51. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
  52. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
  53. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
  54. package/{tech_hub_skills → .claude}/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
  55. package/{tech_hub_skills → .claude}/roles/azure/skills/02-data-factory/README.md +264 -264
  56. package/{tech_hub_skills → .claude}/roles/azure/skills/03-synapse-analytics/README.md +264 -264
  57. package/{tech_hub_skills → .claude}/roles/azure/skills/04-databricks/README.md +264 -264
  58. package/{tech_hub_skills → .claude}/roles/azure/skills/05-functions/README.md +264 -264
  59. package/{tech_hub_skills → .claude}/roles/azure/skills/06-kubernetes-service/README.md +264 -264
  60. package/{tech_hub_skills → .claude}/roles/azure/skills/07-openai-service/README.md +264 -264
  61. package/{tech_hub_skills → .claude}/roles/azure/skills/08-machine-learning/README.md +264 -264
  62. package/{tech_hub_skills → .claude}/roles/azure/skills/09-storage-adls/README.md +264 -264
  63. package/{tech_hub_skills → .claude}/roles/azure/skills/10-networking/README.md +264 -264
  64. package/{tech_hub_skills → .claude}/roles/azure/skills/11-sql-cosmos/README.md +264 -264
  65. package/{tech_hub_skills → .claude}/roles/azure/skills/12-event-hubs/README.md +264 -264
  66. package/{tech_hub_skills → .claude}/roles/code-review/skills/01-automated-code-review/README.md +394 -394
  67. package/{tech_hub_skills → .claude}/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
  68. package/{tech_hub_skills → .claude}/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
  69. package/{tech_hub_skills → .claude}/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
  70. package/{tech_hub_skills → .claude}/roles/code-review/skills/05-review-analytics/README.md +540 -540
  71. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
  72. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -0
  73. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -0
  74. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
  75. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/03-data-quality/README.md +579 -579
  76. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
  77. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
  78. package/{tech_hub_skills → .claude}/roles/data-governance/skills/01-data-catalog/README.md +112 -112
  79. package/{tech_hub_skills → .claude}/roles/data-governance/skills/02-data-lineage/README.md +129 -129
  80. package/{tech_hub_skills → .claude}/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
  81. package/{tech_hub_skills → .claude}/roles/data-governance/skills/04-access-control/README.md +39 -39
  82. package/{tech_hub_skills → .claude}/roles/data-governance/skills/05-master-data-management/README.md +40 -40
  83. package/{tech_hub_skills → .claude}/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
  84. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
  85. package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -0
  86. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
  87. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
  88. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
  89. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
  90. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
  91. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/07-experimentation/README.md +264 -264
  92. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
  93. package/{tech_hub_skills → .claude}/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
  94. package/{tech_hub_skills → .claude}/roles/devops/skills/02-container-orchestration/README.md +264 -264
  95. package/{tech_hub_skills → .claude}/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
  96. package/{tech_hub_skills → .claude}/roles/devops/skills/04-gitops/README.md +264 -264
  97. package/{tech_hub_skills → .claude}/roles/devops/skills/05-environment-management/README.md +264 -264
  98. package/{tech_hub_skills → .claude}/roles/devops/skills/06-automated-testing/README.md +264 -264
  99. package/{tech_hub_skills → .claude}/roles/devops/skills/07-release-management/README.md +264 -264
  100. package/{tech_hub_skills → .claude}/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
  101. package/{tech_hub_skills → .claude}/roles/devops/skills/09-devsecops/README.md +265 -265
  102. package/{tech_hub_skills → .claude}/roles/finops/skills/01-cost-visibility/README.md +264 -264
  103. package/{tech_hub_skills → .claude}/roles/finops/skills/02-resource-tagging/README.md +264 -264
  104. package/{tech_hub_skills → .claude}/roles/finops/skills/03-budget-management/README.md +264 -264
  105. package/{tech_hub_skills → .claude}/roles/finops/skills/04-reserved-instances/README.md +264 -264
  106. package/{tech_hub_skills → .claude}/roles/finops/skills/05-spot-optimization/README.md +264 -264
  107. package/{tech_hub_skills → .claude}/roles/finops/skills/06-storage-tiering/README.md +264 -264
  108. package/{tech_hub_skills → .claude}/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
  109. package/{tech_hub_skills → .claude}/roles/finops/skills/08-chargeback/README.md +264 -264
  110. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
  111. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
  112. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/03-model-training/README.md +704 -704
  113. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
  114. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
  115. package/{tech_hub_skills → .claude}/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
  116. package/{tech_hub_skills → .claude}/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
  117. package/{tech_hub_skills → .claude}/roles/mlops/skills/03-model-registry/README.md +264 -264
  118. package/{tech_hub_skills → .claude}/roles/mlops/skills/04-feature-store/README.md +264 -264
  119. package/{tech_hub_skills → .claude}/roles/mlops/skills/05-model-deployment/README.md +264 -264
  120. package/{tech_hub_skills → .claude}/roles/mlops/skills/06-model-observability/README.md +264 -264
  121. package/{tech_hub_skills → .claude}/roles/mlops/skills/07-data-versioning/README.md +264 -264
  122. package/{tech_hub_skills → .claude}/roles/mlops/skills/08-ab-testing/README.md +264 -264
  123. package/{tech_hub_skills → .claude}/roles/mlops/skills/09-automated-retraining/README.md +264 -264
  124. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
  125. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
  126. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
  127. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
  128. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
  129. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
  130. package/{tech_hub_skills → .claude}/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
  131. package/{tech_hub_skills → .claude}/roles/product-designer/skills/02-user-research/README.md +382 -382
  132. package/{tech_hub_skills → .claude}/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
  133. package/{tech_hub_skills → .claude}/roles/product-designer/skills/04-ux-design/README.md +496 -496
  134. package/{tech_hub_skills → .claude}/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
  135. package/{tech_hub_skills → .claude}/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
  136. package/{tech_hub_skills → .claude}/roles/security-architect/skills/01-pii-detection/README.md +319 -319
  137. package/{tech_hub_skills → .claude}/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
  138. package/{tech_hub_skills → .claude}/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
  139. package/{tech_hub_skills → .claude}/roles/security-architect/skills/04-iam/README.md +264 -264
  140. package/{tech_hub_skills → .claude}/roles/security-architect/skills/05-application-security/README.md +264 -264
  141. package/{tech_hub_skills → .claude}/roles/security-architect/skills/06-secrets-management/README.md +264 -264
  142. package/{tech_hub_skills → .claude}/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
  143. package/{tech_hub_skills → .claude}/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
  144. package/{tech_hub_skills → .claude}/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
  145. package/{tech_hub_skills → .claude}/roles/system-design/skills/03-scalability/README.md +264 -264
  146. package/{tech_hub_skills → .claude}/roles/system-design/skills/04-high-availability/README.md +264 -264
  147. package/{tech_hub_skills → .claude}/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
  148. package/{tech_hub_skills → .claude}/roles/system-design/skills/06-api-design/README.md +264 -264
  149. package/{tech_hub_skills → .claude}/roles/system-design/skills/07-observability-architecture/README.md +264 -264
  150. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
  151. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/README.md +521 -521
  152. package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -0
  153. package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -0
  154. package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -0
  155. package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -0
  156. package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -0
  157. package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -0
  158. package/.claude/skills/README.md +336 -0
  159. package/.claude/skills/ai-engineer.md +104 -0
  160. package/.claude/skills/aws.md +143 -0
  161. package/.claude/skills/azure.md +149 -0
  162. package/.claude/skills/backend-developer.md +108 -0
  163. package/.claude/skills/code-review.md +399 -0
  164. package/.claude/skills/compliance-automation.md +747 -0
  165. package/.claude/skills/compliance-officer.md +108 -0
  166. package/.claude/skills/data-engineer.md +113 -0
  167. package/.claude/skills/data-governance.md +102 -0
  168. package/.claude/skills/data-scientist.md +123 -0
  169. package/.claude/skills/database-admin.md +109 -0
  170. package/.claude/skills/devops.md +160 -0
  171. package/.claude/skills/docker.md +160 -0
  172. package/.claude/skills/enterprise-dashboard.md +613 -0
  173. package/.claude/skills/finops.md +184 -0
  174. package/.claude/skills/frontend-developer.md +108 -0
  175. package/.claude/skills/gcp.md +143 -0
  176. package/.claude/skills/ml-engineer.md +115 -0
  177. package/.claude/skills/mlops.md +187 -0
  178. package/.claude/skills/network-engineer.md +109 -0
  179. package/.claude/skills/optimization-advisor.md +329 -0
  180. package/.claude/skills/orchestrator.md +623 -0
  181. package/.claude/skills/platform-engineer.md +102 -0
  182. package/.claude/skills/process-automation.md +226 -0
  183. package/.claude/skills/process-changelog.md +184 -0
  184. package/.claude/skills/process-documentation.md +484 -0
  185. package/.claude/skills/process-kanban.md +324 -0
  186. package/.claude/skills/process-versioning.md +214 -0
  187. package/.claude/skills/product-designer.md +104 -0
  188. package/.claude/skills/project-starter.md +443 -0
  189. package/.claude/skills/qa-engineer.md +109 -0
  190. package/.claude/skills/security-architect.md +135 -0
  191. package/.claude/skills/sre.md +109 -0
  192. package/.claude/skills/system-design.md +126 -0
  193. package/.claude/skills/technical-writer.md +101 -0
  194. package/.gitattributes +2 -0
  195. package/GITHUB_COPILOT.md +106 -0
  196. package/README.md +192 -291
  197. package/package.json +16 -46
  198. package/bin/cli.js +0 -241
@@ -1,566 +1,566 @@
1
- # Skill 1: MLOps Pipeline Automation
2
-
3
- ## 🎯 Overview
4
- Build end-to-end MLOps pipelines with automated training, versioning, and deployment.
5
-
6
- ## 🔗 Connections
7
- - **Data Engineer**: Consumes data from feature pipelines (de-01, de-02, de-03)
8
- - **AI Engineer**: Serves models for agent systems (ai-03, ai-07)
9
- - **Data Scientist**: Promotes experiments to production (ds-01, ds-02, ds-08)
10
- - **Security Architect**: Ensures model provenance and audit trails (sa-02, sa-06, sa-08)
11
- - **MLOps**: Full lifecycle management (mo-01 through mo-08)
12
- - **FinOps**: ML training and serving cost optimization (fo-01, fo-06, fo-07)
13
- - **DevOps**: CI/CD for ML pipelines, container orchestration (do-01, do-03, do-08)
14
- - **System Design**: Scalability and performance patterns (sd-03, sd-05, sd-07)
15
-
16
- ## 🛠️ Tools Included
17
-
18
- ### 1. `ml_pipeline_orchestrator.py`
19
- End-to-end ML pipeline with Kedro/ZenML patterns.
20
-
21
- ### 2. `model_registry_manager.py`
22
- MLflow model registry with lifecycle management.
23
-
24
- ### 3. `experiment_tracker.py`
25
- Comprehensive experiment tracking with metrics, params, artifacts.
26
-
27
- ### 4. `ci_cd_ml_pipeline.py`
28
- CI/CD automation for ML workflows.
29
-
30
- ### 5. `mlops_config.yaml`
31
- Configuration templates for MLOps infrastructure.
32
-
33
- ## 📊 Pipeline Stages
34
-
35
- ```
36
- Data Validation → Feature Engineering → Model Training →
37
- Evaluation → Registration → Deployment → Monitoring
38
- ```
39
-
40
- ## 🚀 Quick Start
41
-
42
- ```python
43
- from ml_pipeline_orchestrator import MLPipeline
44
-
45
- # Define pipeline
46
- pipeline = MLPipeline(
47
- name="customer_churn_predictor",
48
- experiment_name="churn_model_v2"
49
- )
50
-
51
- # Run training
52
- pipeline.train(
53
- data_path="gold.customer_features",
54
- model_type="xgboost",
55
- hyperparams={"max_depth": 6, "n_estimators": 100}
56
- )
57
-
58
- # Promote to production
59
- pipeline.promote_to_production(
60
- stage="Production",
61
- archive_existing=True
62
- )
63
- ```
64
-
65
- ## 📚 Best Practices
66
-
67
- ### ML Training Cost Optimization (FinOps Integration)
68
-
69
- 1. **Compute Cost Optimization**
70
- - Use spot/preemptible instances for training (60-90% savings)
71
- - Right-size compute based on model requirements
72
- - Auto-scale training clusters
73
- - Schedule training during off-peak hours
74
- - Reference: FinOps fo-06 (Compute Optimization), fo-07 (AI/ML Cost)
75
-
76
- 2. **Track Training Costs**
77
- - Log compute costs per experiment
78
- - Track GPU utilization and cost efficiency
79
- - Monitor training time vs accuracy trade-offs
80
- - Set budget alerts for long-running experiments
81
- - Reference: FinOps fo-01 (Cost Monitoring), fo-03 (Budget Management)
82
-
83
- 3. **Optimize Hyperparameter Tuning**
84
- - Use early stopping to prevent wasteful runs
85
- - Implement intelligent search (Bayesian vs grid search)
86
- - Parallelize trials efficiently
87
- - Track cost per trial
88
- - Reference: ML Engineer best practices, FinOps fo-07
89
-
90
- 4. **Model Serving Cost Optimization**
91
- - Use auto-scaling for inference endpoints
92
- - Implement model caching for frequent predictions
93
- - Batch predictions when possible
94
- - Use smaller/distilled models for cost-sensitive applications
95
- - Monitor inference costs per request
96
- - Reference: FinOps fo-06, fo-07
97
-
98
- 5. **Storage Cost Optimization**
99
- - Compress model artifacts and datasets
100
- - Implement lifecycle policies for experiments
101
- - Archive old model versions to cold storage
102
- - Monitor artifact storage costs
103
- - Reference: FinOps fo-05 (Storage Optimization)
104
-
105
- ### DevOps Integration for ML
106
-
107
- 6. **CI/CD for ML Pipelines**
108
- - Automate model training on code changes
109
- - Run model validation tests before deployment
110
- - Implement canary deployments for models
111
- - Automate rollback on quality degradation
112
- - Reference: DevOps do-01 (CI/CD), do-06 (Deployment Strategies)
113
-
114
- 7. **Containerization**
115
- - Package models in containers for portability
116
- - Use multi-stage builds to minimize image size
117
- - Implement health checks for model endpoints
118
- - Deploy to AKS for production serving
119
- - Reference: DevOps do-03 (Containerization)
120
-
121
- 8. **Infrastructure as Code for ML**
122
- - Deploy ML infrastructure with Terraform
123
- - Version control all infrastructure
124
- - Automate environment provisioning
125
- - Implement disaster recovery
126
- - Reference: DevOps do-04 (IaC)
127
-
128
- 9. **Monitoring & Observability**
129
- - Instrument pipelines with OpenTelemetry
130
- - Track model performance metrics in production
131
- - Set up alerts for model drift and degradation
132
- - Monitor inference latency and throughput
133
- - Reference: DevOps do-08 (Monitoring), MLOps mo-04
134
-
135
- ### Model Lifecycle Management (MLOps Integration)
136
-
137
- 10. **Experiment Tracking**
138
- - Track all experiments with MLflow/Azure ML
139
- - Log hyperparameters, metrics, and artifacts
140
- - Compare experiment results systematically
141
- - Version datasets alongside experiments
142
- - Reference: MLOps mo-01 (Experiment Tracking)
143
-
144
- 11. **Model Versioning & Registry**
145
- - Register all production models
146
- - Track model lineage (data + code + config)
147
- - Implement model approval workflows
148
- - Version control model configurations
149
- - Reference: MLOps mo-03 (Model Versioning)
150
-
151
- 12. **Feature Engineering & Feature Store**
152
- - Centralize features in a feature store
153
- - Track feature versions and lineage
154
- - Monitor feature drift
155
- - Reuse features across models
156
- - Reference: ML Engineer ml-02, MLOps mo-02
157
-
158
- 13. **Model Monitoring**
159
- - Monitor model performance in production
160
- - Detect data drift and concept drift
161
- - Track prediction distribution shifts
162
- - Set up automated retraining triggers
163
- - Reference: MLOps mo-04 (Monitoring), mo-05 (Drift Detection)
164
-
165
- ### Security & Compliance
166
-
167
- 14. **Model Security**
168
- - Scan model dependencies for vulnerabilities
169
- - Implement model access controls
170
- - Encrypt model artifacts at rest
171
- - Audit model predictions for compliance
172
- - Reference: Security Architect sa-02 (IAM), sa-08 (LLM Security)
173
-
174
- 15. **Data Privacy in Training**
175
- - Remove PII before training
176
- - Implement differential privacy where needed
177
- - Track data usage for compliance
178
- - Document data sources and lineage
179
- - Reference: Security Architect sa-01 (PII Detection), sa-06 (Governance)
180
-
181
- 16. **Model Provenance**
182
- - Track complete model lineage
183
- - Document training data sources
184
- - Version all training code and configs
185
- - Maintain audit logs for model decisions
186
- - Reference: MLOps mo-06 (Lineage), Security Architect sa-06
187
-
188
- ### Azure-Specific Best Practices
189
-
190
- 17. **Azure Machine Learning**
191
- - Use managed compute clusters
192
- - Enable auto-scaling for training and inference
193
- - Implement managed endpoints for serving
194
- - Use Azure ML Pipelines for orchestration
195
- - Reference: Azure az-04 (AI/ML Services)
196
-
197
- 18. **Cost Management in Azure ML**
198
- - Use low-priority compute for training
199
- - Enable compute instance auto-shutdown
200
- - Monitor compute utilization
201
- - Set spending limits per workspace
202
- - Reference: Azure az-04, FinOps fo-06
203
-
204
- ### Production Best Practices
205
-
206
- 19. **A/B Testing for Models**
207
- - Deploy multiple model versions
208
- - Route traffic based on experiment design
209
- - Track statistical significance
210
- - Automated winner selection
211
- - Reference: Data Scientist ds-08 (Experimentation)
212
-
213
- 20. **Model Performance Optimization**
214
- - Optimize model inference latency
215
- - Implement model quantization
216
- - Use ONNX for cross-platform deployment
217
- - Batch predictions for throughput
218
- - Reference: ML Engineer best practices
219
-
220
- ## 💰 Cost Optimization Examples
221
-
222
- ### Training Cost Tracking
223
- ```python
224
- from ml_pipeline_orchestrator import MLPipeline
225
- from finops_tracker import MLCostTracker
226
-
227
- cost_tracker = MLCostTracker()
228
-
229
- # Track training costs
230
- @cost_tracker.track_training_cost
231
- def train_model(config: dict):
232
- pipeline = MLPipeline(
233
- name="customer_churn_predictor",
234
- compute_target="spot-gpu-cluster", # Use spot instances
235
- auto_scale=True,
236
- max_nodes=4
237
- )
238
-
239
- # Track costs per experiment
240
- with cost_tracker.experiment_context("churn_v2"):
241
- pipeline.train(
242
- data_path="gold.customer_features",
243
- model_type="xgboost",
244
- hyperparams=config
245
- )
246
-
247
- # Generate cost report
248
- report = cost_tracker.generate_training_report(period="monthly")
249
- print(f"Total training costs: ${report.total_cost:.2f}")
250
- print(f"Cost per experiment: ${report.avg_cost_per_experiment:.2f}")
251
- print(f"Savings from spot instances: ${report.spot_savings:.2f}")
252
- print(f"Most expensive experiments: {report.top_experiments}")
253
-
254
- # Set budget alerts
255
- cost_tracker.set_budget_alert(
256
- experiment_name="churn_v2",
257
- budget_per_run=50.00,
258
- monthly_budget=500.00
259
- )
260
- ```
261
-
262
- ### Spot Instance Training (60-90% Savings)
263
- ```python
264
- from azure.ai.ml import command, Input
265
- from azure.ai.ml.entities import AmlCompute
266
-
267
- # Create spot compute cluster
268
- compute_config = AmlCompute(
269
- name="spot-training-cluster",
270
- size="Standard_NC6s_v3", # GPU instance
271
- min_instances=0,
272
- max_instances=4,
273
- tier="LowPriority", # Spot instances!
274
- idle_time_before_scale_down=300
275
- )
276
-
277
- # Submit training job with checkpointing
278
- job = command(
279
- code="./src",
280
- command="python train.py --checkpoint-freq 100", # Save checkpoints
281
- environment="azureml:training-env:1",
282
- compute="spot-training-cluster",
283
- inputs={
284
- "data": Input(path="azureml://datasets/customer_features/labels/latest")
285
- }
286
- )
287
-
288
- # Job will automatically resume from checkpoint if preempted
289
- ml_client.jobs.create_or_update(job)
290
- ```
291
-
292
- ### Model Serving Cost Optimization
293
- ```python
294
- from model_registry_manager import ModelRegistry
295
- from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment
296
-
297
- registry = ModelRegistry()
298
-
299
- # Deploy with auto-scaling
300
- endpoint = ManagedOnlineEndpoint(
301
- name="churn-prediction",
302
- auth_mode="key"
303
- )
304
-
305
- deployment = ManagedOnlineDeployment(
306
- name="churn-v2",
307
- endpoint_name="churn-prediction",
308
- model=registry.get_model("churn_predictor:v2"),
309
- instance_type="Standard_DS2_v2",
310
- instance_count=1,
311
- # Auto-scale based on load
312
- scale_settings={
313
- "scale_type": "target_utilization",
314
- "min_instances": 1,
315
- "max_instances": 5,
316
- "target_utilization_percentage": 70
317
- },
318
- # Request timeout for cost control
319
- request_timeout_ms=5000
320
- )
321
-
322
- # Monitor serving costs
323
- from finops_tracker import InferenceCostTracker
324
- inference_tracker = InferenceCostTracker()
325
-
326
- @inference_tracker.track_inference_cost
327
- def predict(data):
328
- return endpoint.invoke(data)
329
-
330
- # Cost report
331
- report = inference_tracker.generate_inference_report()
332
- print(f"Cost per 1000 predictions: ${report.cost_per_1k:.4f}")
333
- print(f"Monthly serving costs: ${report.monthly_cost:.2f}")
334
- ```
335
-
336
- ### Hyperparameter Tuning Cost Optimization
337
- ```python
338
- from azure.ai.ml.sweep import Choice, Uniform, BayesianSamplingAlgorithm
339
- from ml_cost_optimizer import EarlyStoppingPolicy
340
-
341
- # Use Bayesian optimization (more efficient than grid search)
342
- sweep_job = command_job.sweep(
343
- sampling_algorithm=BayesianSamplingAlgorithm(), # Smart search
344
- primary_metric="accuracy",
345
- goal="maximize",
346
- max_total_trials=20, # Limit trials
347
- max_concurrent_trials=4,
348
- early_termination_policy=EarlyStoppingPolicy(
349
- evaluation_interval=1,
350
- delay_evaluation=5,
351
- slack_factor=0.1 # Stop if 10% worse than best
352
- )
353
- )
354
-
355
- # Track tuning costs
356
- cost_tracker.track_sweep_cost(sweep_job)
357
- ```
358
-
359
- ## 🚀 CI/CD for ML Pipelines
360
-
361
- ### Automated ML Pipeline
362
- ```yaml
363
- # .github/workflows/ml-pipeline.yml
364
- name: ML Pipeline CI/CD
365
-
366
- on:
367
- push:
368
- paths:
369
- - 'models/**'
370
- - 'pipelines/**'
371
- branches:
372
- - main
373
- schedule:
374
- - cron: '0 2 * * 0' # Weekly retraining
375
-
376
- jobs:
377
- train-and-deploy:
378
- runs-on: ubuntu-latest
379
- steps:
380
- - uses: actions/checkout@v3
381
-
382
- - name: Azure Login
383
- uses: azure/login@v1
384
- with:
385
- creds: ${{ secrets.AZURE_CREDENTIALS }}
386
-
387
- - name: Run unit tests
388
- run: pytest tests/unit/
389
-
390
- - name: Validate data quality
391
- run: python scripts/validate_training_data.py
392
-
393
- - name: Train model (spot instances)
394
- run: |
395
- python pipelines/train.py \
396
- --compute-type spot \
397
- --max-cost 100.00 \
398
- --early-stopping true
399
-
400
- - name: Evaluate model
401
- run: |
402
- python pipelines/evaluate.py \
403
- --min-accuracy 0.85 \
404
- --min-auc 0.90
405
-
406
- - name: Register model
407
- if: success()
408
- run: python scripts/register_model.py
409
-
410
- - name: Deploy to staging
411
- run: |
412
- python scripts/deploy_model.py \
413
- --environment staging \
414
- --traffic-percent 100
415
-
416
- - name: Run integration tests
417
- run: pytest tests/integration/
418
-
419
- - name: Deploy to production (canary)
420
- if: success()
421
- run: |
422
- python scripts/deploy_model.py \
423
- --environment production \
424
- --strategy canary \
425
- --traffic-percent 10
426
-
427
- - name: Monitor model performance
428
- run: python scripts/monitor_model.py --duration 24h
429
-
430
- - name: Promote canary to full
431
- if: success()
432
- run: python scripts/promote_deployment.py --traffic-percent 100
433
-
434
- - name: Generate cost report
435
- run: python scripts/ml_cost_report.py
436
- ```
437
-
438
- ### Infrastructure as Code for ML
439
- ```hcl
440
- # ml-infrastructure.tf
441
- module "ml_workspace" {
442
- source = "./modules/azure-ml"
443
-
444
- resource_group = "rg-ml-prod"
445
- location = "eastus"
446
-
447
- workspace = {
448
- name = "ml-workspace-prod"
449
- sku = "Basic"
450
- public_network_access = false
451
- }
452
-
453
- compute_clusters = [
454
- {
455
- name = "cpu-cluster"
456
- vm_size = "Standard_D4s_v3"
457
- min_nodes = 0
458
- max_nodes = 4
459
- idle_time = 300
460
- tier = "Dedicated"
461
- },
462
- {
463
- name = "gpu-spot-cluster"
464
- vm_size = "Standard_NC6s_v3"
465
- min_nodes = 0
466
- max_nodes = 4
467
- idle_time = 120
468
- tier = "LowPriority" # 60-90% cost savings
469
- }
470
- ]
471
-
472
- endpoints = [
473
- {
474
- name = "churn-prediction"
475
- traffic_rules = {
476
- "blue" = 90 # Current production
477
- "green" = 10 # Canary deployment
478
- }
479
- auto_scale = {
480
- min_instances = 2
481
- max_instances = 10
482
- target_cpu = 70
483
- }
484
- }
485
- ]
486
-
487
- cost_management = {
488
- monthly_budget = 5000.00
489
- alert_threshold = 0.8
490
- auto_shutdown_enabled = true
491
- auto_shutdown_time = "19:00"
492
- }
493
-
494
- tags = {
495
- Environment = "Production"
496
- CostCenter = "ML-Platform"
497
- Owner = "MLOps-Team"
498
- }
499
- }
500
- ```
501
-
502
- ## 📊 Enhanced Metrics & Monitoring
503
-
504
- | Metric Category | Metric | Target | Tool |
505
- |-----------------|--------|--------|------|
506
- | **Training Costs** | Cost per experiment | <$50 | FinOps tracker |
507
- | | Monthly training budget | <$5000 | Azure Cost Management |
508
- | | Spot instance savings | >70% | Cost tracker |
509
- | | GPU utilization | >80% | Azure Monitor |
510
- | **Serving Costs** | Cost per 1000 predictions | <$0.10 | Inference tracker |
511
- | | Monthly serving costs | <$2000 | FinOps dashboard |
512
- | | Auto-scaling efficiency | >70% | Azure Monitor |
513
- | **Model Performance** | Production accuracy | >0.85 | MLflow |
514
- | | Prediction latency (p95) | <100ms | App Insights |
515
- | | Model drift score | <0.1 | Drift monitor |
516
- | **Pipeline Reliability** | Training success rate | >95% | Azure ML |
517
- | | Deployment success rate | >99% | DevOps metrics |
518
- | | Rollback frequency | <2/month | Deployment logs |
519
- | **Data Quality** | Feature freshness | <1 hour | Data quality checks |
520
- | | Training data completeness | >99% | Validation tests |
521
-
522
- ## 🔄 Integration Workflow
523
-
524
- ### End-to-End MLOps Pipeline
525
- ```
526
- 1. Feature Engineering (de-01, ml-02)
527
-
528
- 2. Data Validation (de-03)
529
-
530
- 3. PII Removal (sa-01)
531
-
532
- 4. Experiment Tracking (mo-01)
533
-
534
- 5. Model Training with Cost Tracking (ml-01, fo-07)
535
-
536
- 6. Model Evaluation (ml-01, ds-08)
537
-
538
- 7. Model Registry (mo-03)
539
-
540
- 8. Security Scan (sa-08)
541
-
542
- 9. CI/CD Deployment (do-01)
543
-
544
- 10. Canary Deployment (do-06)
545
-
546
- 11. Production Monitoring (mo-04, do-08)
547
-
548
- 12. Drift Detection (mo-05)
549
-
550
- 13. Cost Optimization (fo-01, fo-06, fo-07)
551
-
552
- 14. Automated Retraining (ml-01)
553
- ```
554
-
555
- ## 🎯 Quick Wins
556
-
557
- 1. **Use spot instances for training** - 60-90% compute cost savings
558
- 2. **Enable auto-scaling for inference** - 30-50% serving cost reduction
559
- 3. **Implement early stopping** - Reduce wasteful hyperparameter trials
560
- 4. **Set up experiment tracking** - Compare models systematically
561
- 5. **Automate model deployment** - Faster time to production
562
- 6. **Enable model monitoring** - Detect drift before degradation
563
- 7. **Implement cost tracking** - Know where ML budget is going
564
- 8. **Use canary deployments** - Safer model releases
565
- 9. **Containerize models** - Portable and scalable serving
566
- 10. **Set budget alerts** - Prevent cost overruns
1
+ # Skill 1: MLOps Pipeline Automation
2
+
3
+ ## 🎯 Overview
4
+ Build end-to-end MLOps pipelines with automated training, versioning, and deployment.
5
+
6
+ ## 🔗 Connections
7
+ - **Data Engineer**: Consumes data from feature pipelines (de-01, de-02, de-03)
8
+ - **AI Engineer**: Serves models for agent systems (ai-03, ai-07)
9
+ - **Data Scientist**: Promotes experiments to production (ds-01, ds-02, ds-08)
10
+ - **Security Architect**: Ensures model provenance and audit trails (sa-02, sa-06, sa-08)
11
+ - **MLOps**: Full lifecycle management (mo-01 through mo-08)
12
+ - **FinOps**: ML training and serving cost optimization (fo-01, fo-06, fo-07)
13
+ - **DevOps**: CI/CD for ML pipelines, container orchestration (do-01, do-03, do-08)
14
+ - **System Design**: Scalability and performance patterns (sd-03, sd-05, sd-07)
15
+
16
+ ## 🛠️ Tools Included
17
+
18
+ ### 1. `ml_pipeline_orchestrator.py`
19
+ End-to-end ML pipeline with Kedro/ZenML patterns.
20
+
21
+ ### 2. `model_registry_manager.py`
22
+ MLflow model registry with lifecycle management.
23
+
24
+ ### 3. `experiment_tracker.py`
25
+ Comprehensive experiment tracking with metrics, params, artifacts.
26
+
27
+ ### 4. `ci_cd_ml_pipeline.py`
28
+ CI/CD automation for ML workflows.
29
+
30
+ ### 5. `mlops_config.yaml`
31
+ Configuration templates for MLOps infrastructure.
32
+
33
+ ## 📊 Pipeline Stages
34
+
35
+ ```
36
+ Data Validation → Feature Engineering → Model Training →
37
+ Evaluation → Registration → Deployment → Monitoring
38
+ ```
39
+
40
+ ## 🚀 Quick Start
41
+
42
+ ```python
43
+ from ml_pipeline_orchestrator import MLPipeline
44
+
45
+ # Define pipeline
46
+ pipeline = MLPipeline(
47
+ name="customer_churn_predictor",
48
+ experiment_name="churn_model_v2"
49
+ )
50
+
51
+ # Run training
52
+ pipeline.train(
53
+ data_path="gold.customer_features",
54
+ model_type="xgboost",
55
+ hyperparams={"max_depth": 6, "n_estimators": 100}
56
+ )
57
+
58
+ # Promote to production
59
+ pipeline.promote_to_production(
60
+ stage="Production",
61
+ archive_existing=True
62
+ )
63
+ ```
64
+
65
+ ## 📚 Best Practices
66
+
67
+ ### ML Training Cost Optimization (FinOps Integration)
68
+
69
+ 1. **Compute Cost Optimization**
70
+ - Use spot/preemptible instances for training (60-90% savings)
71
+ - Right-size compute based on model requirements
72
+ - Auto-scale training clusters
73
+ - Schedule training during off-peak hours
74
+ - Reference: FinOps fo-06 (Compute Optimization), fo-07 (AI/ML Cost)
75
+
76
+ 2. **Track Training Costs**
77
+ - Log compute costs per experiment
78
+ - Track GPU utilization and cost efficiency
79
+ - Monitor training time vs accuracy trade-offs
80
+ - Set budget alerts for long-running experiments
81
+ - Reference: FinOps fo-01 (Cost Monitoring), fo-03 (Budget Management)
82
+
83
+ 3. **Optimize Hyperparameter Tuning**
84
+ - Use early stopping to prevent wasteful runs
85
+ - Implement intelligent search (Bayesian vs grid search)
86
+ - Parallelize trials efficiently
87
+ - Track cost per trial
88
+ - Reference: ML Engineer best practices, FinOps fo-07
89
+
90
+ 4. **Model Serving Cost Optimization**
91
+ - Use auto-scaling for inference endpoints
92
+ - Implement model caching for frequent predictions
93
+ - Batch predictions when possible
94
+ - Use smaller/distilled models for cost-sensitive applications
95
+ - Monitor inference costs per request
96
+ - Reference: FinOps fo-06, fo-07
97
+
98
+ 5. **Storage Cost Optimization**
99
+ - Compress model artifacts and datasets
100
+ - Implement lifecycle policies for experiments
101
+ - Archive old model versions to cold storage
102
+ - Monitor artifact storage costs
103
+ - Reference: FinOps fo-05 (Storage Optimization)
104
+
105
+ ### DevOps Integration for ML
106
+
107
+ 6. **CI/CD for ML Pipelines**
108
+ - Automate model training on code changes
109
+ - Run model validation tests before deployment
110
+ - Implement canary deployments for models
111
+ - Automate rollback on quality degradation
112
+ - Reference: DevOps do-01 (CI/CD), do-06 (Deployment Strategies)
113
+
114
+ 7. **Containerization**
115
+ - Package models in containers for portability
116
+ - Use multi-stage builds to minimize image size
117
+ - Implement health checks for model endpoints
118
+ - Deploy to AKS for production serving
119
+ - Reference: DevOps do-03 (Containerization)
120
+
121
+ 8. **Infrastructure as Code for ML**
122
+ - Deploy ML infrastructure with Terraform
123
+ - Version control all infrastructure
124
+ - Automate environment provisioning
125
+ - Implement disaster recovery
126
+ - Reference: DevOps do-04 (IaC)
127
+
128
+ 9. **Monitoring & Observability**
129
+ - Instrument pipelines with OpenTelemetry
130
+ - Track model performance metrics in production
131
+ - Set up alerts for model drift and degradation
132
+ - Monitor inference latency and throughput
133
+ - Reference: DevOps do-08 (Monitoring), MLOps mo-04
134
+
135
+ ### Model Lifecycle Management (MLOps Integration)
136
+
137
+ 10. **Experiment Tracking**
138
+ - Track all experiments with MLflow/Azure ML
139
+ - Log hyperparameters, metrics, and artifacts
140
+ - Compare experiment results systematically
141
+ - Version datasets alongside experiments
142
+ - Reference: MLOps mo-01 (Experiment Tracking)
143
+
144
+ 11. **Model Versioning & Registry**
145
+ - Register all production models
146
+ - Track model lineage (data + code + config)
147
+ - Implement model approval workflows
148
+ - Version control model configurations
149
+ - Reference: MLOps mo-03 (Model Versioning)
150
+
151
+ 12. **Feature Engineering & Feature Store**
152
+ - Centralize features in a feature store
153
+ - Track feature versions and lineage
154
+ - Monitor feature drift
155
+ - Reuse features across models
156
+ - Reference: ML Engineer ml-02, MLOps mo-02
157
+
158
+ 13. **Model Monitoring**
159
+ - Monitor model performance in production
160
+ - Detect data drift and concept drift
161
+ - Track prediction distribution shifts
162
+ - Set up automated retraining triggers
163
+ - Reference: MLOps mo-04 (Monitoring), mo-05 (Drift Detection)
164
+
165
+ ### Security & Compliance
166
+
167
+ 14. **Model Security**
168
+ - Scan model dependencies for vulnerabilities
169
+ - Implement model access controls
170
+ - Encrypt model artifacts at rest
171
+ - Audit model predictions for compliance
172
+ - Reference: Security Architect sa-02 (IAM), sa-08 (LLM Security)
173
+
174
+ 15. **Data Privacy in Training**
175
+ - Remove PII before training
176
+ - Implement differential privacy where needed
177
+ - Track data usage for compliance
178
+ - Document data sources and lineage
179
+ - Reference: Security Architect sa-01 (PII Detection), sa-06 (Governance)
180
+
181
+ 16. **Model Provenance**
182
+ - Track complete model lineage
183
+ - Document training data sources
184
+ - Version all training code and configs
185
+ - Maintain audit logs for model decisions
186
+ - Reference: MLOps mo-06 (Lineage), Security Architect sa-06
187
+
188
+ ### Azure-Specific Best Practices
189
+
190
+ 17. **Azure Machine Learning**
191
+ - Use managed compute clusters
192
+ - Enable auto-scaling for training and inference
193
+ - Implement managed endpoints for serving
194
+ - Use Azure ML Pipelines for orchestration
195
+ - Reference: Azure az-04 (AI/ML Services)
196
+
197
+ 18. **Cost Management in Azure ML**
198
+ - Use low-priority compute for training
199
+ - Enable compute instance auto-shutdown
200
+ - Monitor compute utilization
201
+ - Set spending limits per workspace
202
+ - Reference: Azure az-04, FinOps fo-06
203
+
204
+ ### Production Best Practices
205
+
206
+ 19. **A/B Testing for Models**
207
+ - Deploy multiple model versions
208
+ - Route traffic based on experiment design
209
+ - Track statistical significance
210
+ - Automated winner selection
211
+ - Reference: Data Scientist ds-08 (Experimentation)
212
+
213
+ 20. **Model Performance Optimization**
214
+ - Optimize model inference latency
215
+ - Implement model quantization
216
+ - Use ONNX for cross-platform deployment
217
+ - Batch predictions for throughput
218
+ - Reference: ML Engineer best practices
219
+
220
+ ## 💰 Cost Optimization Examples
221
+
222
+ ### Training Cost Tracking
223
+ ```python
224
+ from ml_pipeline_orchestrator import MLPipeline
225
+ from finops_tracker import MLCostTracker
226
+
227
+ cost_tracker = MLCostTracker()
228
+
229
+ # Track training costs
230
+ @cost_tracker.track_training_cost
231
+ def train_model(config: dict):
232
+ pipeline = MLPipeline(
233
+ name="customer_churn_predictor",
234
+ compute_target="spot-gpu-cluster", # Use spot instances
235
+ auto_scale=True,
236
+ max_nodes=4
237
+ )
238
+
239
+ # Track costs per experiment
240
+ with cost_tracker.experiment_context("churn_v2"):
241
+ pipeline.train(
242
+ data_path="gold.customer_features",
243
+ model_type="xgboost",
244
+ hyperparams=config
245
+ )
246
+
247
+ # Generate cost report
248
+ report = cost_tracker.generate_training_report(period="monthly")
249
+ print(f"Total training costs: ${report.total_cost:.2f}")
250
+ print(f"Cost per experiment: ${report.avg_cost_per_experiment:.2f}")
251
+ print(f"Savings from spot instances: ${report.spot_savings:.2f}")
252
+ print(f"Most expensive experiments: {report.top_experiments}")
253
+
254
+ # Set budget alerts
255
+ cost_tracker.set_budget_alert(
256
+ experiment_name="churn_v2",
257
+ budget_per_run=50.00,
258
+ monthly_budget=500.00
259
+ )
260
+ ```
261
+
262
+ ### Spot Instance Training (60-90% Savings)
263
+ ```python
264
+ from azure.ai.ml import command, Input
265
+ from azure.ai.ml.entities import AmlCompute
266
+
267
+ # Create spot compute cluster
268
+ compute_config = AmlCompute(
269
+ name="spot-training-cluster",
270
+ size="Standard_NC6s_v3", # GPU instance
271
+ min_instances=0,
272
+ max_instances=4,
273
+ tier="LowPriority", # Spot instances!
274
+ idle_time_before_scale_down=300
275
+ )
276
+
277
+ # Submit training job with checkpointing
278
+ job = command(
279
+ code="./src",
280
+ command="python train.py --checkpoint-freq 100", # Save checkpoints
281
+ environment="azureml:training-env:1",
282
+ compute="spot-training-cluster",
283
+ inputs={
284
+ "data": Input(path="azureml://datasets/customer_features/versions/latest")
285
+ }
286
+ )
287
+
288
+ # Job will automatically resume from checkpoint if preempted
289
+ ml_client.jobs.create_or_update(job)
290
+ ```
291
+
292
+ ### Model Serving Cost Optimization
293
+ ```python
294
+ from model_registry_manager import ModelRegistry
295
+ from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment
296
+
297
+ registry = ModelRegistry()
298
+
299
+ # Deploy with auto-scaling
300
+ endpoint = ManagedOnlineEndpoint(
301
+ name="churn-prediction",
302
+ auth_mode="key"
303
+ )
304
+
305
+ deployment = ManagedOnlineDeployment(
306
+ name="churn-v2",
307
+ endpoint_name="churn-prediction",
308
+ model=registry.get_model("churn_predictor:v2"),
309
+ instance_type="Standard_DS2_v2",
310
+ instance_count=1,
311
+ # Auto-scale based on load
312
+ scale_settings={
313
+ "scale_type": "target_utilization",
314
+ "min_instances": 1,
315
+ "max_instances": 5,
316
+ "target_utilization_percentage": 70
317
+ },
318
+ # Request timeout for cost control
319
+ request_timeout_ms=5000
320
+ )
321
+
322
+ # Monitor serving costs
323
+ from finops_tracker import InferenceCostTracker
324
+ inference_tracker = InferenceCostTracker()
325
+
326
+ @inference_tracker.track_inference_cost
327
+ def predict(data):
328
+ return endpoint.invoke(data)
329
+
330
+ # Cost report
331
+ report = inference_tracker.generate_inference_report()
332
+ print(f"Cost per 1000 predictions: ${report.cost_per_1k:.4f}")
333
+ print(f"Monthly serving costs: ${report.monthly_cost:.2f}")
334
+ ```
335
+
336
+ ### Hyperparameter Tuning Cost Optimization
337
+ ```python
338
+ from azure.ai.ml.sweep import Choice, Uniform, BayesianSamplingAlgorithm
339
+ from ml_cost_optimizer import EarlyStoppingPolicy
340
+
341
+ # Use Bayesian optimization (more efficient than grid search)
342
+ sweep_job = command_job.sweep(
343
+ sampling_algorithm=BayesianSamplingAlgorithm(), # Smart search
344
+ primary_metric="accuracy",
345
+ goal="maximize",
346
+ max_total_trials=20, # Limit trials
347
+ max_concurrent_trials=4,
348
+ early_termination_policy=EarlyStoppingPolicy(
349
+ evaluation_interval=1,
350
+ delay_evaluation=5,
351
+ slack_factor=0.1 # Stop if 10% worse than best
352
+ )
353
+ )
354
+
355
+ # Track tuning costs
356
+ cost_tracker.track_sweep_cost(sweep_job)
357
+ ```
358
+
359
+ ## 🚀 CI/CD for ML Pipelines
360
+
361
+ ### Automated ML Pipeline
362
+ ```yaml
363
+ # .github/workflows/ml-pipeline.yml
364
+ name: ML Pipeline CI/CD
365
+
366
+ on:
367
+ push:
368
+ paths:
369
+ - 'models/**'
370
+ - 'pipelines/**'
371
+ branches:
372
+ - main
373
+ schedule:
374
+ - cron: '0 2 * * 0' # Weekly retraining
375
+
376
+ jobs:
377
+ train-and-deploy:
378
+ runs-on: ubuntu-latest
379
+ steps:
380
+ - uses: actions/checkout@v3
381
+
382
+ - name: Azure Login
383
+ uses: azure/login@v1
384
+ with:
385
+ creds: ${{ secrets.AZURE_CREDENTIALS }}
386
+
387
+ - name: Run unit tests
388
+ run: pytest tests/unit/
389
+
390
+ - name: Validate data quality
391
+ run: python scripts/validate_training_data.py
392
+
393
+ - name: Train model (spot instances)
394
+ run: |
395
+ python pipelines/train.py \
396
+ --compute-type spot \
397
+ --max-cost 100.00 \
398
+ --early-stopping true
399
+
400
+ - name: Evaluate model
401
+ run: |
402
+ python pipelines/evaluate.py \
403
+ --min-accuracy 0.85 \
404
+ --min-auc 0.90
405
+
406
+ - name: Register model
407
+ if: success()
408
+ run: python scripts/register_model.py
409
+
410
+ - name: Deploy to staging
411
+ run: |
412
+ python scripts/deploy_model.py \
413
+ --environment staging \
414
+ --traffic-percent 100
415
+
416
+ - name: Run integration tests
417
+ run: pytest tests/integration/
418
+
419
+ - name: Deploy to production (canary)
420
+ if: success()
421
+ run: |
422
+ python scripts/deploy_model.py \
423
+ --environment production \
424
+ --strategy canary \
425
+ --traffic-percent 10
426
+
427
+ - name: Monitor model performance
428
+ run: python scripts/monitor_model.py --duration 24h
429
+
430
+ - name: Promote canary to full
431
+ if: success()
432
+ run: python scripts/promote_deployment.py --traffic-percent 100
433
+
434
+ - name: Generate cost report
435
+ run: python scripts/ml_cost_report.py
436
+ ```
437
+
438
+ ### Infrastructure as Code for ML
439
+ ```hcl
440
+ # ml-infrastructure.tf
441
+ module "ml_workspace" {
442
+ source = "./modules/azure-ml"
443
+
444
+ resource_group = "rg-ml-prod"
445
+ location = "eastus"
446
+
447
+ workspace = {
448
+ name = "ml-workspace-prod"
449
+ sku = "Basic"
450
+ public_network_access = false
451
+ }
452
+
453
+ compute_clusters = [
454
+ {
455
+ name = "cpu-cluster"
456
+ vm_size = "Standard_D4s_v3"
457
+ min_nodes = 0
458
+ max_nodes = 4
459
+ idle_time = 300
460
+ tier = "Dedicated"
461
+ },
462
+ {
463
+ name = "gpu-spot-cluster"
464
+ vm_size = "Standard_NC6s_v3"
465
+ min_nodes = 0
466
+ max_nodes = 4
467
+ idle_time = 120
468
+ tier = "LowPriority" # 60-90% cost savings
469
+ }
470
+ ]
471
+
472
+ endpoints = [
473
+ {
474
+ name = "churn-prediction"
475
+ traffic_rules = {
476
+ "blue" = 90 # Current production
477
+ "green" = 10 # Canary deployment
478
+ }
479
+ auto_scale = {
480
+ min_instances = 2
481
+ max_instances = 10
482
+ target_cpu = 70
483
+ }
484
+ }
485
+ ]
486
+
487
+ cost_management = {
488
+ monthly_budget = 5000.00
489
+ alert_threshold = 0.8
490
+ auto_shutdown_enabled = true
491
+ auto_shutdown_time = "19:00"
492
+ }
493
+
494
+ tags = {
495
+ Environment = "Production"
496
+ CostCenter = "ML-Platform"
497
+ Owner = "MLOps-Team"
498
+ }
499
+ }
500
+ ```
501
+
502
+ ## 📊 Enhanced Metrics & Monitoring
503
+
504
+ | Metric Category | Metric | Target | Tool |
505
+ |-----------------|--------|--------|------|
506
+ | **Training Costs** | Cost per experiment | <$50 | FinOps tracker |
507
+ | | Monthly training budget | <$5000 | Azure Cost Management |
508
+ | | Spot instance savings | >70% | Cost tracker |
509
+ | | GPU utilization | >80% | Azure Monitor |
510
+ | **Serving Costs** | Cost per 1000 predictions | <$0.10 | Inference tracker |
511
+ | | Monthly serving costs | <$2000 | FinOps dashboard |
512
+ | | Auto-scaling efficiency | >70% | Azure Monitor |
513
+ | **Model Performance** | Production accuracy | >0.85 | MLflow |
514
+ | | Prediction latency (p95) | <100ms | App Insights |
515
+ | | Model drift score | <0.1 | Drift monitor |
516
+ | **Pipeline Reliability** | Training success rate | >95% | Azure ML |
517
+ | | Deployment success rate | >99% | DevOps metrics |
518
+ | | Rollback frequency | <2/month | Deployment logs |
519
+ | **Data Quality** | Feature freshness | <1 hour | Data quality checks |
520
+ | | Training data completeness | >99% | Validation tests |
521
+
522
+ ## 🔄 Integration Workflow
523
+
524
+ ### End-to-End MLOps Pipeline
525
+ ```
526
+ 1. Feature Engineering (de-01, ml-02)
527
+
528
+ 2. Data Validation (de-03)
529
+
530
+ 3. PII Removal (sa-01)
531
+
532
+ 4. Experiment Tracking (mo-01)
533
+
534
+ 5. Model Training with Cost Tracking (ml-01, fo-07)
535
+
536
+ 6. Model Evaluation (ml-01, ds-08)
537
+
538
+ 7. Model Registry (mo-03)
539
+
540
+ 8. Security Scan (sa-08)
541
+
542
+ 9. CI/CD Deployment (do-01)
543
+
544
+ 10. Canary Deployment (do-06)
545
+
546
+ 11. Production Monitoring (mo-04, do-08)
547
+
548
+ 12. Drift Detection (mo-05)
549
+
550
+ 13. Cost Optimization (fo-01, fo-06, fo-07)
551
+
552
+ 14. Automated Retraining (ml-01)
553
+ ```
554
+
555
+ ## 🎯 Quick Wins
556
+
557
+ 1. **Use spot instances for training** - 60-90% compute cost savings
558
+ 2. **Enable auto-scaling for inference** - 30-50% serving cost reduction
559
+ 3. **Implement early stopping** - Reduce wasteful hyperparameter trials
560
+ 4. **Set up experiment tracking** - Compare models systematically
561
+ 5. **Automate model deployment** - Faster time to production
562
+ 6. **Enable model monitoring** - Detect drift before degradation
563
+ 7. **Implement cost tracking** - Know where ML budget is going
564
+ 8. **Use canary deployments** - Safer model releases
565
+ 9. **Containerize models** - Portable and scalable serving
566
+ 10. **Set budget alerts** - Prevent cost overruns