tech-hub-skills 1.2.0 → 1.5.2

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. package/{LICENSE → .claude/LICENSE} +21 -21
  2. package/.claude/README.md +291 -0
  3. package/.claude/bin/cli.js +266 -0
  4. package/{bin → .claude/bin}/copilot.js +182 -182
  5. package/{bin → .claude/bin}/postinstall.js +42 -42
  6. package/{tech_hub_skills/skills → .claude/commands}/README.md +336 -336
  7. package/{tech_hub_skills/skills → .claude/commands}/ai-engineer.md +104 -104
  8. package/{tech_hub_skills/skills → .claude/commands}/aws.md +143 -143
  9. package/{tech_hub_skills/skills → .claude/commands}/azure.md +149 -149
  10. package/{tech_hub_skills/skills → .claude/commands}/backend-developer.md +108 -108
  11. package/{tech_hub_skills/skills → .claude/commands}/code-review.md +399 -399
  12. package/{tech_hub_skills/skills → .claude/commands}/compliance-automation.md +747 -747
  13. package/{tech_hub_skills/skills → .claude/commands}/compliance-officer.md +108 -108
  14. package/{tech_hub_skills/skills → .claude/commands}/data-engineer.md +113 -113
  15. package/{tech_hub_skills/skills → .claude/commands}/data-governance.md +102 -102
  16. package/{tech_hub_skills/skills → .claude/commands}/data-scientist.md +123 -123
  17. package/{tech_hub_skills/skills → .claude/commands}/database-admin.md +109 -109
  18. package/{tech_hub_skills/skills → .claude/commands}/devops.md +160 -160
  19. package/{tech_hub_skills/skills → .claude/commands}/docker.md +160 -160
  20. package/{tech_hub_skills/skills → .claude/commands}/enterprise-dashboard.md +613 -613
  21. package/{tech_hub_skills/skills → .claude/commands}/finops.md +184 -184
  22. package/{tech_hub_skills/skills → .claude/commands}/frontend-developer.md +108 -108
  23. package/{tech_hub_skills/skills → .claude/commands}/gcp.md +143 -143
  24. package/{tech_hub_skills/skills → .claude/commands}/ml-engineer.md +115 -115
  25. package/{tech_hub_skills/skills → .claude/commands}/mlops.md +187 -187
  26. package/{tech_hub_skills/skills → .claude/commands}/network-engineer.md +109 -109
  27. package/{tech_hub_skills/skills → .claude/commands}/optimization-advisor.md +329 -329
  28. package/{tech_hub_skills/skills → .claude/commands}/orchestrator.md +623 -623
  29. package/{tech_hub_skills/skills → .claude/commands}/platform-engineer.md +102 -102
  30. package/{tech_hub_skills/skills → .claude/commands}/process-automation.md +226 -226
  31. package/{tech_hub_skills/skills → .claude/commands}/process-changelog.md +184 -184
  32. package/{tech_hub_skills/skills → .claude/commands}/process-documentation.md +484 -484
  33. package/{tech_hub_skills/skills → .claude/commands}/process-kanban.md +324 -324
  34. package/{tech_hub_skills/skills → .claude/commands}/process-versioning.md +214 -214
  35. package/{tech_hub_skills/skills → .claude/commands}/product-designer.md +104 -104
  36. package/{tech_hub_skills/skills → .claude/commands}/project-starter.md +443 -443
  37. package/{tech_hub_skills/skills → .claude/commands}/qa-engineer.md +109 -109
  38. package/{tech_hub_skills/skills → .claude/commands}/security-architect.md +135 -135
  39. package/{tech_hub_skills/skills → .claude/commands}/sre.md +109 -109
  40. package/{tech_hub_skills/skills → .claude/commands}/system-design.md +126 -126
  41. package/{tech_hub_skills/skills → .claude/commands}/technical-writer.md +101 -101
  42. package/.claude/package.json +46 -0
  43. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
  44. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -0
  45. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -0
  46. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -0
  47. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
  48. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -0
  49. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -0
  50. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
  51. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
  52. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
  53. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
  54. package/{tech_hub_skills → .claude}/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
  55. package/{tech_hub_skills → .claude}/roles/azure/skills/02-data-factory/README.md +264 -264
  56. package/{tech_hub_skills → .claude}/roles/azure/skills/03-synapse-analytics/README.md +264 -264
  57. package/{tech_hub_skills → .claude}/roles/azure/skills/04-databricks/README.md +264 -264
  58. package/{tech_hub_skills → .claude}/roles/azure/skills/05-functions/README.md +264 -264
  59. package/{tech_hub_skills → .claude}/roles/azure/skills/06-kubernetes-service/README.md +264 -264
  60. package/{tech_hub_skills → .claude}/roles/azure/skills/07-openai-service/README.md +264 -264
  61. package/{tech_hub_skills → .claude}/roles/azure/skills/08-machine-learning/README.md +264 -264
  62. package/{tech_hub_skills → .claude}/roles/azure/skills/09-storage-adls/README.md +264 -264
  63. package/{tech_hub_skills → .claude}/roles/azure/skills/10-networking/README.md +264 -264
  64. package/{tech_hub_skills → .claude}/roles/azure/skills/11-sql-cosmos/README.md +264 -264
  65. package/{tech_hub_skills → .claude}/roles/azure/skills/12-event-hubs/README.md +264 -264
  66. package/{tech_hub_skills → .claude}/roles/code-review/skills/01-automated-code-review/README.md +394 -394
  67. package/{tech_hub_skills → .claude}/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
  68. package/{tech_hub_skills → .claude}/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
  69. package/{tech_hub_skills → .claude}/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
  70. package/{tech_hub_skills → .claude}/roles/code-review/skills/05-review-analytics/README.md +540 -540
  71. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
  72. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -0
  73. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -0
  74. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
  75. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/03-data-quality/README.md +579 -579
  76. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
  77. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
  78. package/{tech_hub_skills → .claude}/roles/data-governance/skills/01-data-catalog/README.md +112 -112
  79. package/{tech_hub_skills → .claude}/roles/data-governance/skills/02-data-lineage/README.md +129 -129
  80. package/{tech_hub_skills → .claude}/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
  81. package/{tech_hub_skills → .claude}/roles/data-governance/skills/04-access-control/README.md +39 -39
  82. package/{tech_hub_skills → .claude}/roles/data-governance/skills/05-master-data-management/README.md +40 -40
  83. package/{tech_hub_skills → .claude}/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
  84. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
  85. package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -0
  86. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
  87. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
  88. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
  89. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
  90. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
  91. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/07-experimentation/README.md +264 -264
  92. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
  93. package/{tech_hub_skills → .claude}/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
  94. package/{tech_hub_skills → .claude}/roles/devops/skills/02-container-orchestration/README.md +264 -264
  95. package/{tech_hub_skills → .claude}/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
  96. package/{tech_hub_skills → .claude}/roles/devops/skills/04-gitops/README.md +264 -264
  97. package/{tech_hub_skills → .claude}/roles/devops/skills/05-environment-management/README.md +264 -264
  98. package/{tech_hub_skills → .claude}/roles/devops/skills/06-automated-testing/README.md +264 -264
  99. package/{tech_hub_skills → .claude}/roles/devops/skills/07-release-management/README.md +264 -264
  100. package/{tech_hub_skills → .claude}/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
  101. package/{tech_hub_skills → .claude}/roles/devops/skills/09-devsecops/README.md +265 -265
  102. package/{tech_hub_skills → .claude}/roles/finops/skills/01-cost-visibility/README.md +264 -264
  103. package/{tech_hub_skills → .claude}/roles/finops/skills/02-resource-tagging/README.md +264 -264
  104. package/{tech_hub_skills → .claude}/roles/finops/skills/03-budget-management/README.md +264 -264
  105. package/{tech_hub_skills → .claude}/roles/finops/skills/04-reserved-instances/README.md +264 -264
  106. package/{tech_hub_skills → .claude}/roles/finops/skills/05-spot-optimization/README.md +264 -264
  107. package/{tech_hub_skills → .claude}/roles/finops/skills/06-storage-tiering/README.md +264 -264
  108. package/{tech_hub_skills → .claude}/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
  109. package/{tech_hub_skills → .claude}/roles/finops/skills/08-chargeback/README.md +264 -264
  110. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
  111. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
  112. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/03-model-training/README.md +704 -704
  113. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
  114. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
  115. package/{tech_hub_skills → .claude}/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
  116. package/{tech_hub_skills → .claude}/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
  117. package/{tech_hub_skills → .claude}/roles/mlops/skills/03-model-registry/README.md +264 -264
  118. package/{tech_hub_skills → .claude}/roles/mlops/skills/04-feature-store/README.md +264 -264
  119. package/{tech_hub_skills → .claude}/roles/mlops/skills/05-model-deployment/README.md +264 -264
  120. package/{tech_hub_skills → .claude}/roles/mlops/skills/06-model-observability/README.md +264 -264
  121. package/{tech_hub_skills → .claude}/roles/mlops/skills/07-data-versioning/README.md +264 -264
  122. package/{tech_hub_skills → .claude}/roles/mlops/skills/08-ab-testing/README.md +264 -264
  123. package/{tech_hub_skills → .claude}/roles/mlops/skills/09-automated-retraining/README.md +264 -264
  124. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
  125. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
  126. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
  127. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
  128. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
  129. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
  130. package/{tech_hub_skills → .claude}/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
  131. package/{tech_hub_skills → .claude}/roles/product-designer/skills/02-user-research/README.md +382 -382
  132. package/{tech_hub_skills → .claude}/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
  133. package/{tech_hub_skills → .claude}/roles/product-designer/skills/04-ux-design/README.md +496 -496
  134. package/{tech_hub_skills → .claude}/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
  135. package/{tech_hub_skills → .claude}/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
  136. package/{tech_hub_skills → .claude}/roles/security-architect/skills/01-pii-detection/README.md +319 -319
  137. package/{tech_hub_skills → .claude}/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
  138. package/{tech_hub_skills → .claude}/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
  139. package/{tech_hub_skills → .claude}/roles/security-architect/skills/04-iam/README.md +264 -264
  140. package/{tech_hub_skills → .claude}/roles/security-architect/skills/05-application-security/README.md +264 -264
  141. package/{tech_hub_skills → .claude}/roles/security-architect/skills/06-secrets-management/README.md +264 -264
  142. package/{tech_hub_skills → .claude}/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
  143. package/{tech_hub_skills → .claude}/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
  144. package/{tech_hub_skills → .claude}/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
  145. package/{tech_hub_skills → .claude}/roles/system-design/skills/03-scalability/README.md +264 -264
  146. package/{tech_hub_skills → .claude}/roles/system-design/skills/04-high-availability/README.md +264 -264
  147. package/{tech_hub_skills → .claude}/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
  148. package/{tech_hub_skills → .claude}/roles/system-design/skills/06-api-design/README.md +264 -264
  149. package/{tech_hub_skills → .claude}/roles/system-design/skills/07-observability-architecture/README.md +264 -264
  150. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
  151. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/README.md +521 -521
  152. package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -0
  153. package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -0
  154. package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -0
  155. package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -0
  156. package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -0
  157. package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -0
  158. package/.claude/skills/README.md +336 -0
  159. package/.claude/skills/ai-engineer.md +104 -0
  160. package/.claude/skills/aws.md +143 -0
  161. package/.claude/skills/azure.md +149 -0
  162. package/.claude/skills/backend-developer.md +108 -0
  163. package/.claude/skills/code-review.md +399 -0
  164. package/.claude/skills/compliance-automation.md +747 -0
  165. package/.claude/skills/compliance-officer.md +108 -0
  166. package/.claude/skills/data-engineer.md +113 -0
  167. package/.claude/skills/data-governance.md +102 -0
  168. package/.claude/skills/data-scientist.md +123 -0
  169. package/.claude/skills/database-admin.md +109 -0
  170. package/.claude/skills/devops.md +160 -0
  171. package/.claude/skills/docker.md +160 -0
  172. package/.claude/skills/enterprise-dashboard.md +613 -0
  173. package/.claude/skills/finops.md +184 -0
  174. package/.claude/skills/frontend-developer.md +108 -0
  175. package/.claude/skills/gcp.md +143 -0
  176. package/.claude/skills/ml-engineer.md +115 -0
  177. package/.claude/skills/mlops.md +187 -0
  178. package/.claude/skills/network-engineer.md +109 -0
  179. package/.claude/skills/optimization-advisor.md +329 -0
  180. package/.claude/skills/orchestrator.md +623 -0
  181. package/.claude/skills/platform-engineer.md +102 -0
  182. package/.claude/skills/process-automation.md +226 -0
  183. package/.claude/skills/process-changelog.md +184 -0
  184. package/.claude/skills/process-documentation.md +484 -0
  185. package/.claude/skills/process-kanban.md +324 -0
  186. package/.claude/skills/process-versioning.md +214 -0
  187. package/.claude/skills/product-designer.md +104 -0
  188. package/.claude/skills/project-starter.md +443 -0
  189. package/.claude/skills/qa-engineer.md +109 -0
  190. package/.claude/skills/security-architect.md +135 -0
  191. package/.claude/skills/sre.md +109 -0
  192. package/.claude/skills/system-design.md +126 -0
  193. package/.claude/skills/technical-writer.md +101 -0
  194. package/.gitattributes +2 -0
  195. package/GITHUB_COPILOT.md +106 -0
  196. package/README.md +192 -291
  197. package/package.json +16 -46
  198. package/bin/cli.js +0 -241
@@ -1,845 +1,845 @@
1
- # Skill 4: Model Serving & API Development
2
-
3
- ## 🎯 Overview
4
- Deploy production-ready ML models with high-performance REST/gRPC APIs, auto-scaling, and comprehensive monitoring.
5
-
6
- ## 🔗 Connections
7
- - **ML Engineer**: Serves trained models from registry (ml-03, ml-07)
8
- - **AI Engineer**: Powers agent systems and LLM applications (ai-03, ai-07)
9
- - **MLOps**: Model deployment and endpoint management (mo-03, mo-04)
10
- - **FinOps**: Optimizes serving costs and resource usage (fo-06, fo-07)
11
- - **DevOps**: Container orchestration and deployment (do-03, do-06, do-08)
12
- - **Security Architect**: Secures API endpoints and authentication (sa-02, sa-03)
13
- - **System Design**: Scalable serving architecture (sd-03, sd-05, sd-06)
14
- - **Data Engineer**: Serves features for inference (de-02, ml-02)
15
-
16
- ## 🛠️ Tools Included
17
-
18
- ### 1. `model_server.py`
19
- FastAPI/Flask production model serving with async support.
20
-
21
- ### 2. `batch_inference.py`
22
- Efficient batch prediction pipeline for large-scale inference.
23
-
24
- ### 3. `model_optimizer.py`
25
- Model optimization (ONNX, TensorRT, quantization) for low latency.
26
-
27
- ### 4. `api_gateway.py`
28
- API gateway with rate limiting, authentication, and monitoring.
29
-
30
- ### 5. `deployment_config.yaml`
31
- Configuration templates for model deployment infrastructure.
32
-
33
- ## 🏗️ Model Serving Architecture
34
-
35
- ```
36
- API Gateway → Load Balancer → Model Servers → Feature Store
37
-      ↓              ↓               ↓               ↓
38
- Auth/Rate     Traffic Split   Predictions     Online Features
39
- Monitoring    A/B Testing     Caching         Low Latency
40
- Logging       Auto-scale      Batching        Consistency
41
- ```
42
-
43
- ## 🚀 Quick Start
44
-
45
- ```python
46
- from model_server import ModelServer, FastAPIApp
47
- from model_optimizer import ModelOptimizer
48
-
49
- # Load and optimize model
50
- optimizer = ModelOptimizer()
51
- model = optimizer.load_model("models/churn_predictor_v2")
52
- optimized_model = optimizer.optimize(
53
- model,
54
- target_format="onnx",
55
- optimization_level=2
56
- )
57
-
58
- # Create FastAPI server
59
- app = FastAPIApp(
60
- model=optimized_model,
61
- feature_store=feature_store,
62
- enable_caching=True,
63
- enable_batching=True
64
- )
65
-
66
- # Define prediction endpoint
67
- @app.post("/predict")
68
- async def predict(request: PredictionRequest):
69
- """Real-time prediction endpoint"""
70
-
71
- # Get online features
72
- features = await app.get_online_features(
73
- feature_refs=["customer_behavior:v1"],
74
- entity_keys={"customer_id": request.customer_id}
75
- )
76
-
77
- # Predict with caching
78
- prediction = await app.predict(features)
79
-
80
- return {
81
- "customer_id": request.customer_id,
82
- "churn_probability": prediction["probability"],
83
- "prediction": prediction["class"],
84
- "model_version": "v2"
85
- }
86
-
87
- # Health check
88
- @app.get("/health")
89
- async def health():
90
- return {"status": "healthy", "model_loaded": app.model_loaded}
91
-
92
- # Run server
93
- if __name__ == "__main__":
94
- app.run(
95
- host="0.0.0.0",
96
- port=8000,
97
- workers=4,
98
- reload=False
99
- )
100
- ```
101
-
102
- ## 📚 Best Practices
103
-
104
- ### Serving Cost Optimization (FinOps Integration)
105
-
106
- 1. **Auto-Scaling for Variable Loads**
107
- - Scale instances based on request rate
108
- - Set appropriate min/max instances
109
- - Use horizontal pod autoscaling (HPA)
110
- - Monitor scaling efficiency
111
- - Scale to zero during off-hours
112
- - Reference: FinOps fo-06 (Compute Optimization), fo-07 (AI/ML Cost)
113
-
114
- 2. **Model Caching for Cost Reduction**
115
- - Cache predictions for frequent inputs
116
- - Use Redis for distributed caching
117
- - Implement cache warming strategies
118
- - Monitor cache hit rates (target >80%)
119
- - Reference: FinOps fo-06, System Design sd-05
120
-
121
- 3. **Request Batching**
122
- - Batch requests for throughput optimization
123
- - Reduce per-request overhead
124
- - Optimize batch size for latency/throughput
125
- - Use dynamic batching
126
- - Reference: ML Engineer best practices
127
-
128
- 4. **Model Optimization**
129
- - Quantize models (4-8x size reduction)
130
- - Convert to ONNX for faster inference
131
- - Use TensorRT for GPU optimization
132
- - Implement model pruning
133
- - Reference: ML Engineer ml-08 (Model Compression)
134
-
135
- 5. **Right-Size Serving Instances**
136
- - Profile inference workload
137
- - Choose appropriate instance types
138
- - Use CPU for most models (cheaper than GPU)
139
- - Reserve GPUs for large deep learning models
140
- - Monitor resource utilization
141
- - Reference: FinOps fo-06
142
-
143
- 6. **Monitoring Serving Costs**
144
- - Track cost per prediction
145
- - Monitor monthly serving costs
146
- - Alert on cost anomalies
147
- - Optimize expensive endpoints
148
- - Reference: FinOps fo-01 (Cost Monitoring), fo-03 (Budget Management)
149
-
150
- ### DevOps Integration for Serving
151
-
152
- 7. **Containerized Deployments**
153
- - Package models in Docker containers
154
- - Use multi-stage builds to minimize size
155
- - Implement health checks
156
- - Version container images
157
- - Reference: DevOps do-03 (Containerization)
158
-
159
- 8. **Blue-Green & Canary Deployments**
160
- - Test new models with small traffic percentage
161
- - Gradual traffic shifting
162
- - Automated rollback on errors
163
- - A/B testing infrastructure
164
- - Reference: DevOps do-06 (Deployment Strategies)
165
-
166
- 9. **CI/CD for Model Deployment**
167
- - Automate model deployment pipelines
168
- - Run inference tests before deployment
169
- - Validate model performance in staging
170
- - Automated promotion to production
171
- - Reference: DevOps do-01 (CI/CD)
172
-
173
- 10. **Infrastructure as Code**
174
- - Define serving infrastructure in Terraform
175
- - Version control all configurations
176
- - Automate environment provisioning
177
- - Implement disaster recovery
178
- - Reference: DevOps do-04 (IaC)
179
-
180
- 11. **Comprehensive Monitoring**
181
- - Monitor prediction latency (p50, p95, p99)
182
- - Track request rates and throughput
183
- - Monitor error rates and types
184
- - Set up alerts for degradation
185
- - Reference: DevOps do-08 (Monitoring), MLOps mo-04
186
-
187
- ### Performance Optimization
188
-
189
- 12. **Low-Latency Inference**
190
- - Optimize model inference code
191
- - Use async/await for I/O operations
192
- - Implement connection pooling
193
- - Minimize feature retrieval latency
194
- - Pre-load models at startup
195
- - Reference: System Design sd-06 (Performance)
196
-
197
- 13. **Async & Concurrent Processing**
198
- - Use async frameworks (FastAPI, aiohttp)
199
- - Implement concurrent request handling
200
- - Non-blocking I/O for feature fetching
201
- - Thread pools for CPU-bound inference
202
- - Reference: System Design sd-03 (Scalability)
203
-
204
- 14. **Load Balancing**
205
- - Distribute traffic across instances
206
- - Use health-based routing
207
- - Implement sticky sessions if needed
208
- - Configure timeout policies
209
- - Reference: System Design sd-05 (Load Balancing)
210
-
211
- ### Security & Compliance
212
-
213
- 15. **API Authentication & Authorization**
214
- - Implement API key authentication
215
- - Use OAuth 2.0 for user authentication
216
- - Implement RBAC for endpoints
217
- - Audit API access logs
218
- - Reference: Security Architect sa-02 (IAM)
219
-
220
- 16. **Rate Limiting & Throttling**
221
- - Prevent API abuse with rate limits
222
- - Implement per-user quotas
223
- - Graceful degradation under load
224
- - DDoS protection
225
- - Reference: Security Architect sa-03 (Network Security)
226
-
227
- 17. **Input Validation & Sanitization**
228
- - Validate all input data
229
- - Sanitize inputs to prevent injection
230
- - Implement schema validation
231
- - Handle malformed requests gracefully
232
- - Reference: Security Architect sa-08 (LLM Security)
233
-
234
- 18. **Secure Model Serving**
235
- - Encrypt model artifacts at rest
236
- - Use TLS for API endpoints
237
- - Implement network isolation
238
- - Audit prediction requests
239
- - Reference: Security Architect sa-02, sa-03
240
-
241
- ### MLOps Integration
242
-
243
- 19. **Model Version Management**
244
- - Serve multiple model versions simultaneously
245
- - Gradual migration between versions
246
- - Track which version served each request
247
- - Rollback capabilities
248
- - Reference: MLOps mo-03 (Model Versioning)
249
-
250
- 20. **Prediction Logging & Monitoring**
251
- - Log predictions for analysis
252
- - Monitor prediction distributions
253
- - Detect model drift in production
254
- - Track model performance metrics
255
- - Reference: MLOps mo-04 (Monitoring), mo-05 (Drift Detection)
256
-
257
- ### Azure-Specific Best Practices
258
-
259
- 21. **Azure ML Managed Endpoints**
260
- - Use managed online endpoints
261
- - Enable auto-scaling
262
- - Implement multi-model endpoints
263
- - Use Azure Monitor for observability
264
- - Reference: Azure az-04 (AI/ML Services)
265
-
266
- 22. **Azure API Management**
267
- - Centralize API management
268
- - Implement rate limiting and quotas
269
- - Enable caching at API gateway
270
- - Monitor API usage and costs
271
- - Reference: Azure az-05 (Application Services)
272
-
273
- ## 💰 Cost Optimization Examples
274
-
275
- ### Auto-Scaling Model Deployment
276
- ```python
277
- from azure.ai.ml.entities import (
278
- ManagedOnlineEndpoint,
279
- ManagedOnlineDeployment,
280
- ProbeSettings,
281
- ResourceRequests,
282
- ResourceSettings
283
- )
284
- from finops_tracker import ServingCostTracker
285
-
286
- cost_tracker = ServingCostTracker()
287
-
288
- # Create endpoint
289
- endpoint = ManagedOnlineEndpoint(
290
- name="churn-prediction-optimized",
291
- description="Cost-optimized churn prediction endpoint",
292
- auth_mode="key",
293
- tags={
294
- "cost_center": "ml-platform",
295
- "environment": "production"
296
- }
297
- )
298
-
299
- ml_client.online_endpoints.begin_create_or_update(endpoint).result()
300
-
301
- # Cost-optimized deployment with auto-scaling
302
- deployment = ManagedOnlineDeployment(
303
- name="churn-v2-optimized",
304
- endpoint_name=endpoint.name,
305
- model="azureml:churn_predictor:2",
306
-
307
- # Right-sized instance
308
- instance_type="Standard_DS2_v2", # CPU instance (cheaper than GPU)
309
- instance_count=1,
310
-
311
- # Auto-scaling configuration
312
- scale_settings={
313
- "scale_type": "target_utilization",
314
- "min_instances": 1, # Scale to 1 during off-hours
315
- "max_instances": 10, # Scale up for peak traffic
316
- "polling_interval": 30, # Check every 30 seconds
317
- "target_utilization_percentage": 70, # Scale at 70% CPU
318
- "cooldown_period": 300 # 5 min cooldown
319
- },
320
-
321
- # Resource limits
322
- request_settings=ResourceSettings(
323
- request_timeout_ms=5000, # 5s timeout
324
- max_concurrent_requests_per_instance=10,
325
- max_queue_wait_ms=500
326
- ),
327
-
328
- # Health monitoring
329
- liveness_probe=ProbeSettings(
330
- initial_delay=10,
331
- period=10,
332
- timeout=2,
333
- failure_threshold=3
334
- ),
335
-
336
- readiness_probe=ProbeSettings(
337
- initial_delay=10,
338
- period=10,
339
- timeout=2,
340
- failure_threshold=3
341
- ),
342
-
343
- # Environment variables
344
- environment_variables={
345
- "ENABLE_CACHING": "true",
346
- "CACHE_TTL": "3600",
347
- "ENABLE_BATCHING": "true",
348
- "MAX_BATCH_SIZE": "32"
349
- }
350
- )
351
-
352
- ml_client.online_deployments.begin_create_or_update(deployment).result()
353
-
354
- # Set traffic to new deployment
355
- endpoint.traffic = {"churn-v2-optimized": 100}
356
- ml_client.online_endpoints.begin_create_or_update(endpoint).result()
357
-
358
- # Monitor costs
359
- cost_tracker.track_endpoint(
360
- endpoint_name=endpoint.name,
361
- deployment_name=deployment.name
362
- )
363
-
364
- # Cost report
365
- report = cost_tracker.generate_serving_report(period="daily")
366
- print(f"Daily serving cost: ${report.daily_cost:.2f}")
367
- print(f"Cost per 1000 predictions: ${report.cost_per_1k:.4f}")
368
- print(f"Average instances: {report.avg_instances:.2f}")
369
- print(f"Peak instances: {report.peak_instances}")
370
- print(f"Auto-scaling savings: ${report.autoscale_savings:.2f}")
371
- ```
372
-
373
- ### Prediction Caching for Cost Reduction
374
- ```python
375
- from fastapi import FastAPI, HTTPException
376
- from pydantic import BaseModel
377
- import redis
378
- import hashlib
379
- import json
380
- from datetime import timedelta
381
- from finops_tracker import CacheCostTracker
382
-
383
- app = FastAPI()
384
-
385
- # Redis cache
386
- cache = redis.Redis(
387
- host="ml-cache.redis.cache.windows.net",
388
- port=6380,
389
- password=os.getenv("REDIS_PASSWORD"),
390
- ssl=True,
391
- decode_responses=True,
392
- connection_pool=redis.ConnectionPool(max_connections=50)
393
- )
394
-
395
- cost_tracker = CacheCostTracker()
396
-
397
- class PredictionRequest(BaseModel):
398
- customer_id: str
399
- features: dict
400
-
401
- class PredictionResponse(BaseModel):
402
- prediction: float
403
- cached: bool
404
- model_version: str
405
-
406
- def generate_cache_key(request: PredictionRequest) -> str:
407
- """Generate deterministic cache key"""
408
- content = f"{request.customer_id}:{json.dumps(request.features, sort_keys=True)}"
409
- return f"pred:{hashlib.md5(content.encode()).hexdigest()}"
410
-
411
- @app.post("/predict", response_model=PredictionResponse)
412
- async def predict(request: PredictionRequest):
413
- """Prediction endpoint with caching"""
414
-
415
- cache_key = generate_cache_key(request)
416
-
417
- # Check cache first
418
- cached_prediction = cache.get(cache_key)
419
- if cached_prediction:
420
- cost_tracker.record_cache_hit()
421
- return PredictionResponse(
422
- prediction=float(cached_prediction),
423
- cached=True,
424
- model_version="v2"
425
- )
426
-
427
- # Cache miss - compute prediction
428
- cost_tracker.record_cache_miss()
429
-
430
- # Get features and predict
431
- features = await feature_store.get_online_features(
432
- feature_refs=["customer_behavior:v1"],
433
- entity_keys={"customer_id": request.customer_id}
434
- )
435
-
436
- prediction = model.predict(features)[0]
437
-
438
- # Cache result (1 hour TTL)
439
- cache.setex(
440
- cache_key,
441
- timedelta(hours=1),
442
- str(prediction)
443
- )
444
-
445
- return PredictionResponse(
446
- prediction=prediction,
447
- cached=False,
448
- model_version="v2"
449
- )
450
-
451
- @app.get("/cache-stats")
452
- async def cache_stats():
453
- """Cache performance and cost metrics"""
454
- stats = cost_tracker.get_stats()
455
-
456
- return {
457
- "cache_hit_rate": stats.hit_rate,
458
- "total_requests": stats.total_requests,
459
- "cache_hits": stats.cache_hits,
460
- "cache_misses": stats.cache_misses,
461
- "cost_savings": f"${stats.cost_savings:.2f}",
462
- "avg_latency_cached": f"{stats.avg_latency_cached:.2f}ms",
463
- "avg_latency_uncached": f"{stats.avg_latency_uncached:.2f}ms"
464
- }
465
-
466
- # Expected results:
467
- # - Cache hit rate: 80-95%
468
- # - Cost reduction: 60-80% (fewer model inferences)
469
- # - Latency improvement: 10-50x for cached requests
470
- ```
471
-
472
- ### Batch Inference for Cost Efficiency
473
- ```python
474
- from batch_inference import BatchInferenceEngine
475
- from azure.ai.ml import Input, Output
476
- from finops_tracker import BatchCostTracker
477
-
478
- class OptimizedBatchInference:
479
- """Cost-optimized batch inference"""
480
-
481
- def __init__(self):
482
- self.engine = BatchInferenceEngine()
483
- self.cost_tracker = BatchCostTracker()
484
-
485
- def batch_predict(
486
- self,
487
- input_data_path: str,
488
- output_path: str,
489
- batch_size: int = 1000,
490
- use_spot: bool = True
491
- ):
492
- """Run batch inference with cost optimization"""
493
-
494
- # Use spot instances for 60-90% savings
495
- compute_config = {
496
- "instance_type": "Standard_D4s_v3",
497
- "instance_count": 4,
498
- "tier": "LowPriority" if use_spot else "Dedicated",
499
- "max_concurrent_tasks": 4
500
- }
501
-
502
- with self.cost_tracker.track_batch_job():
503
- # Configure batch job
504
- batch_job = self.engine.create_batch_job(
505
- name="churn_prediction_batch",
506
- model="azureml:churn_predictor:2",
507
- compute=compute_config,
508
- mini_batch_size=batch_size,
509
- retry_settings={
510
- "max_retries": 3,
511
- "timeout": 300
512
- },
513
- environment_variables={
514
- "BATCH_SIZE": str(batch_size),
515
- "ENABLE_OPTIMIZATION": "true"
516
- },
517
- inputs={
518
- "input_data": Input(
519
- type="uri_folder",
520
- path=input_data_path
521
- )
522
- },
523
- outputs={
524
- "predictions": Output(
525
- type="uri_folder",
526
- path=output_path
527
- )
528
- }
529
- )
530
-
531
- # Run batch inference
532
- job = ml_client.batch_deployments.invoke(
533
- deployment_name="batch-deployment",
534
- inputs=batch_job.inputs,
535
- outputs=batch_job.outputs
536
- )
537
-
538
- # Wait for completion
539
- ml_client.jobs.stream(job.name)
540
-
541
- # Cost analysis
542
- cost_report = self.cost_tracker.generate_report()
543
- print(f"\nBatch Inference Cost Report:")
544
- print(f"Total predictions: {cost_report.total_predictions:,}")
545
- print(f"Total cost: ${cost_report.total_cost:.2f}")
546
- print(f"Cost per 1000 predictions: ${cost_report.cost_per_1k:.4f}")
547
- print(f"Spot savings: ${cost_report.spot_savings:.2f}")
548
- print(f"Duration: {cost_report.duration_minutes:.2f} minutes")
549
- print(f"Throughput: {cost_report.throughput_per_minute:,.0f} predictions/min")
550
-
551
- # Compare with online serving
552
- online_cost = cost_report.total_predictions * 0.001 # Assume $0.001 per prediction
553
- print(f"\nCost comparison:")
554
- print(f"Batch inference: ${cost_report.total_cost:.2f}")
555
- print(f"Online serving equivalent: ${online_cost:.2f}")
556
- print(f"Savings: ${online_cost - cost_report.total_cost:.2f} ({((online_cost - cost_report.total_cost) / online_cost * 100):.1f}%)")
557
-
558
- return cost_report
559
-
560
- # Usage
561
- batch_engine = OptimizedBatchInference()
562
-
563
- # Run batch prediction (100x cheaper than online for large batches)
564
- cost_report = batch_engine.batch_predict(
565
- input_data_path="azureml://datasets/scoring_data/labels/latest",
566
- output_path="azureml://datastores/predictions/paths/batch_2024_01/",
567
- batch_size=1000,
568
- use_spot=True
569
- )
570
- ```
571
-
572
- ### Model Optimization for Faster Inference
573
- ```python
574
- from model_optimizer import ModelOptimizer, ONNXConverter
575
- import onnxruntime as ort
576
- import numpy as np
577
- from finops_tracker import InferenceCostTracker
578
-
579
- class OptimizedModelServer:
580
- """Optimized model serving with ONNX"""
581
-
582
- def __init__(self, model_path: str):
583
- self.optimizer = ModelOptimizer()
584
- self.cost_tracker = InferenceCostTracker()
585
-
586
- # Convert to ONNX for 2-5x speedup
587
- self.onnx_model = self.optimizer.convert_to_onnx(
588
- model_path=model_path,
589
- opset_version=13
590
- )
591
-
592
- # Quantize for 4x size reduction and faster inference
593
- self.quantized_model = self.optimizer.quantize(
594
- self.onnx_model,
595
- quantization_mode="dynamic", # or "static" for more accuracy
596
- optimize_for="latency" # or "throughput"
597
- )
598
-
599
- # Create ONNX Runtime session
600
- sess_options = ort.SessionOptions()
601
- sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
602
- sess_options.intra_op_num_threads = 4
603
-
604
- self.session = ort.InferenceSession(
605
- self.quantized_model,
606
- sess_options,
607
- providers=['CPUExecutionProvider'] # Use CPU for cost savings
608
- )
609
-
610
- def predict(self, features: np.ndarray) -> np.ndarray:
611
- """Optimized prediction"""
612
-
613
- with self.cost_tracker.track_inference():
614
- # ONNX inference (2-5x faster than native)
615
- input_name = self.session.get_inputs()[0].name
616
- output_name = self.session.get_outputs()[0].name
617
-
618
- predictions = self.session.run(
619
- [output_name],
620
- {input_name: features.astype(np.float32)}
621
- )[0]
622
-
623
- return predictions
624
-
625
- def benchmark(self, test_data: np.ndarray, num_iterations: int = 1000):
626
- """Benchmark optimized vs original model"""
627
-
628
- # Original model (for comparison)
629
- original_model = self.optimizer.load_original_model()
630
-
631
- print("Benchmarking optimized model...")
632
-
633
- # Warm up
634
- for _ in range(10):
635
- self.predict(test_data[:1])
636
- original_model.predict(test_data[:1])
637
-
638
- # Benchmark optimized
639
- import time
640
- start = time.time()
641
- for _ in range(num_iterations):
642
- self.predict(test_data[:1])
643
- optimized_time = time.time() - start
644
-
645
- # Benchmark original
646
- start = time.time()
647
- for _ in range(num_iterations):
648
- original_model.predict(test_data[:1])
649
- original_time = time.time() - start
650
-
651
- # Results
652
- speedup = original_time / optimized_time
653
- cost_reduction = 1 - (1 / speedup)
654
-
655
- print(f"\nBenchmark Results:")
656
- print(f"Original model time: {original_time:.3f}s ({original_time/num_iterations*1000:.2f}ms per prediction)")
657
- print(f"Optimized model time: {optimized_time:.3f}s ({optimized_time/num_iterations*1000:.2f}ms per prediction)")
658
- print(f"Speedup: {speedup:.2f}x")
659
- print(f"Latency reduction: {(1 - optimized_time/original_time)*100:.1f}%")
660
- print(f"Cost reduction: {cost_reduction*100:.1f}%")
661
- print(f"Model size reduction: {self.optimizer.get_size_reduction():.1f}x")
662
-
663
- return {
664
- "speedup": speedup,
665
- "cost_reduction_percent": cost_reduction * 100,
666
- "optimized_latency_ms": optimized_time / num_iterations * 1000
667
- }
668
-
669
- # Usage
670
- server = OptimizedModelServer("models/churn_predictor_v2.pkl")
671
-
672
- # Benchmark
673
- test_features = np.random.rand(100, 20)
674
- results = server.benchmark(test_features)
675
-
676
- # Expected results:
677
- # - 2-5x speedup with ONNX
678
- # - 4x model size reduction with quantization
679
- # - 50-80% cost reduction (same throughput with fewer instances)
680
- ```
681
-
682
- ## 🚀 CI/CD for Model Serving
683
-
684
- ### Automated Deployment Pipeline
685
- ```yaml
686
- # .github/workflows/model-deployment.yml
687
- name: Model Deployment Pipeline
688
-
689
- on:
690
- workflow_run:
691
- workflows: ["Model Training Pipeline"]
692
- types:
693
- - completed
694
- workflow_dispatch:
695
- inputs:
696
- model_version:
697
- description: 'Model version to deploy'
698
- required: true
699
-
700
- jobs:
701
- deploy-model:
702
- runs-on: ubuntu-latest
703
- if: ${{ github.event.workflow_run.conclusion == 'success' }}
704
-
705
- steps:
706
- - uses: actions/checkout@v3
707
-
708
- - name: Azure Login
709
- uses: azure/login@v1
710
- with:
711
- creds: ${{ secrets.AZURE_CREDENTIALS }}
712
-
713
- - name: Get model from registry
714
- run: |
715
- python scripts/download_model.py \
716
- --model-name churn_predictor \
717
- --version ${{ github.event.inputs.model_version || 'latest' }}
718
-
719
- - name: Optimize model for serving
720
- run: |
721
- python scripts/optimize_model.py \
722
- --input-model ./model \
723
- --output-model ./optimized_model \
724
- --format onnx \
725
- --quantize dynamic
726
-
727
- - name: Build Docker image
728
- run: |
729
- docker build -t ${{ secrets.ACR_NAME }}.azurecr.io/churn-predictor:${{ github.sha }} \
730
- -f Dockerfile.serving .
731
-
732
- - name: Run container security scan
733
- run: |
734
- docker scan ${{ secrets.ACR_NAME }}.azurecr.io/churn-predictor:${{ github.sha }}
735
-
736
- - name: Push to container registry
737
- run: |
738
- az acr login --name ${{ secrets.ACR_NAME }}
739
- docker push ${{ secrets.ACR_NAME }}.azurecr.io/churn-predictor:${{ github.sha }}
740
-
741
- - name: Run inference tests
742
- run: |
743
- docker run -d -p 8000:8000 \
744
- ${{ secrets.ACR_NAME }}.azurecr.io/churn-predictor:${{ github.sha }}
745
- sleep 10
746
- pytest tests/inference/ --endpoint http://localhost:8000
747
-
748
- - name: Deploy to staging (canary)
749
- run: |
750
- python scripts/deploy_model.py \
751
- --environment staging \
752
- --image ${{ secrets.ACR_NAME }}.azurecr.io/churn-predictor:${{ github.sha }} \
753
- --traffic-percent 10
754
-
755
- - name: Run load tests
756
- run: |
757
- locust -f tests/load/test_endpoint.py \
758
- --headless \
759
- --users 100 \
760
- --spawn-rate 10 \
761
- --run-time 5m \
762
- --host https://staging-churn-api.azurewebsites.net
763
-
764
- - name: Monitor canary performance
765
- run: |
766
- python scripts/monitor_canary.py \
767
- --duration 30m \
768
- --min-success-rate 99 \
769
- --max-latency-p95 100
770
-
771
- - name: Promote to production
772
- if: success()
773
- run: |
774
- python scripts/deploy_model.py \
775
- --environment production \
776
- --image ${{ secrets.ACR_NAME }}.azurecr.io/churn-predictor:${{ github.sha }} \
777
- --strategy blue-green \
778
- --traffic-percent 100
779
-
780
- - name: Generate deployment report
781
- run: python scripts/deployment_report.py
782
- ```
783
-
784
- ## 📊 Metrics & Monitoring
785
-
786
- | Metric Category | Metric | Target | Tool |
787
- |-----------------|--------|--------|------|
788
- | **Serving Costs** | Cost per 1000 predictions | <$0.05 | FinOps tracker |
789
- | | Monthly serving costs | <$1500 | Azure Cost Management |
790
- | | Auto-scaling savings | >50% | Cost tracker |
791
- | | Cache savings | >60% | Redis metrics |
792
- | **Performance** | Prediction latency (p95) | <100ms | App Insights |
793
- | | Throughput | >1000 req/s | Load balancer |
794
- | | Cache hit rate | >80% | Redis |
795
- | | Model load time | <10s | Startup metrics |
796
- | **Reliability** | Availability (SLA) | >99.9% | Azure Monitor |
797
- | | Error rate | <0.1% | API metrics |
798
- | | Deployment success rate | >99% | CI/CD metrics |
799
- | **Resource Usage** | CPU utilization | 60-80% | Azure Monitor |
800
- | | Memory utilization | <80% | Container metrics |
801
- | | Instance count | Auto-scaled | HPA metrics |
802
- | **API Usage** | Requests per minute | Monitored | API Gateway |
803
- | | Rate limit violations | <1% | Gateway logs |
804
-
805
- ## 🔄 Integration Workflow
806
-
807
- ### End-to-End Serving Pipeline
808
- ```
809
- 1. Model Registry (ml-07)
810
-
811
- 2. Model Optimization (ml-08)
812
-
813
- 3. Container Build (do-03)
814
-
815
- 4. Security Scan (sa-08)
816
-
817
- 5. Staging Deployment (do-06)
818
-
819
- 6. Load Testing (ml-04)
820
-
821
- 7. Canary Deployment (do-06)
822
-
823
- 8. Performance Monitoring (mo-04, do-08)
824
-
825
- 9. Production Promotion (ml-04)
826
-
827
- 10. Auto-Scaling (fo-06)
828
-
829
- 11. Cost Monitoring (fo-01, fo-07)
830
-
831
- 12. Drift Detection (mo-05)
832
- ```
833
-
834
- ## 🎯 Quick Wins
835
-
836
- 1. **Enable auto-scaling** - 40-60% serving cost reduction
837
- 2. **Implement prediction caching** - 60-80% cost savings
838
- 3. **Convert models to ONNX** - 2-5x inference speedup
839
- 4. **Use model quantization** - 4x model size reduction
840
- 5. **Implement request batching** - 10-50x throughput increase
841
- 6. **Right-size instances** - 30-50% cost reduction
842
- 7. **Use canary deployments** - Zero-downtime releases
843
- 8. **Add health checks** - Better reliability
844
- 9. **Enable async processing** - Higher concurrency
845
- 10. **Monitor serving costs** - Identify optimization opportunities
1
+ # Skill 4: Model Serving & API Development
2
+
3
+ ## 🎯 Overview
4
+ Deploy production-ready ML models with high-performance REST/gRPC APIs, auto-scaling, and comprehensive monitoring.
5
+
6
+ ## 🔗 Connections
7
+ - **ML Engineer**: Serves trained models from registry (ml-03, ml-07)
8
+ - **AI Engineer**: Powers agent systems and LLM applications (ai-03, ai-07)
9
+ - **MLOps**: Model deployment and endpoint management (mo-03, mo-04)
10
+ - **FinOps**: Optimizes serving costs and resource usage (fo-06, fo-07)
11
+ - **DevOps**: Container orchestration and deployment (do-03, do-06, do-08)
12
+ - **Security Architect**: Secures API endpoints and authentication (sa-02, sa-03)
13
+ - **System Design**: Scalable serving architecture (sd-03, sd-05, sd-06)
14
+ - **Data Engineer**: Serves features for inference (de-02, ml-02)
15
+
16
+ ## 🛠️ Tools Included
17
+
18
+ ### 1. `model_server.py`
19
+ FastAPI/Flask production model serving with async support.
20
+
21
+ ### 2. `batch_inference.py`
22
+ Efficient batch prediction pipeline for large-scale inference.
23
+
24
+ ### 3. `model_optimizer.py`
25
+ Model optimization (ONNX, TensorRT, quantization) for low latency.
26
+
27
+ ### 4. `api_gateway.py`
28
+ API gateway with rate limiting, authentication, and monitoring.
29
+
30
+ ### 5. `deployment_config.yaml`
31
+ Configuration templates for model deployment infrastructure.
32
+
33
+ ## 🏗️ Model Serving Architecture
34
+
35
+ ```
36
+ API Gateway → Load Balancer → Model Servers → Feature Store
37
+      ↓              ↓               ↓               ↓
38
+ Auth/Rate     Traffic Split   Predictions     Online Features
39
+ Monitoring    A/B Testing     Caching         Low Latency
40
+ Logging       Auto-scale      Batching        Consistency
41
+ ```
42
+
43
+ ## 🚀 Quick Start
44
+
45
+ ```python
46
+ from model_server import ModelServer, FastAPIApp
47
+ from model_optimizer import ModelOptimizer
48
+
49
+ # Load and optimize model
50
+ optimizer = ModelOptimizer()
51
+ model = optimizer.load_model("models/churn_predictor_v2")
52
+ optimized_model = optimizer.optimize(
53
+ model,
54
+ target_format="onnx",
55
+ optimization_level=2
56
+ )
57
+
58
+ # Create FastAPI server
59
+ app = FastAPIApp(
60
+ model=optimized_model,
61
+ feature_store=feature_store,
62
+ enable_caching=True,
63
+ enable_batching=True
64
+ )
65
+
66
+ # Define prediction endpoint
67
+ @app.post("/predict")
68
+ async def predict(request: PredictionRequest):
69
+ """Real-time prediction endpoint"""
70
+
71
+ # Get online features
72
+ features = await app.get_online_features(
73
+ feature_refs=["customer_behavior:v1"],
74
+ entity_keys={"customer_id": request.customer_id}
75
+ )
76
+
77
+ # Predict with caching
78
+ prediction = await app.predict(features)
79
+
80
+ return {
81
+ "customer_id": request.customer_id,
82
+ "churn_probability": prediction["probability"],
83
+ "prediction": prediction["class"],
84
+ "model_version": "v2"
85
+ }
86
+
87
+ # Health check
88
+ @app.get("/health")
89
+ async def health():
90
+ return {"status": "healthy", "model_loaded": app.model_loaded}
91
+
92
+ # Run server
93
+ if __name__ == "__main__":
94
+ app.run(
95
+ host="0.0.0.0",
96
+ port=8000,
97
+ workers=4,
98
+ reload=False
99
+ )
100
+ ```
101
+
102
+ ## 📚 Best Practices
103
+
104
+ ### Serving Cost Optimization (FinOps Integration)
105
+
106
+ 1. **Auto-Scaling for Variable Loads**
107
+ - Scale instances based on request rate
108
+ - Set appropriate min/max instances
109
+ - Use horizontal pod autoscaling (HPA)
110
+ - Monitor scaling efficiency
111
+ - Scale to zero during off-hours
112
+ - Reference: FinOps fo-06 (Compute Optimization), fo-07 (AI/ML Cost)
113
+
114
+ 2. **Model Caching for Cost Reduction**
115
+ - Cache predictions for frequent inputs
116
+ - Use Redis for distributed caching
117
+ - Implement cache warming strategies
118
+ - Monitor cache hit rates (target >80%)
119
+ - Reference: FinOps fo-06, System Design sd-05
120
+
121
+ 3. **Request Batching**
122
+ - Batch requests for throughput optimization
123
+ - Reduce per-request overhead
124
+ - Optimize batch size for latency/throughput
125
+ - Use dynamic batching
126
+ - Reference: ML Engineer best practices
127
+
128
+ 4. **Model Optimization**
129
+ - Quantize models (4-8x size reduction)
130
+ - Convert to ONNX for faster inference
131
+ - Use TensorRT for GPU optimization
132
+ - Implement model pruning
133
+ - Reference: ML Engineer ml-08 (Model Compression)
134
+
135
+ 5. **Right-Size Serving Instances**
136
+ - Profile inference workload
137
+ - Choose appropriate instance types
138
+ - Use CPU for most models (cheaper than GPU)
139
+ - Reserve GPUs for large deep learning models
140
+ - Monitor resource utilization
141
+ - Reference: FinOps fo-06
142
+
143
+ 6. **Monitoring Serving Costs**
144
+ - Track cost per prediction
145
+ - Monitor monthly serving costs
146
+ - Alert on cost anomalies
147
+ - Optimize expensive endpoints
148
+ - Reference: FinOps fo-01 (Cost Monitoring), fo-03 (Budget Management)
149
+
150
+ ### DevOps Integration for Serving
151
+
152
+ 7. **Containerized Deployments**
153
+ - Package models in Docker containers
154
+ - Use multi-stage builds to minimize size
155
+ - Implement health checks
156
+ - Version container images
157
+ - Reference: DevOps do-03 (Containerization)
158
+
159
+ 8. **Blue-Green & Canary Deployments**
160
+ - Test new models with small traffic percentage
161
+ - Gradual traffic shifting
162
+ - Automated rollback on errors
163
+ - A/B testing infrastructure
164
+ - Reference: DevOps do-06 (Deployment Strategies)
165
+
166
+ 9. **CI/CD for Model Deployment**
167
+ - Automate model deployment pipelines
168
+ - Run inference tests before deployment
169
+ - Validate model performance in staging
170
+ - Automated promotion to production
171
+ - Reference: DevOps do-01 (CI/CD)
172
+
173
+ 10. **Infrastructure as Code**
174
+ - Define serving infrastructure in Terraform
175
+ - Version control all configurations
176
+ - Automate environment provisioning
177
+ - Implement disaster recovery
178
+ - Reference: DevOps do-04 (IaC)
179
+
180
+ 11. **Comprehensive Monitoring**
181
+ - Monitor prediction latency (p50, p95, p99)
182
+ - Track request rates and throughput
183
+ - Monitor error rates and types
184
+ - Set up alerts for degradation
185
+ - Reference: DevOps do-08 (Monitoring), MLOps mo-04
186
+
187
+ ### Performance Optimization
188
+
189
+ 12. **Low-Latency Inference**
190
+ - Optimize model inference code
191
+ - Use async/await for I/O operations
192
+ - Implement connection pooling
193
+ - Minimize feature retrieval latency
194
+ - Pre-load models at startup
195
+ - Reference: System Design sd-06 (Performance)
196
+
197
+ 13. **Async & Concurrent Processing**
198
+ - Use async frameworks (FastAPI, aiohttp)
199
+ - Implement concurrent request handling
200
+ - Non-blocking I/O for feature fetching
201
+ - Thread pools for CPU-bound inference
202
+ - Reference: System Design sd-03 (Scalability)
203
+
204
+ 14. **Load Balancing**
205
+ - Distribute traffic across instances
206
+ - Use health-based routing
207
+ - Implement sticky sessions if needed
208
+ - Configure timeout policies
209
+ - Reference: System Design sd-05 (Load Balancing)
210
+
211
+ ### Security & Compliance
212
+
213
+ 15. **API Authentication & Authorization**
214
+ - Implement API key authentication
215
+ - Use OAuth 2.0 for user authentication
216
+ - Implement RBAC for endpoints
217
+ - Audit API access logs
218
+ - Reference: Security Architect sa-02 (IAM)
219
+
220
+ 16. **Rate Limiting & Throttling**
221
+ - Prevent API abuse with rate limits
222
+ - Implement per-user quotas
223
+ - Graceful degradation under load
224
+ - DDoS protection
225
+ - Reference: Security Architect sa-03 (Network Security)
226
+
227
+ 17. **Input Validation & Sanitization**
228
+ - Validate all input data
229
+ - Sanitize inputs to prevent injection
230
+ - Implement schema validation
231
+ - Handle malformed requests gracefully
232
+ - Reference: Security Architect sa-08 (LLM Security)
233
+
234
+ 18. **Secure Model Serving**
235
+ - Encrypt model artifacts at rest
236
+ - Use TLS for API endpoints
237
+ - Implement network isolation
238
+ - Audit prediction requests
239
+ - Reference: Security Architect sa-02, sa-03
240
+
241
+ ### MLOps Integration
242
+
243
+ 19. **Model Version Management**
244
+ - Serve multiple model versions simultaneously
245
+ - Gradual migration between versions
246
+ - Track which version served each request
247
+ - Rollback capabilities
248
+ - Reference: MLOps mo-03 (Model Versioning)
249
+
250
+ 20. **Prediction Logging & Monitoring**
251
+ - Log predictions for analysis
252
+ - Monitor prediction distributions
253
+ - Detect model drift in production
254
+ - Track model performance metrics
255
+ - Reference: MLOps mo-04 (Monitoring), mo-05 (Drift Detection)
256
+
257
+ ### Azure-Specific Best Practices
258
+
259
+ 21. **Azure ML Managed Endpoints**
260
+ - Use managed online endpoints
261
+ - Enable auto-scaling
262
+ - Implement multi-model endpoints
263
+ - Use Azure Monitor for observability
264
+ - Reference: Azure az-04 (AI/ML Services)
265
+
266
+ 22. **Azure API Management**
267
+ - Centralize API management
268
+ - Implement rate limiting and quotas
269
+ - Enable caching at API gateway
270
+ - Monitor API usage and costs
271
+ - Reference: Azure az-05 (Application Services)
272
+
273
+ ## 💰 Cost Optimization Examples
+ 
+ ### Auto-Scaling Model Deployment
+ ```python
+ from azure.ai.ml import MLClient
+ from azure.ai.ml.entities import (
+     ManagedOnlineEndpoint,
+     ManagedOnlineDeployment,
+     OnlineRequestSettings,
+     ProbeSettings
+ )
+ from azure.identity import DefaultAzureCredential
+ from finops_tracker import ServingCostTracker
+ 
+ cost_tracker = ServingCostTracker()
+ 
+ # Authenticated workspace client (placeholder identifiers)
+ ml_client = MLClient(
+     DefaultAzureCredential(),
+     subscription_id="<subscription-id>",
+     resource_group_name="<resource-group>",
+     workspace_name="<workspace>"
+ )
+ 
+ # Create endpoint
+ endpoint = ManagedOnlineEndpoint(
+     name="churn-prediction-optimized",
+     description="Cost-optimized churn prediction endpoint",
+     auth_mode="key",
+     tags={
+         "cost_center": "ml-platform",
+         "environment": "production"
+     }
+ )
+ 
+ ml_client.online_endpoints.begin_create_or_update(endpoint).result()
+ 
+ # Cost-optimized deployment with auto-scaling
+ deployment = ManagedOnlineDeployment(
+     name="churn-v2-optimized",
+     endpoint_name=endpoint.name,
+     model="azureml:churn_predictor:2",
+ 
+     # Right-sized instance
+     instance_type="Standard_DS2_v2",  # CPU instance (cheaper than GPU)
+     instance_count=1,
+ 
+     # Auto-scaling configuration (illustrative; managed endpoints typically attach an Azure Monitor autoscale rule)
+     scale_settings={
+         "scale_type": "target_utilization",
+         "min_instances": 1,                    # Scale to 1 during off-hours
+         "max_instances": 10,                   # Scale up for peak traffic
+         "polling_interval": 30,                # Check every 30 seconds
+         "target_utilization_percentage": 70,   # Scale at 70% CPU
+         "cooldown_period": 300                 # 5 min cooldown
+     },
+ 
+     # Request limits
+     request_settings=OnlineRequestSettings(
+         request_timeout_ms=5000,               # 5s timeout
+         max_concurrent_requests_per_instance=10,
+         max_queue_wait_ms=500
+     ),
+ 
+     # Health monitoring
+     liveness_probe=ProbeSettings(
+         initial_delay=10,
+         period=10,
+         timeout=2,
+         failure_threshold=3
+     ),
+ 
+     readiness_probe=ProbeSettings(
+         initial_delay=10,
+         period=10,
+         timeout=2,
+         failure_threshold=3
+     ),
+ 
+     # Environment variables
+     environment_variables={
+         "ENABLE_CACHING": "true",
+         "CACHE_TTL": "3600",
+         "ENABLE_BATCHING": "true",
+         "MAX_BATCH_SIZE": "32"
+     }
+ )
+ 
+ ml_client.online_deployments.begin_create_or_update(deployment).result()
+ 
+ # Set traffic to new deployment
+ endpoint.traffic = {"churn-v2-optimized": 100}
+ ml_client.online_endpoints.begin_create_or_update(endpoint).result()
+ 
+ # Monitor costs
+ cost_tracker.track_endpoint(
+     endpoint_name=endpoint.name,
+     deployment_name=deployment.name
+ )
+ 
+ # Cost report
+ report = cost_tracker.generate_serving_report(period="daily")
+ print(f"Daily serving cost: ${report.daily_cost:.2f}")
+ print(f"Cost per 1000 predictions: ${report.cost_per_1k:.4f}")
+ print(f"Average instances: {report.avg_instances:.2f}")
+ print(f"Peak instances: {report.peak_instances}")
+ print(f"Auto-scaling savings: ${report.autoscale_savings:.2f}")
+ ```
+ 
+ ### Prediction Caching for Cost Reduction
+ ```python
+ import hashlib
+ import json
+ import os
+ from datetime import timedelta
+ 
+ import redis
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ 
+ from finops_tracker import CacheCostTracker
+ 
+ app = FastAPI()
+ 
+ # Redis cache (Azure Cache for Redis over TLS)
+ cache = redis.Redis(
+     host="ml-cache.redis.cache.windows.net",
+     port=6380,
+     password=os.getenv("REDIS_PASSWORD"),
+     ssl=True,
+     decode_responses=True,
+     max_connections=50
+ )
+ 
+ cost_tracker = CacheCostTracker()
+ 
+ # `model` and `feature_store` are assumed to be loaded at startup
+ # (the registered churn model and an online feature store client)
+ 
+ class PredictionRequest(BaseModel):
+     customer_id: str
+     features: dict
+ 
+ class PredictionResponse(BaseModel):
+     prediction: float
+     cached: bool
+     model_version: str
+ 
+ def generate_cache_key(request: PredictionRequest) -> str:
+     """Generate deterministic cache key"""
+     content = f"{request.customer_id}:{json.dumps(request.features, sort_keys=True)}"
+     return f"pred:{hashlib.md5(content.encode()).hexdigest()}"
+ 
+ @app.post("/predict", response_model=PredictionResponse)
+ async def predict(request: PredictionRequest):
+     """Prediction endpoint with caching"""
+ 
+     cache_key = generate_cache_key(request)
+ 
+     # Check cache first
+     cached_prediction = cache.get(cache_key)
+     if cached_prediction:
+         cost_tracker.record_cache_hit()
+         return PredictionResponse(
+             prediction=float(cached_prediction),
+             cached=True,
+             model_version="v2"
+         )
+ 
+     # Cache miss - compute prediction
+     cost_tracker.record_cache_miss()
+ 
+     # Get features and predict
+     features = await feature_store.get_online_features(
+         feature_refs=["customer_behavior:v1"],
+         entity_keys={"customer_id": request.customer_id}
+     )
+ 
+     prediction = float(model.predict(features)[0])
+ 
+     # Cache result (1 hour TTL)
+     cache.setex(
+         cache_key,
+         timedelta(hours=1),
+         str(prediction)
+     )
+ 
+     return PredictionResponse(
+         prediction=prediction,
+         cached=False,
+         model_version="v2"
+     )
+ 
+ @app.get("/cache-stats")
+ async def cache_stats():
+     """Cache performance and cost metrics"""
+     stats = cost_tracker.get_stats()
+ 
+     return {
+         "cache_hit_rate": stats.hit_rate,
+         "total_requests": stats.total_requests,
+         "cache_hits": stats.cache_hits,
+         "cache_misses": stats.cache_misses,
+         "cost_savings": f"${stats.cost_savings:.2f}",
+         "avg_latency_cached": f"{stats.avg_latency_cached:.2f}ms",
+         "avg_latency_uncached": f"{stats.avg_latency_uncached:.2f}ms"
+     }
+ 
+ # Expected results:
+ # - Cache hit rate: 80-95%
+ # - Cost reduction: 60-80% (fewer model inferences)
+ # - Latency improvement: 10-50x for cached requests
+ ```
+ 
+ ### Batch Inference for Cost Efficiency
+ ```python
+ from batch_inference import BatchInferenceEngine
+ from azure.ai.ml import Input, Output
+ from finops_tracker import BatchCostTracker
+ 
+ # `ml_client` is the authenticated MLClient from the auto-scaling example above
+ 
+ class OptimizedBatchInference:
+     """Cost-optimized batch inference"""
+ 
+     def __init__(self):
+         self.engine = BatchInferenceEngine()
+         self.cost_tracker = BatchCostTracker()
+ 
+     def batch_predict(
+         self,
+         input_data_path: str,
+         output_path: str,
+         batch_size: int = 1000,
+         use_spot: bool = True
+     ):
+         """Run batch inference with cost optimization"""
+ 
+         # Use spot/low-priority instances for 60-90% savings
+         compute_config = {
+             "instance_type": "Standard_D4s_v3",
+             "instance_count": 4,
+             "tier": "LowPriority" if use_spot else "Dedicated",
+             "max_concurrent_tasks": 4
+         }
+ 
+         with self.cost_tracker.track_batch_job():
+             # Configure batch job
+             batch_job = self.engine.create_batch_job(
+                 name="churn_prediction_batch",
+                 model="azureml:churn_predictor:2",
+                 compute=compute_config,
+                 mini_batch_size=batch_size,
+                 retry_settings={
+                     "max_retries": 3,
+                     "timeout": 300
+                 },
+                 environment_variables={
+                     "BATCH_SIZE": str(batch_size),
+                     "ENABLE_OPTIMIZATION": "true"
+                 },
+                 inputs={
+                     "input_data": Input(
+                         type="uri_folder",
+                         path=input_data_path
+                     )
+                 },
+                 outputs={
+                     "predictions": Output(
+                         type="uri_folder",
+                         path=output_path
+                     )
+                 }
+             )
+ 
+             # Run batch inference against the batch endpoint ("churn-batch-endpoint" is a placeholder name)
+             job = ml_client.batch_endpoints.invoke(
+                 endpoint_name="churn-batch-endpoint",
+                 deployment_name="batch-deployment",
+                 inputs=batch_job.inputs,
+                 outputs=batch_job.outputs
+             )
+ 
+             # Wait for completion
+             ml_client.jobs.stream(job.name)
+ 
+             # Cost analysis
+             cost_report = self.cost_tracker.generate_report()
+             print("\nBatch Inference Cost Report:")
+             print(f"Total predictions: {cost_report.total_predictions:,}")
+             print(f"Total cost: ${cost_report.total_cost:.2f}")
+             print(f"Cost per 1000 predictions: ${cost_report.cost_per_1k:.4f}")
+             print(f"Spot savings: ${cost_report.spot_savings:.2f}")
+             print(f"Duration: {cost_report.duration_minutes:.2f} minutes")
+             print(f"Throughput: {cost_report.throughput_per_minute:,.0f} predictions/min")
+ 
+             # Compare with online serving
+             online_cost = cost_report.total_predictions * 0.001  # Assume $0.001 per prediction
+             print("\nCost comparison:")
+             print(f"Batch inference: ${cost_report.total_cost:.2f}")
+             print(f"Online serving equivalent: ${online_cost:.2f}")
+             print(f"Savings: ${online_cost - cost_report.total_cost:.2f} ({((online_cost - cost_report.total_cost) / online_cost * 100):.1f}%)")
+ 
+         return cost_report
+ 
+ # Usage
+ batch_engine = OptimizedBatchInference()
+ 
+ # Run batch prediction (often far cheaper than online serving for large batches)
+ cost_report = batch_engine.batch_predict(
+     input_data_path="azureml://datasets/scoring_data/labels/latest",
+     output_path="azureml://datastores/predictions/paths/batch_2024_01/",
+     batch_size=1000,
+     use_spot=True
+ )
+ ```
+ 
+ ### Model Optimization for Faster Inference
+ ```python
+ import time
+ 
+ import numpy as np
+ import onnxruntime as ort
+ 
+ from model_optimizer import ModelOptimizer
+ from finops_tracker import InferenceCostTracker
+ 
+ class OptimizedModelServer:
+     """Optimized model serving with ONNX"""
+ 
+     def __init__(self, model_path: str):
+         self.optimizer = ModelOptimizer()
+         self.cost_tracker = InferenceCostTracker()
+ 
+         # Convert to ONNX for 2-5x speedup
+         self.onnx_model = self.optimizer.convert_to_onnx(
+             model_path=model_path,
+             opset_version=13
+         )
+ 
+         # Quantize for 4x size reduction and faster inference
+         self.quantized_model = self.optimizer.quantize(
+             self.onnx_model,
+             quantization_mode="dynamic",  # or "static" for more accuracy
+             optimize_for="latency"        # or "throughput"
+         )
+ 
+         # Create ONNX Runtime session
+         sess_options = ort.SessionOptions()
+         sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+         sess_options.intra_op_num_threads = 4
+ 
+         self.session = ort.InferenceSession(
+             self.quantized_model,
+             sess_options,
+             providers=['CPUExecutionProvider']  # Use CPU for cost savings
+         )
+ 
+     def predict(self, features: np.ndarray) -> np.ndarray:
+         """Optimized prediction"""
+ 
+         with self.cost_tracker.track_inference():
+             # ONNX inference (2-5x faster than native)
+             input_name = self.session.get_inputs()[0].name
+             output_name = self.session.get_outputs()[0].name
+ 
+             predictions = self.session.run(
+                 [output_name],
+                 {input_name: features.astype(np.float32)}
+             )[0]
+ 
+         return predictions
+ 
+     def benchmark(self, test_data: np.ndarray, num_iterations: int = 1000):
+         """Benchmark optimized vs original model"""
+ 
+         # Original model (for comparison)
+         original_model = self.optimizer.load_original_model()
+ 
+         print("Benchmarking optimized model...")
+ 
+         # Warm up
+         for _ in range(10):
+             self.predict(test_data[:1])
+             original_model.predict(test_data[:1])
+ 
+         # Benchmark optimized
+         start = time.time()
+         for _ in range(num_iterations):
+             self.predict(test_data[:1])
+         optimized_time = time.time() - start
+ 
+         # Benchmark original
+         start = time.time()
+         for _ in range(num_iterations):
+             original_model.predict(test_data[:1])
+         original_time = time.time() - start
+ 
+         # Results
+         speedup = original_time / optimized_time
+         cost_reduction = 1 - (1 / speedup)
+ 
+         print("\nBenchmark Results:")
+         print(f"Original model time: {original_time:.3f}s ({original_time/num_iterations*1000:.2f}ms per prediction)")
+         print(f"Optimized model time: {optimized_time:.3f}s ({optimized_time/num_iterations*1000:.2f}ms per prediction)")
+         print(f"Speedup: {speedup:.2f}x")
+         print(f"Latency reduction: {(1 - optimized_time/original_time)*100:.1f}%")
+         print(f"Cost reduction: {cost_reduction*100:.1f}%")
+         print(f"Model size reduction: {self.optimizer.get_size_reduction():.1f}x")
+ 
+         return {
+             "speedup": speedup,
+             "cost_reduction_percent": cost_reduction * 100,
+             "optimized_latency_ms": optimized_time / num_iterations * 1000
+         }
+ 
+ # Usage
+ server = OptimizedModelServer("models/churn_predictor_v2.pkl")
+ 
+ # Benchmark
+ test_features = np.random.rand(100, 20)
+ results = server.benchmark(test_features)
+ 
+ # Expected results:
+ # - 2-5x speedup with ONNX
+ # - 4x model size reduction with quantization
+ # - 50-80% cost reduction (same throughput with fewer instances)
+ ```
+ 
+ ## 🚀 CI/CD for Model Serving
+ 
+ ### Automated Deployment Pipeline
+ ```yaml
+ # .github/workflows/model-deployment.yml
+ name: Model Deployment Pipeline
+ 
+ on:
+   workflow_run:
+     workflows: ["Model Training Pipeline"]
+     types:
+       - completed
+   workflow_dispatch:
+     inputs:
+       model_version:
+         description: 'Model version to deploy'
+         required: true
+ 
+ jobs:
+   deploy-model:
+     runs-on: ubuntu-latest
+     # Run after a successful training pipeline, or when triggered manually
+     if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' }}
+ 
+     steps:
+       - uses: actions/checkout@v3
+ 
+       - name: Azure Login
+         uses: azure/login@v1
+         with:
+           creds: ${{ secrets.AZURE_CREDENTIALS }}
+ 
+       - name: Get model from registry
+         run: |
+           python scripts/download_model.py \
+             --model-name churn_predictor \
+             --version ${{ github.event.inputs.model_version || 'latest' }}
+ 
+       - name: Optimize model for serving
+         run: |
+           python scripts/optimize_model.py \
+             --input-model ./model \
+             --output-model ./optimized_model \
+             --format onnx \
+             --quantize dynamic
+ 
+       - name: Build Docker image
+         run: |
+           docker build -t ${{ secrets.ACR_NAME }}.azurecr.io/churn-predictor:${{ github.sha }} \
+             -f Dockerfile.serving .
+ 
+       - name: Run container security scan
+         run: |
+           docker scan ${{ secrets.ACR_NAME }}.azurecr.io/churn-predictor:${{ github.sha }}
+ 
+       - name: Push to container registry
+         run: |
+           az acr login --name ${{ secrets.ACR_NAME }}
+           docker push ${{ secrets.ACR_NAME }}.azurecr.io/churn-predictor:${{ github.sha }}
+ 
+       - name: Run inference tests
+         run: |
+           docker run -d -p 8000:8000 \
+             ${{ secrets.ACR_NAME }}.azurecr.io/churn-predictor:${{ github.sha }}
+           sleep 10
+           pytest tests/inference/ --endpoint http://localhost:8000
+ 
+       - name: Deploy to staging (canary)
+         run: |
+           python scripts/deploy_model.py \
+             --environment staging \
+             --image ${{ secrets.ACR_NAME }}.azurecr.io/churn-predictor:${{ github.sha }} \
+             --traffic-percent 10
+ 
+       - name: Run load tests
+         run: |
+           locust -f tests/load/test_endpoint.py \
+             --headless \
+             --users 100 \
+             --spawn-rate 10 \
+             --run-time 5m \
+             --host https://staging-churn-api.azurewebsites.net
+ 
+       - name: Monitor canary performance
+         run: |
+           python scripts/monitor_canary.py \
+             --duration 30m \
+             --min-success-rate 99 \
+             --max-latency-p95 100
+ 
+       - name: Promote to production
+         if: success()
+         run: |
+           python scripts/deploy_model.py \
+             --environment production \
+             --image ${{ secrets.ACR_NAME }}.azurecr.io/churn-predictor:${{ github.sha }} \
+             --strategy blue-green \
+             --traffic-percent 100
+ 
+       - name: Generate deployment report
+         run: python scripts/deployment_report.py
+ ```
+ 
+ ## 📊 Metrics & Monitoring
+ 
+ | Metric Category | Metric | Target | Tool |
+ |-----------------|--------|--------|------|
+ | **Serving Costs** | Cost per 1000 predictions | <$0.05 | FinOps tracker |
+ | | Monthly serving costs | <$1500 | Azure Cost Management |
+ | | Auto-scaling savings | >50% | Cost tracker |
+ | | Cache savings | >60% | Redis metrics |
+ | **Performance** | Prediction latency (p95) | <100ms | App Insights |
+ | | Throughput | >1000 req/s | Load balancer |
+ | | Cache hit rate | >80% | Redis |
+ | | Model load time | <10s | Startup metrics |
+ | **Reliability** | Availability (SLA) | >99.9% | Azure Monitor |
+ | | Error rate | <0.1% | API metrics |
+ | | Deployment success rate | >99% | CI/CD metrics |
+ | **Resource Usage** | CPU utilization | 60-80% | Azure Monitor |
+ | | Memory utilization | <80% | Container metrics |
+ | | Instance count | Auto-scaled | HPA metrics |
+ | **API Usage** | Requests per minute | Monitored | API Gateway |
+ | | Rate limit violations | <1% | Gateway logs |
+ 
+ ## 🔄 Integration Workflow
+ 
+ ### End-to-End Serving Pipeline
+ ```
+ 1. Model Registry (ml-07)
+ 
+ 2. Model Optimization (ml-08)
+ 
+ 3. Container Build (do-03)
+ 
+ 4. Security Scan (sa-08)
+ 
+ 5. Staging Deployment (do-06)
+ 
+ 6. Load Testing (ml-04)
+ 
+ 7. Canary Deployment (do-06)
+ 
+ 8. Performance Monitoring (mo-04, do-08)
+ 
+ 9. Production Promotion (ml-04)
+ 
+ 10. Auto-Scaling (fo-06)
+ 
+ 11. Cost Monitoring (fo-01, fo-07)
+ 
+ 12. Drift Detection (mo-05)
+ ```
+ 
+ ## 🎯 Quick Wins
+ 
+ 1. **Enable auto-scaling** - 40-60% serving cost reduction
+ 2. **Implement prediction caching** - 60-80% cost savings
+ 3. **Convert models to ONNX** - 2-5x inference speedup
+ 4. **Use model quantization** - 4x model size reduction
+ 5. **Implement request batching** - 10-50x throughput increase (see the micro-batching sketch after this list)
+ 6. **Right-size instances** - 30-50% cost reduction
+ 7. **Use canary deployments** - Zero-downtime releases
+ 8. **Add health checks** - Better reliability
+ 9. **Enable async processing** - Higher concurrency
+ 10. **Monitor serving costs** - Identify optimization opportunities
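+ 
+ Quick wins 5 and 9 go together: the sketch below is a minimal asyncio micro-batcher that groups concurrent requests into a single model call. It assumes the FastAPI app and loaded `model` from the caching example above; the batch size and wait budget are illustrative.
+ ```python
+ import asyncio
+ 
+ import numpy as np
+ 
+ class MicroBatcher:
+     """Group concurrent requests into one model call (bounded batch size and wait time)."""
+ 
+     def __init__(self, model, max_batch_size: int = 32, max_wait_ms: int = 10):
+         self.model = model
+         self.max_batch_size = max_batch_size
+         self.max_wait_s = max_wait_ms / 1000
+         self.queue: asyncio.Queue = asyncio.Queue()
+ 
+     async def predict(self, features: np.ndarray) -> float:
+         """Called per request; resolves once the batched inference completes."""
+         future = asyncio.get_running_loop().create_future()
+         await self.queue.put((features, future))
+         return await future
+ 
+     async def run(self) -> None:
+         """Background worker; start on app startup with asyncio.create_task(batcher.run())."""
+         while True:
+             batch = [await self.queue.get()]
+             deadline = asyncio.get_running_loop().time() + self.max_wait_s
+             # Keep filling the batch until it is full or the wait budget is spent
+             while len(batch) < self.max_batch_size:
+                 remaining = deadline - asyncio.get_running_loop().time()
+                 if remaining <= 0:
+                     break
+                 try:
+                     batch.append(await asyncio.wait_for(self.queue.get(), remaining))
+                 except asyncio.TimeoutError:
+                     break
+             inputs = np.vstack([features for features, _ in batch])
+             predictions = self.model.predict(inputs)  # one inference call for the whole batch
+             for (_, future), prediction in zip(batch, predictions):
+                 future.set_result(float(prediction))
+ 
+ # Usage inside the FastAPI app from the caching example (model assumed loaded):
+ # batcher = MicroBatcher(model, max_batch_size=32, max_wait_ms=10)
+ # on startup: asyncio.create_task(batcher.run())
+ # in the endpoint: score = await batcher.predict(feature_row)
+ ```
+ 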