tech-hub-skills 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. package/LICENSE +21 -0
  2. package/README.md +250 -0
  3. package/bin/cli.js +241 -0
  4. package/bin/copilot.js +182 -0
  5. package/bin/postinstall.js +42 -0
  6. package/package.json +46 -0
  7. package/tech_hub_skills/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -0
  8. package/tech_hub_skills/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -0
  9. package/tech_hub_skills/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -0
  10. package/tech_hub_skills/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -0
  11. package/tech_hub_skills/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -0
  12. package/tech_hub_skills/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -0
  13. package/tech_hub_skills/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -0
  14. package/tech_hub_skills/roles/azure/skills/02-data-factory/README.md +264 -0
  15. package/tech_hub_skills/roles/azure/skills/03-synapse-analytics/README.md +264 -0
  16. package/tech_hub_skills/roles/azure/skills/04-databricks/README.md +264 -0
  17. package/tech_hub_skills/roles/azure/skills/05-functions/README.md +264 -0
  18. package/tech_hub_skills/roles/azure/skills/06-kubernetes-service/README.md +264 -0
  19. package/tech_hub_skills/roles/azure/skills/07-openai-service/README.md +264 -0
  20. package/tech_hub_skills/roles/azure/skills/08-machine-learning/README.md +264 -0
  21. package/tech_hub_skills/roles/azure/skills/09-storage-adls/README.md +264 -0
  22. package/tech_hub_skills/roles/azure/skills/10-networking/README.md +264 -0
  23. package/tech_hub_skills/roles/azure/skills/11-sql-cosmos/README.md +264 -0
  24. package/tech_hub_skills/roles/azure/skills/12-event-hubs/README.md +264 -0
  25. package/tech_hub_skills/roles/code-review/skills/01-automated-code-review/README.md +394 -0
  26. package/tech_hub_skills/roles/code-review/skills/02-pr-review-workflow/README.md +427 -0
  27. package/tech_hub_skills/roles/code-review/skills/03-code-quality-gates/README.md +518 -0
  28. package/tech_hub_skills/roles/code-review/skills/04-reviewer-assignment/README.md +504 -0
  29. package/tech_hub_skills/roles/code-review/skills/05-review-analytics/README.md +540 -0
  30. package/tech_hub_skills/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -0
  31. package/tech_hub_skills/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -0
  32. package/tech_hub_skills/roles/data-engineer/skills/03-data-quality/README.md +579 -0
  33. package/tech_hub_skills/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -0
  34. package/tech_hub_skills/roles/data-engineer/skills/05-performance-optimization/README.md +547 -0
  35. package/tech_hub_skills/roles/data-governance/skills/01-data-catalog/README.md +112 -0
  36. package/tech_hub_skills/roles/data-governance/skills/02-data-lineage/README.md +129 -0
  37. package/tech_hub_skills/roles/data-governance/skills/03-data-quality-framework/README.md +182 -0
  38. package/tech_hub_skills/roles/data-governance/skills/04-access-control/README.md +39 -0
  39. package/tech_hub_skills/roles/data-governance/skills/05-master-data-management/README.md +40 -0
  40. package/tech_hub_skills/roles/data-governance/skills/06-compliance-privacy/README.md +46 -0
  41. package/tech_hub_skills/roles/data-scientist/skills/01-eda-automation/README.md +230 -0
  42. package/tech_hub_skills/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -0
  43. package/tech_hub_skills/roles/data-scientist/skills/03-feature-engineering/README.md +264 -0
  44. package/tech_hub_skills/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -0
  45. package/tech_hub_skills/roles/data-scientist/skills/05-customer-analytics/README.md +264 -0
  46. package/tech_hub_skills/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -0
  47. package/tech_hub_skills/roles/data-scientist/skills/07-experimentation/README.md +264 -0
  48. package/tech_hub_skills/roles/data-scientist/skills/08-data-visualization/README.md +264 -0
  49. package/tech_hub_skills/roles/devops/skills/01-cicd-pipeline/README.md +264 -0
  50. package/tech_hub_skills/roles/devops/skills/02-container-orchestration/README.md +264 -0
  51. package/tech_hub_skills/roles/devops/skills/03-infrastructure-as-code/README.md +264 -0
  52. package/tech_hub_skills/roles/devops/skills/04-gitops/README.md +264 -0
  53. package/tech_hub_skills/roles/devops/skills/05-environment-management/README.md +264 -0
  54. package/tech_hub_skills/roles/devops/skills/06-automated-testing/README.md +264 -0
  55. package/tech_hub_skills/roles/devops/skills/07-release-management/README.md +264 -0
  56. package/tech_hub_skills/roles/devops/skills/08-monitoring-alerting/README.md +264 -0
  57. package/tech_hub_skills/roles/devops/skills/09-devsecops/README.md +265 -0
  58. package/tech_hub_skills/roles/finops/skills/01-cost-visibility/README.md +264 -0
  59. package/tech_hub_skills/roles/finops/skills/02-resource-tagging/README.md +264 -0
  60. package/tech_hub_skills/roles/finops/skills/03-budget-management/README.md +264 -0
  61. package/tech_hub_skills/roles/finops/skills/04-reserved-instances/README.md +264 -0
  62. package/tech_hub_skills/roles/finops/skills/05-spot-optimization/README.md +264 -0
  63. package/tech_hub_skills/roles/finops/skills/06-storage-tiering/README.md +264 -0
  64. package/tech_hub_skills/roles/finops/skills/07-compute-rightsizing/README.md +264 -0
  65. package/tech_hub_skills/roles/finops/skills/08-chargeback/README.md +264 -0
  66. package/tech_hub_skills/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -0
  67. package/tech_hub_skills/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -0
  68. package/tech_hub_skills/roles/ml-engineer/skills/03-model-training/README.md +704 -0
  69. package/tech_hub_skills/roles/ml-engineer/skills/04-model-serving/README.md +845 -0
  70. package/tech_hub_skills/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -0
  71. package/tech_hub_skills/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -0
  72. package/tech_hub_skills/roles/mlops/skills/02-experiment-tracking/README.md +264 -0
  73. package/tech_hub_skills/roles/mlops/skills/03-model-registry/README.md +264 -0
  74. package/tech_hub_skills/roles/mlops/skills/04-feature-store/README.md +264 -0
  75. package/tech_hub_skills/roles/mlops/skills/05-model-deployment/README.md +264 -0
  76. package/tech_hub_skills/roles/mlops/skills/06-model-observability/README.md +264 -0
  77. package/tech_hub_skills/roles/mlops/skills/07-data-versioning/README.md +264 -0
  78. package/tech_hub_skills/roles/mlops/skills/08-ab-testing/README.md +264 -0
  79. package/tech_hub_skills/roles/mlops/skills/09-automated-retraining/README.md +264 -0
  80. package/tech_hub_skills/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -0
  81. package/tech_hub_skills/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -0
  82. package/tech_hub_skills/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -0
  83. package/tech_hub_skills/roles/platform-engineer/skills/04-developer-experience/README.md +57 -0
  84. package/tech_hub_skills/roles/platform-engineer/skills/05-incident-management/README.md +73 -0
  85. package/tech_hub_skills/roles/platform-engineer/skills/06-capacity-management/README.md +59 -0
  86. package/tech_hub_skills/roles/product-designer/skills/01-requirements-discovery/README.md +407 -0
  87. package/tech_hub_skills/roles/product-designer/skills/02-user-research/README.md +382 -0
  88. package/tech_hub_skills/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -0
  89. package/tech_hub_skills/roles/product-designer/skills/04-ux-design/README.md +496 -0
  90. package/tech_hub_skills/roles/product-designer/skills/05-product-market-fit/README.md +376 -0
  91. package/tech_hub_skills/roles/product-designer/skills/06-stakeholder-management/README.md +412 -0
  92. package/tech_hub_skills/roles/security-architect/skills/01-pii-detection/README.md +319 -0
  93. package/tech_hub_skills/roles/security-architect/skills/02-threat-modeling/README.md +264 -0
  94. package/tech_hub_skills/roles/security-architect/skills/03-infrastructure-security/README.md +264 -0
  95. package/tech_hub_skills/roles/security-architect/skills/04-iam/README.md +264 -0
  96. package/tech_hub_skills/roles/security-architect/skills/05-application-security/README.md +264 -0
  97. package/tech_hub_skills/roles/security-architect/skills/06-secrets-management/README.md +264 -0
  98. package/tech_hub_skills/roles/security-architect/skills/07-security-monitoring/README.md +264 -0
  99. package/tech_hub_skills/roles/system-design/skills/01-architecture-patterns/README.md +337 -0
  100. package/tech_hub_skills/roles/system-design/skills/02-requirements-engineering/README.md +264 -0
  101. package/tech_hub_skills/roles/system-design/skills/03-scalability/README.md +264 -0
  102. package/tech_hub_skills/roles/system-design/skills/04-high-availability/README.md +264 -0
  103. package/tech_hub_skills/roles/system-design/skills/05-cost-optimization-design/README.md +264 -0
  104. package/tech_hub_skills/roles/system-design/skills/06-api-design/README.md +264 -0
  105. package/tech_hub_skills/roles/system-design/skills/07-observability-architecture/README.md +264 -0
  106. package/tech_hub_skills/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -0
  107. package/tech_hub_skills/roles/system-design/skills/08-process-automation/README.md +521 -0
  108. package/tech_hub_skills/skills/README.md +336 -0
  109. package/tech_hub_skills/skills/ai-engineer.md +104 -0
  110. package/tech_hub_skills/skills/azure.md +149 -0
  111. package/tech_hub_skills/skills/code-review.md +399 -0
  112. package/tech_hub_skills/skills/compliance-automation.md +747 -0
  113. package/tech_hub_skills/skills/data-engineer.md +113 -0
  114. package/tech_hub_skills/skills/data-governance.md +102 -0
  115. package/tech_hub_skills/skills/data-scientist.md +123 -0
  116. package/tech_hub_skills/skills/devops.md +160 -0
  117. package/tech_hub_skills/skills/docker.md +160 -0
  118. package/tech_hub_skills/skills/enterprise-dashboard.md +613 -0
  119. package/tech_hub_skills/skills/finops.md +184 -0
  120. package/tech_hub_skills/skills/ml-engineer.md +115 -0
  121. package/tech_hub_skills/skills/mlops.md +187 -0
  122. package/tech_hub_skills/skills/optimization-advisor.md +329 -0
  123. package/tech_hub_skills/skills/orchestrator.md +497 -0
  124. package/tech_hub_skills/skills/platform-engineer.md +102 -0
  125. package/tech_hub_skills/skills/process-automation.md +226 -0
  126. package/tech_hub_skills/skills/process-changelog.md +184 -0
  127. package/tech_hub_skills/skills/process-documentation.md +484 -0
  128. package/tech_hub_skills/skills/process-kanban.md +324 -0
  129. package/tech_hub_skills/skills/process-versioning.md +214 -0
  130. package/tech_hub_skills/skills/product-designer.md +104 -0
  131. package/tech_hub_skills/skills/project-starter.md +443 -0
  132. package/tech_hub_skills/skills/security-architect.md +135 -0
  133. package/tech_hub_skills/skills/system-design.md +126 -0
@@ -0,0 +1,608 @@
+ # Skill 4: Real-Time Streaming Pipelines
+
+ ## 🎯 Overview
+ Build and operate production-grade real-time streaming data pipelines with event processing, stateful transformations, exactly-once semantics, and low-latency analytics.
+
+ ## 🔗 Connections
+ - **Data Engineer**: Feeds real-time data to lakehouse (de-01, de-02, de-03)
+ - **ML Engineer**: Real-time feature computation (ml-02, ml-04)
+ - **MLOps**: Online model serving and monitoring (mo-04, mo-05)
+ - **AI Engineer**: Real-time RAG updates (ai-02)
+ - **Data Scientist**: Streaming analytics and dashboards (ds-01)
+ - **Security Architect**: Event encryption and access control (sa-04, sa-05)
+ - **FinOps**: Streaming compute cost optimization (fo-01, fo-06)
+ - **DevOps**: Streaming infrastructure and monitoring (do-03, do-08)
+ - **System Design**: Event-driven architecture patterns (sd-02, sd-05)
+
+ ## 🛠️ Tools Included
+
+ ### 1. `stream_processor.py`
+ Unified streaming processor supporting Kafka, Event Hubs, and Kinesis.
+
+ ### 2. `stateful_transformer.py`
+ Windowing, aggregations, and stateful operations for streaming data.
+
+ ### 3. `exactly_once_handler.py`
+ Idempotency and exactly-once processing guarantees.
+
+ ### 4. `stream_monitor.py`
+ Real-time monitoring of lag, throughput, and data quality.
+
+ ### 5. `stream_schemas.py`
+ Schema registry integration and evolution management.
+
+ ## 📊 Architecture
+
+ ```
+ Event Sources  →  Stream Ingestion  →  Processing       →  Output Sinks
+      ↓                  ↓                  ↓                    ↓
+  IoT/Apps          Kafka/EventHub     Transformations      Lakehouse
+  Webhooks          Checkpointing      Aggregations         Real-time DB
+  CDC               Partitioning       Windowing            Analytics
+ ```
+
+ ## 🚀 Quick Start
+
+ ```python
+ import os
+
+ from pyspark.sql.functions import avg, count, countDistinct
+
+ from stream_processor import StreamProcessor
+ from stream_monitor import StreamMonitor
+ from transformations import window, aggregate
+
+ # user_event_schema, write_to_redis, and update_feature_store are assumed to be
+ # defined alongside this pipeline (schema module and sink helpers).
+
+ # Initialize stream processor
+ processor = StreamProcessor(
+     source="azure_event_hub",
+     connection_string=os.getenv("EVENT_HUB_CONNECTION"),
+     consumer_group="streaming-pipeline",
+     checkpoint_location="abfss://checkpoints@storage.dfs.core.windows.net"
+ )
+
+ # Define streaming query
+ stream = processor.read_stream(
+     topic="user-events",
+     schema=user_event_schema
+ )
+
+ # Transformations with windowing
+ processed = (
+     stream
+     .withWatermark("timestamp", "10 minutes")  # Handle late data
+     .groupBy(
+         window("timestamp", "5 minutes", "1 minute"),  # Sliding window
+         "user_id"
+     )
+     .agg(
+         count("*").alias("event_count"),
+         countDistinct("session_id").alias("session_count"),
+         avg("duration").alias("avg_duration")
+     )
+ )
+
+ # Write each micro-batch to multiple sinks
+ def write_sinks(batch, batch_id):
+     # Write to Delta Lake for historical analysis
+     (batch.write.format("delta")
+         .mode("append")
+         .save("abfss://gold@storage.dfs.core.windows.net/user_metrics"))
+
+     # Write to Redis for real-time serving
+     write_to_redis(batch)
+
+     # Update feature store for ML
+     update_feature_store(batch)
+
+ query = (
+     processed.writeStream
+     .foreachBatch(write_sinks)
+     .option("checkpointLocation", "abfss://checkpoints@storage.dfs.core.windows.net/user_metrics")
+     .trigger(processingTime="1 minute")
+     .start()
+ )
+
+ # Monitor stream health
+ monitor = StreamMonitor(query)
+ monitor.track_metrics(["lag", "throughput", "latency"])
+ ```
+
+ ## 📚 Best Practices
+
+ ### Streaming Architecture (System Design Integration)
+
+ 1. **Event-Driven Design**
+    - Use pub/sub pattern for decoupling
+    - Implement event sourcing for auditability
+    - Design idempotent consumers
+    - Use dead letter queues for failed events (see the sketch after this list)
+    - Reference: System Design sd-02 (Event-Driven Architecture)
+
+ 2. **Partitioning Strategy**
+    - Partition by key for ordered processing
+    - Balance partition sizes for even load
+    - Monitor partition skew
+    - Plan for repartitioning as scale grows
+    - Reference: System Design sd-03 (Scalability)
+
+ 3. **Backpressure Handling**
+    - Implement rate limiting
+    - Use buffering for traffic spikes
+    - Auto-scale consumers based on lag
+    - Use circuit breakers for downstream failures
+    - Reference: System Design sd-05 (Resilience Patterns)
+
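+ A minimal dead-letter sketch for the pattern above, assuming Azure Event Hubs. The `user-events-dlq` hub, the connection string, and the `process()` helper are illustrative placeholders, not part of this package:
+
+ ```python
+ import json
+
+ from azure.eventhub import EventData, EventHubConsumerClient, EventHubProducerClient
+
+ CONN_STR = "<event-hubs-connection-string>"  # placeholder; load from Key Vault in practice
+
+ dlq_producer = EventHubProducerClient.from_connection_string(
+     CONN_STR, eventhub_name="user-events-dlq"
+ )
+
+ def on_event(partition_context, event):
+     try:
+         payload = json.loads(event.body_as_str())
+         process(payload)  # business logic, assumed idempotent and defined elsewhere
+     except Exception as exc:
+         # Forward the poison message to the dead letter hub with error context
+         batch = dlq_producer.create_batch()
+         batch.add(EventData(json.dumps({
+             "error": str(exc),
+             "original_body": event.body_as_str(),
+             "partition": partition_context.partition_id,
+         })))
+         dlq_producer.send_batch(batch)
+     finally:
+         partition_context.update_checkpoint(event)
+
+ consumer = EventHubConsumerClient.from_connection_string(
+     CONN_STR, consumer_group="streaming-pipeline", eventhub_name="user-events"
+ )
+ with consumer:
+     consumer.receive(on_event=on_event, starting_position="-1")
+ ```
+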
+ ### Exactly-Once Semantics
+
+ 4. **Idempotent Processing**
+    - Use deterministic keys for deduplication
+    - Implement idempotent writes (see the sketch after this list)
+    - Track processed message IDs
+    - Handle retries gracefully
+    - Reference: Data Engineer best practices
+
+ 5. **Checkpointing Strategy**
+    - Frequent checkpoints for fault tolerance
+    - Store checkpoints in reliable storage
+    - Test checkpoint recovery regularly
+    - Monitor checkpoint lag
+    - Reference: Data Engineer best practices
+
+ 6. **Transaction Management**
+    - Use transactional writes where possible
+    - Implement two-phase commit for distributed transactions
+    - Handle partial failures gracefully
+    - Maintain strong consistency guarantees
+    - Reference: System Design best practices
+
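+ One way to make the Delta sink idempotent is to deduplicate each micro-batch and `MERGE` on a deterministic key instead of appending blindly. A minimal sketch, assuming a Spark/Delta environment and an `event_id` column in the stream (the table path is illustrative):
+
+ ```python
+ from delta.tables import DeltaTable
+ from pyspark.sql import DataFrame, SparkSession
+
+ # Illustrative target path for the deduplicated table
+ TARGET = "abfss://gold@storage.dfs.core.windows.net/user_metrics_dedup"
+
+ def upsert_batch(batch: DataFrame, batch_id: int) -> None:
+     # Replays can deliver the same event twice: drop duplicates inside the micro-batch first
+     deduped = batch.dropDuplicates(["event_id"])
+
+     spark = SparkSession.getActiveSession()
+     target = DeltaTable.forPath(spark, TARGET)
+     (target.alias("t")
+         .merge(deduped.alias("s"), "t.event_id = s.event_id")
+         .whenNotMatchedInsertAll()   # only events not seen before are written
+         .execute())
+
+ # Wired up exactly like write_sinks in the Quick Start:
+ # stream.writeStream.foreachBatch(upsert_batch).start()
+ ```
+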
+ ### Performance Optimization
+
+ 7. **Throughput Optimization**
+    - Batch processing where latency allows
+    - Optimize serialization (Avro > JSON)
+    - Use compression for network transfer
+    - Parallel processing with proper partitioning
+    - Reference: Data Engineer de-05 (Performance)
+
+ 8. **Latency Optimization**
+    - Minimize transformation complexity
+    - Optimize window sizes for the use case
+    - Use in-memory state stores
+    - Reduce network hops
+    - Reference: Data Engineer de-05 (Performance)
+
+ 9. **State Management**
+    - Use RocksDB for large state (see the config sketch after this list)
+    - Implement state compaction
+    - Monitor state store size
+    - Backup state periodically
+    - Reference: Data Engineer best practices
+
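+ For large aggregation state, Structured Streaming can keep state in RocksDB instead of the executor heap. A minimal sketch, assuming open-source Spark 3.2+ (on Databricks the provider class is `com.databricks.sql.streaming.state.RocksDBStateStoreProvider`):
+
+ ```python
+ from pyspark.sql import SparkSession
+
+ spark = (
+     SparkSession.builder
+     .appName("user-events-processor")
+     # Keep streaming state in RocksDB (Spark 3.2+) rather than on the executor heap
+     .config(
+         "spark.sql.streaming.stateStore.providerClass",
+         "org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider",
+     )
+     .getOrCreate()
+ )
+ ```
+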
+ ### Cost Optimization (FinOps Integration)
+
+ 10. **Right-Size Streaming Clusters**
+     - Monitor CPU and memory utilization
+     - Auto-scale based on lag and throughput
+     - Use spot instances for dev/test
+     - Consolidate low-volume streams
+     - Reference: FinOps fo-06 (Compute Optimization)
+
+ 11. **Optimize Data Transfer Costs**
+     - Compress events before transmission
+     - Use regional endpoints to avoid egress
+     - Batch small messages
+     - Filter early to reduce downstream processing
+     - Reference: FinOps fo-05 (Storage Optimization)
+
+ 12. **Retention and Tiering**
+     - Set appropriate retention policies
+     - Tier old data to cheaper storage
+     - Archive to blob storage for compliance
+     - Monitor storage growth
+     - Reference: FinOps fo-05 (Storage Optimization)
+
+ ### Security (Security Architect Integration)
+
+ 13. **Event Encryption**
+     - Encrypt data in transit (TLS)
+     - Encrypt data at rest in the event store
+     - Use managed keys from Key Vault
+     - Rotate encryption keys regularly
+     - Reference: Security Architect sa-04 (Encryption)
+
+ 14. **Access Control**
+     - Use RBAC for topic access
+     - Implement consumer group isolation
+     - Audit access to streaming data
+     - Use managed identities for authentication
+     - Reference: Security Architect sa-02 (IAM)
+
+ 15. **PII in Streaming Data**
+     - Detect and mask PII in real time (see the masking sketch after this list)
+     - Implement data retention policies
+     - Log PII access for compliance
+     - Support right-to-erasure for GDPR
+     - Reference: Security Architect sa-01 (PII Detection)
+
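+ A minimal sketch of in-flight masking with Spark SQL functions; the column names and the regex are illustrative and do not replace the sa-01 PII detection tooling:
+
+ ```python
+ from pyspark.sql import DataFrame
+ from pyspark.sql.functions import col, regexp_replace, sha2
+
+ def mask_pii(events: DataFrame) -> DataFrame:
+     """Mask obvious PII before events reach downstream sinks."""
+     return (
+         events
+         # Replace anything that looks like an email address
+         .withColumn("email", regexp_replace(col("email"), r"[^@\s]+@[^@\s]+", "***MASKED***"))
+         # Keep a join-able but non-reversible key in place of the raw phone number
+         .withColumn("phone_hash", sha2(col("phone"), 256))
+         .drop("phone")
+     )
+
+ # masked = mask_pii(stream)  # apply before writeStream, as in the Quick Start example
+ ```
+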
+ ### Monitoring & Observability (DevOps Integration)
+
+ 16. **Streaming Metrics**
+     - Monitor consumer lag continuously (see the lag-alert sketch after this list)
+     - Track throughput (events/sec)
+     - Measure end-to-end latency
+     - Alert on processing failures
+     - Reference: DevOps do-08 (Monitoring & Observability)
+
+ 17. **Distributed Tracing**
+     - Trace events end-to-end
+     - Correlate events across systems
+     - Identify bottlenecks
+     - Debug processing issues
+     - Reference: DevOps do-08 (Monitoring & Observability)
+
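+ A lightweight way to watch a Structured Streaming query without extra infrastructure is to poll `StreamingQuery.lastProgress`. A minimal sketch; the `alert` hook is a placeholder for whatever paging or Azure Monitor integration you use:
+
+ ```python
+ import time
+
+ def watch_backlog(query, interval_seconds: int = 60, alert=print):
+     """Alert when the query processes rows slower than they arrive (growing backlog)."""
+     while query.isActive:
+         progress = query.lastProgress  # dict emitted by Structured Streaming after each batch
+         if progress:
+             incoming = progress.get("inputRowsPerSecond") or 0.0
+             processed = progress.get("processedRowsPerSecond") or 0.0
+             if incoming > 0 and processed < incoming:
+                 alert(f"Backlog growing: in={incoming:.0f}/s, out={processed:.0f}/s "
+                       f"(batch {progress.get('batchId')})")
+         time.sleep(interval_seconds)
+
+ # watch_backlog(query)  # `query` is the StreamingQuery started in the Quick Start example
+ ```
+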
+ ### Azure-Specific Best Practices
+
+ 18. **Azure Event Hubs**
+     - Use Capture for automatic archival
+     - Enable auto-inflate for throughput
+     - Design partition keys for even distribution
+     - Monitor namespace metrics
+     - Reference: Azure az-03 (Event-Driven Services)
+
+ 19. **Azure Stream Analytics**
+     - Use for simple, no-code transformations
+     - Optimize streaming units (SUs)
+     - Enable diagnostic logs
+     - Test queries with sample data
+     - Reference: Azure best practices
+
+ 20. **Databricks Structured Streaming**
+     - Use Delta Lake for ACID guarantees
+     - Optimize shuffle partitions
+     - Monitor streaming query metrics
+     - Use optimized writes (see the session config sketch after this list)
+     - Reference: Azure az-02 (Synapse/Databricks)
+
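+ A small session-level sketch of item 20, assuming a Databricks runtime: the same Delta write settings the Terraform job in the Deployment Example applies at the cluster level, plus a shuffle-partition count sized to the cluster rather than the default 200:
+
+ ```python
+ from pyspark.sql import SparkSession
+
+ spark = SparkSession.getActiveSession()  # already created on a Databricks cluster
+
+ spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
+ spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")
+ spark.conf.set("spark.sql.shuffle.partitions", "64")  # match cluster cores, not the default 200
+
+ # Streaming query metrics worth exporting to the do-08 dashboards:
+ #   query.recentProgress  -> recent batch progress dicts (rows/sec, batch duration)
+ #   query.status          -> current state of the query
+ ```
+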
+ ## 💰 Cost Optimization Examples
+
+ ### Auto-Scaling Based on Lag
+ ```python
+ from stream_processor import StreamProcessor
+ from auto_scaler import StreamAutoScaler
+ from finops_tracker import StreamingCostTracker
+
+ processor = StreamProcessor(source="event_hub")
+ scaler = StreamAutoScaler(
+     min_consumers=2,
+     max_consumers=10,
+     target_lag_seconds=30
+ )
+
+ # Monitor and auto-scale
+ @scaler.auto_scale
+ def process_stream():
+     stream = processor.read_stream("user-events")
+
+     # Check current lag
+     lag = processor.get_consumer_lag()
+
+     if lag > 60:  # More than 1 minute lag
+         scaler.scale_up()
+         print(f"Scaling up: lag={lag}s")
+     elif lag < 10:  # Very low lag
+         scaler.scale_down()
+         print(f"Scaling down: lag={lag}s")
+
+     return stream
+
+ # Cost tracking
+ cost_tracker = StreamingCostTracker()
+ cost_tracker.track_stream(
+     stream_name="user-events",
+     consumers=scaler.current_consumers,
+     throughput_mb=processor.get_throughput(),
+     storage_gb=processor.get_storage_usage()
+ )
+
+ # Generate cost report
+ report = cost_tracker.generate_report(period="daily")
+ print(f"Compute cost: ${report.compute_cost:.2f}")
+ print(f"Storage cost: ${report.storage_cost:.2f}")
+ print(f"Data transfer cost: ${report.transfer_cost:.2f}")
+ print(f"Total: ${report.total_cost:.2f}")
+ ```
+
+ ### Optimize Event Serialization
+ ```python
+ import io
+ import json
+
+ import avro.schema
+ from avro.io import BinaryEncoder, DatumWriter
+
+ # Bad: JSON (verbose, slow)
+ def serialize_json(event: dict) -> bytes:
+     return json.dumps(event).encode("utf-8")
+
+ # Good: Avro (compact, fast, schema evolution)
+ def serialize_avro(event: dict, schema: avro.schema.Schema) -> bytes:
+     buffer = io.BytesIO()
+     DatumWriter(schema).write(event, BinaryEncoder(buffer))
+     return buffer.getvalue()
+
+ # Example event and schema (replace with the real event schema)
+ sample_event = {"user_id": "u-123", "session_id": "s-456", "duration": 12.5}
+ event_schema = avro.schema.parse(json.dumps({
+     "type": "record", "name": "UserEvent",
+     "fields": [
+         {"name": "user_id", "type": "string"},
+         {"name": "session_id", "type": "string"},
+         {"name": "duration", "type": "double"},
+     ],
+ }))
+
+ # Compare sizes and costs
+ json_size = len(serialize_json(sample_event))
+ avro_size = len(serialize_avro(sample_event, event_schema))
+
+ print(f"JSON size: {json_size} bytes")
+ print(f"Avro size: {avro_size} bytes")
+ print(f"Size reduction: {(1 - avro_size / json_size) * 100:.1f}%")
+
+ # Cost impact (assuming 1B events/month)
+ events_per_month = 1_000_000_000
+ json_transfer_gb = (json_size * events_per_month) / (1024**3)
+ avro_transfer_gb = (avro_size * events_per_month) / (1024**3)
+
+ transfer_cost_per_gb = 0.05  # Example rate
+ json_cost = json_transfer_gb * transfer_cost_per_gb
+ avro_cost = avro_transfer_gb * transfer_cost_per_gb
+
+ print("\nMonthly data transfer:")
+ print(f"JSON: {json_transfer_gb:.2f} GB → ${json_cost:.2f}")
+ print(f"Avro: {avro_transfer_gb:.2f} GB → ${avro_cost:.2f}")
+ print(f"Monthly savings: ${json_cost - avro_cost:.2f}")
+ ```
+
+ ### Retention Policy Optimization
+ ```python
+ from datetime import datetime, timezone
+
+ from azure.storage.blob import BlobServiceClient
+
+ # Configure retention based on use case
+ retention_policies = {
+     "hot_events": {
+         "retention_days": 7,     # Recent data stays in Event Hubs
+         "tier": "premium"
+     },
+     "warm_events": {
+         "retention_days": 30,    # Move to blob storage
+         "tier": "cool"
+     },
+     "cold_events": {
+         "retention_days": 365,   # Archive for compliance
+         "tier": "archive"
+     }
+ }
+
+ # Implement tiering.  `event_hub` is a management wrapper: retention and Capture
+ # are configured through the Event Hubs management plane, not the data-plane SDK.
+ def tier_streaming_data():
+     # Hot: keep in Event Hubs (7 days)
+     event_hub.set_retention(days=7)
+
+     # Warm: capture to cool blob storage (8-30 days)
+     event_hub.enable_capture(
+         destination="cool_storage",
+         interval_seconds=300,          # 5 minutes
+         size_limit_bytes=314572800     # 300 MB
+     )
+
+     # Cold: archive old data (30+ days) by moving cool-tier blobs to the archive tier
+     blob_client = BlobServiceClient.from_connection_string(conn_str)  # conn_str from config/Key Vault
+     container = blob_client.get_container_client("warm-events")
+
+     for blob in container.list_blobs():
+         age_days = (datetime.now(timezone.utc) - blob.last_modified).days
+         if age_days > 30:
+             blob_client.get_blob_client(
+                 container="warm-events",
+                 blob=blob.name
+             ).set_standard_blob_tier("Archive")
+
+ # Cost comparison
+ hot_cost = 7 * 100 * 0.015     # 7 days, 100 GB/day, $0.015/GB
+ warm_cost = 23 * 100 * 0.01    # 23 days, cool tier
+ cold_cost = 335 * 100 * 0.002  # 335 days, archive tier
+
+ print(f"Hot (Event Hubs): ${hot_cost:.2f}")
+ print(f"Warm (Cool Storage): ${warm_cost:.2f}")
+ print(f"Cold (Archive): ${cold_cost:.2f}")
+ print(f"Total: ${hot_cost + warm_cost + cold_cost:.2f}")
+ print(f"vs. all hot: ${365 * 100 * 0.015:.2f}")
+ ```
+
+ ## 🔒 Security Examples
+
+ ### Encrypt Streaming Data
+ ```python
+ import json
+ import os
+ from datetime import datetime
+
+ from azure.eventhub import EventData, EventHubProducerClient
+ from azure.identity import DefaultAzureCredential
+ from azure.keyvault.secrets import SecretClient
+ from cryptography.fernet import Fernet
+
+ from audit_logger import AuditLogger
+
+ # Use managed identity
+ credential = DefaultAzureCredential()
+
+ # Initialize the Event Hubs producer
+ producer = EventHubProducerClient(
+     fully_qualified_namespace="mynamespace.servicebus.windows.net",
+     eventhub_name="secure-events",
+     credential=credential
+ )
+
+ # Encrypt sensitive fields
+ def encrypt_sensitive_data(event: dict, encryption_key: str) -> dict:
+     cipher = Fernet(encryption_key)
+
+     # Encrypt PII fields
+     if "email" in event:
+         event["email"] = cipher.encrypt(event["email"].encode()).decode()
+     if "phone" in event:
+         event["phone"] = cipher.encrypt(event["phone"].encode()).decode()
+
+     return event
+
+ # Get the encryption key from Key Vault
+ kv_client = SecretClient(
+     vault_url="https://my-keyvault.vault.azure.net/",
+     credential=credential
+ )
+ encryption_key = kv_client.get_secret("stream-encryption-key").value
+
+ # Produce encrypted events (`events` is the iterable of outgoing event dicts)
+ event_batch = producer.create_batch()
+ for event in events:
+     encrypted_event = encrypt_sensitive_data(event, encryption_key)
+     event_batch.add(EventData(json.dumps(encrypted_event)))
+
+ producer.send_batch(event_batch)
+
+ # Audit
+ audit = AuditLogger()
+ audit.log_stream_access(
+     stream="secure-events",
+     action="write",
+     user=os.getenv("USER"),
+     encrypted=True,
+     timestamp=datetime.now()
+ )
+ ```
+
+ ## 📊 Enhanced Metrics & Monitoring
+
+ | Metric Category | Metric | Target | Tool |
+ |-----------------|--------|--------|------|
+ | **Throughput** | Events per second | >10,000 | Azure Monitor |
+ | | Data throughput (MB/s) | >100 | Stream metrics |
+ | | Batch processing time | <5s | Custom metrics |
+ | **Latency** | End-to-end latency (p95) | <1s | Application Insights |
+ | | Processing latency (p95) | <500ms | Stream processor |
+ | | Consumer lag | <30s | Event Hub metrics |
+ | **Reliability** | Processing success rate | >99.9% | Azure Monitor |
+ | | Exactly-once delivery | 100% | Custom validator |
+ | | Checkpoint success rate | >99.5% | Stream metrics |
+ | **Cost** | Cost per million events | <$0.50 | FinOps tracker |
+ | | Storage cost per GB/day | <$0.02 | Cost Management |
+ | | Compute utilization | 60-80% | Azure Monitor |
+ | **Quality** | Schema validation pass rate | >99.9% | Data validator |
+ | | Late event rate | <1% | Watermark metrics |
+ | **Security** | Encrypted events | 100% | Security scans |
+ | | Access violations | 0 | Audit logs |
+
+ ## 🚀 Deployment Example
+
+ ### Streaming Infrastructure as Code
+ ```hcl
+ # terraform/streaming.tf
+
+ resource "azurerm_eventhub_namespace" "streaming" {
+   name                     = "streaming-${var.environment}"
+   location                 = var.location
+   resource_group_name      = azurerm_resource_group.main.name
+   sku                      = "Standard"
+   capacity                 = 2
+   auto_inflate_enabled     = true
+   maximum_throughput_units = 10
+
+   tags = {
+     Environment = var.environment
+     CostCenter  = "DataEngineering"
+   }
+ }
+
+ resource "azurerm_eventhub" "user_events" {
+   name                = "user-events"
+   namespace_name      = azurerm_eventhub_namespace.streaming.name
+   resource_group_name = azurerm_resource_group.main.name
+   partition_count     = 32
+   message_retention   = 7
+
+   capture_description {
+     enabled             = true
+     encoding            = "Avro"
+     interval_in_seconds = 300
+     size_limit_in_bytes = 314572800
+
+     destination {
+       name                = "EventHubArchive.AzureBlockBlob"
+       archive_name_format = "{Namespace}/{EventHub}/{PartitionId}/{Year}/{Month}/{Day}/{Hour}/{Minute}/{Second}"
+       blob_container_name = "streaming-archive"
+       storage_account_id  = azurerm_storage_account.lakehouse.id
+     }
+   }
+ }
+
+ # Databricks job for stream processing
+ resource "databricks_job" "stream_processor" {
+   name = "user-events-processor"
+
+   new_cluster {
+     spark_version = "13.3.x-scala2.12"
+     node_type_id  = "Standard_DS3_v2"
+
+     autoscale {
+       min_workers = 2
+       max_workers = 10
+     }
+
+     spark_conf = {
+       "spark.databricks.delta.optimizeWrite.enabled" = "true"
+       "spark.databricks.delta.autoCompact.enabled"   = "true"
+     }
+   }
+
+   spark_python_task {
+     python_file = "dbfs:/streaming/process_user_events.py"
+   }
+
+   schedule {
+     quartz_cron_expression = "0 0/5 * * * ?"  # Every 5 minutes
+     timezone_id            = "UTC"
+   }
+ }
+ ```
+
+ ## 🔄 Integration Workflow
+
+ ### End-to-End Streaming Pipeline
+ ```
+ 1. Event Production (IoT/Apps/CDC)
+
+ 2. Event Hub Ingestion (de-04)
+
+ 3. Schema Validation (de-03)
+
+ 4. PII Detection & Masking (sa-01)
+
+ 5. Stream Processing (de-04)
+    - Windowing
+    - Aggregations
+    - Enrichment
+
+ 6. Multi-Sink Output
+    ├── Delta Lake (de-01) → Historical analysis
+    ├── Redis → Real-time serving (ml-04)
+    ├── Feature Store → ML features (ml-02)
+    └── Analytics → Dashboards (ds-01)
+
+ 7. Monitoring (do-08)
+    - Lag tracking
+    - Throughput monitoring
+    - Quality metrics
+
+ 8. Cost Optimization (fo-01)
+    - Auto-scaling
+    - Retention policies
+    - Compression
+ ```
+
+ ## 🎯 Quick Wins
+
+ 1. **Enable Event Hub Capture** - Automatic archival to blob storage
+ 2. **Implement auto-scaling** - 30-50% cost reduction
+ 3. **Use Avro serialization** - 40-60% bandwidth savings
+ 4. **Set up lag monitoring** - Prevent data delays
+ 5. **Implement checkpointing** - Fault tolerance and recovery
+ 6. **Add schema validation** - Catch bad events early
+ 7. **Enable encryption** - Data security compliance
+ 8. **Optimize partitioning** - Better parallelism and throughput
+ 9. **Set retention policies** - 60-80% storage cost reduction
+ 10. **Use watermarking** - Handle late-arriving data correctly