tech-hub-skills 1.2.0 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/{LICENSE → .claude/LICENSE} +21 -21
- package/.claude/README.md +291 -0
- package/.claude/bin/cli.js +266 -0
- package/{bin → .claude/bin}/copilot.js +182 -182
- package/{bin → .claude/bin}/postinstall.js +42 -42
- package/{tech_hub_skills/skills → .claude/commands}/README.md +336 -336
- package/{tech_hub_skills/skills → .claude/commands}/ai-engineer.md +104 -104
- package/{tech_hub_skills/skills → .claude/commands}/aws.md +143 -143
- package/{tech_hub_skills/skills → .claude/commands}/azure.md +149 -149
- package/{tech_hub_skills/skills → .claude/commands}/backend-developer.md +108 -108
- package/{tech_hub_skills/skills → .claude/commands}/code-review.md +399 -399
- package/{tech_hub_skills/skills → .claude/commands}/compliance-automation.md +747 -747
- package/{tech_hub_skills/skills → .claude/commands}/compliance-officer.md +108 -108
- package/{tech_hub_skills/skills → .claude/commands}/data-engineer.md +113 -113
- package/{tech_hub_skills/skills → .claude/commands}/data-governance.md +102 -102
- package/{tech_hub_skills/skills → .claude/commands}/data-scientist.md +123 -123
- package/{tech_hub_skills/skills → .claude/commands}/database-admin.md +109 -109
- package/{tech_hub_skills/skills → .claude/commands}/devops.md +160 -160
- package/{tech_hub_skills/skills → .claude/commands}/docker.md +160 -160
- package/{tech_hub_skills/skills → .claude/commands}/enterprise-dashboard.md +613 -613
- package/{tech_hub_skills/skills → .claude/commands}/finops.md +184 -184
- package/{tech_hub_skills/skills → .claude/commands}/frontend-developer.md +108 -108
- package/{tech_hub_skills/skills → .claude/commands}/gcp.md +143 -143
- package/{tech_hub_skills/skills → .claude/commands}/ml-engineer.md +115 -115
- package/{tech_hub_skills/skills → .claude/commands}/mlops.md +187 -187
- package/{tech_hub_skills/skills → .claude/commands}/network-engineer.md +109 -109
- package/{tech_hub_skills/skills → .claude/commands}/optimization-advisor.md +329 -329
- package/{tech_hub_skills/skills → .claude/commands}/orchestrator.md +623 -623
- package/{tech_hub_skills/skills → .claude/commands}/platform-engineer.md +102 -102
- package/{tech_hub_skills/skills → .claude/commands}/process-automation.md +226 -226
- package/{tech_hub_skills/skills → .claude/commands}/process-changelog.md +184 -184
- package/{tech_hub_skills/skills → .claude/commands}/process-documentation.md +484 -484
- package/{tech_hub_skills/skills → .claude/commands}/process-kanban.md +324 -324
- package/{tech_hub_skills/skills → .claude/commands}/process-versioning.md +214 -214
- package/{tech_hub_skills/skills → .claude/commands}/product-designer.md +104 -104
- package/{tech_hub_skills/skills → .claude/commands}/project-starter.md +443 -443
- package/{tech_hub_skills/skills → .claude/commands}/qa-engineer.md +109 -109
- package/{tech_hub_skills/skills → .claude/commands}/security-architect.md +135 -135
- package/{tech_hub_skills/skills → .claude/commands}/sre.md +109 -109
- package/{tech_hub_skills/skills → .claude/commands}/system-design.md +126 -126
- package/{tech_hub_skills/skills → .claude/commands}/technical-writer.md +101 -101
- package/.claude/package.json +46 -0
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -0
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -0
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -0
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
- package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -0
- package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -0
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
- package/{tech_hub_skills → .claude}/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/02-data-factory/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/03-synapse-analytics/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/04-databricks/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/05-functions/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/06-kubernetes-service/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/07-openai-service/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/08-machine-learning/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/09-storage-adls/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/10-networking/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/11-sql-cosmos/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/12-event-hubs/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/code-review/skills/01-automated-code-review/README.md +394 -394
- package/{tech_hub_skills → .claude}/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
- package/{tech_hub_skills → .claude}/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
- package/{tech_hub_skills → .claude}/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
- package/{tech_hub_skills → .claude}/roles/code-review/skills/05-review-analytics/README.md +540 -540
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
- package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -0
- package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -0
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/03-data-quality/README.md +579 -579
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/01-data-catalog/README.md +112 -112
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/02-data-lineage/README.md +129 -129
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/04-access-control/README.md +39 -39
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/05-master-data-management/README.md +40 -40
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
- package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -0
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/07-experimentation/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/02-container-orchestration/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/04-gitops/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/05-environment-management/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/06-automated-testing/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/07-release-management/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/09-devsecops/README.md +265 -265
- package/{tech_hub_skills → .claude}/roles/finops/skills/01-cost-visibility/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/02-resource-tagging/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/03-budget-management/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/04-reserved-instances/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/05-spot-optimization/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/06-storage-tiering/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/08-chargeback/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/03-model-training/README.md +704 -704
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
- package/{tech_hub_skills → .claude}/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/03-model-registry/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/04-feature-store/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/05-model-deployment/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/06-model-observability/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/07-data-versioning/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/08-ab-testing/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/09-automated-retraining/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/02-user-research/README.md +382 -382
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/04-ux-design/README.md +496 -496
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/01-pii-detection/README.md +319 -319
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/04-iam/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/05-application-security/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/06-secrets-management/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
- package/{tech_hub_skills → .claude}/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/03-scalability/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/04-high-availability/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/06-api-design/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/07-observability-architecture/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
- package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/README.md +521 -521
- package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -0
- package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -0
- package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -0
- package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -0
- package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -0
- package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -0
- package/.claude/skills/README.md +336 -0
- package/.claude/skills/ai-engineer.md +104 -0
- package/.claude/skills/aws.md +143 -0
- package/.claude/skills/azure.md +149 -0
- package/.claude/skills/backend-developer.md +108 -0
- package/.claude/skills/code-review.md +399 -0
- package/.claude/skills/compliance-automation.md +747 -0
- package/.claude/skills/compliance-officer.md +108 -0
- package/.claude/skills/data-engineer.md +113 -0
- package/.claude/skills/data-governance.md +102 -0
- package/.claude/skills/data-scientist.md +123 -0
- package/.claude/skills/database-admin.md +109 -0
- package/.claude/skills/devops.md +160 -0
- package/.claude/skills/docker.md +160 -0
- package/.claude/skills/enterprise-dashboard.md +613 -0
- package/.claude/skills/finops.md +184 -0
- package/.claude/skills/frontend-developer.md +108 -0
- package/.claude/skills/gcp.md +143 -0
- package/.claude/skills/ml-engineer.md +115 -0
- package/.claude/skills/mlops.md +187 -0
- package/.claude/skills/network-engineer.md +109 -0
- package/.claude/skills/optimization-advisor.md +329 -0
- package/.claude/skills/orchestrator.md +623 -0
- package/.claude/skills/platform-engineer.md +102 -0
- package/.claude/skills/process-automation.md +226 -0
- package/.claude/skills/process-changelog.md +184 -0
- package/.claude/skills/process-documentation.md +484 -0
- package/.claude/skills/process-kanban.md +324 -0
- package/.claude/skills/process-versioning.md +214 -0
- package/.claude/skills/product-designer.md +104 -0
- package/.claude/skills/project-starter.md +443 -0
- package/.claude/skills/qa-engineer.md +109 -0
- package/.claude/skills/security-architect.md +135 -0
- package/.claude/skills/sre.md +109 -0
- package/.claude/skills/system-design.md +126 -0
- package/.claude/skills/technical-writer.md +101 -0
- package/.gitattributes +2 -0
- package/GITHUB_COPILOT.md +106 -0
- package/README.md +192 -291
- package/package.json +16 -46
- package/bin/cli.js +0 -241
package/{tech_hub_skills → .claude}/roles/data-engineer/skills/04-streaming-pipelines/README.md
RENAMED
@@ -1,608 +1,608 @@
# Skill 4: Real-Time Streaming Pipelines

## 🎯 Overview
Build and operate production-grade real-time streaming data pipelines with event processing, stateful transformations, exactly-once semantics, and low-latency analytics.

## 🔗 Connections
- **Data Engineer**: Feeds real-time data to lakehouse (de-01, de-02, de-03)
- **ML Engineer**: Real-time feature computation (ml-02, ml-04)
- **MLOps**: Online model serving and monitoring (mo-04, mo-05)
- **AI Engineer**: Real-time RAG updates (ai-02)
- **Data Scientist**: Streaming analytics and dashboards (ds-01)
- **Security Architect**: Event encryption and access control (sa-04, sa-05)
- **FinOps**: Streaming compute cost optimization (fo-01, fo-06)
- **DevOps**: Streaming infrastructure and monitoring (do-03, do-08)
- **System Design**: Event-driven architecture patterns (sd-02, sd-05)

## 🛠️ Tools Included

### 1. `stream_processor.py`
Unified streaming processor supporting Kafka, Event Hubs, and Kinesis.

### 2. `stateful_transformer.py`
Windowing, aggregations, and stateful operations for streaming data.

### 3. `exactly_once_handler.py`
Idempotency and exactly-once processing guarantees.

### 4. `stream_monitor.py`
Real-time monitoring of lag, throughput, and data quality.

### 5. `stream_schemas.py`
Schema registry integration and evolution management.

## 📊 Architecture

```
Event Sources → Stream Ingestion → Processing      → Output Sinks
      ↓                ↓                ↓                  ↓
   IoT/Apps      Kafka/EventHub   Transformations     Lakehouse
   Webhooks      Checkpointing    Aggregations        Real-time DB
   CDC           Partitioning     Windowing           Analytics
```

## 🚀 Quick Start

```python
import os

from stream_processor import StreamProcessor
from transformations import window, aggregate

# Initialize stream processor
processor = StreamProcessor(
    source="azure_event_hub",
    connection_string=os.getenv("EVENT_HUB_CONNECTION"),
    consumer_group="streaming-pipeline",
    checkpoint_location="abfss://checkpoints@storage.dfs.core.windows.net"
)

# Define streaming query
stream = processor.read_stream(
    topic="user-events",
    schema=user_event_schema
)

# Transformations with windowing
processed = (
    stream
    .withWatermark("timestamp", "10 minutes")  # Handle late data
    .groupBy(
        window("timestamp", "5 minutes", "1 minute"),  # Sliding window
        "user_id"
    )
    .agg(
        count("*").alias("event_count"),
        countDistinct("session_id").alias("session_count"),
        avg("duration").alias("avg_duration")
    )
)

# Write to multiple sinks
query = (
    processed.writeStream
    .foreachBatch(lambda batch, batch_id: (
        # Write to Delta Lake for historical analysis
        batch.write.format("delta")
        .mode("append")
        .save("abfss://gold@storage.dfs.core.windows.net/user_metrics"),

        # Write to Redis for real-time serving
        write_to_redis(batch),

        # Update feature store for ML
        update_feature_store(batch)
    ))
    .option("checkpointLocation", "abfss://checkpoints@storage.dfs.core.windows.net/user_metrics")
    .trigger(processingTime="1 minute")
    .start()
)

# Monitor stream health
from stream_monitor import StreamMonitor
monitor = StreamMonitor(query)
monitor.track_metrics(["lag", "throughput", "latency"])
```

## 📚 Best Practices

### Streaming Architecture (System Design Integration)

1. **Event-Driven Design**
   - Use pub/sub pattern for decoupling
   - Implement event sourcing for auditability
   - Design idempotent consumers
   - Use dead letter queues for failed events (see the sketch after this list)
   - Reference: System Design sd-02 (Event-Driven Architecture)

2. **Partitioning Strategy**
   - Partition by key for ordered processing
   - Balance partition sizes for even load
   - Monitor partition skew
   - Plan for repartitioning as scale grows
   - Reference: System Design sd-03 (Scalability)

3. **Backpressure Handling**
   - Implement rate limiting
   - Use buffering for traffic spikes
   - Auto-scale consumers based on lag
   - Circuit breakers for downstream failures
   - Reference: System Design sd-05 (Resilience Patterns)
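
A minimal sketch of the dead-letter and partition-key points above, using the `azure-eventhub` SDK directly rather than this package's wrappers. The `user-events-deadletter` hub name and the `process_event` handler are illustrative assumptions; only the `EVENT_HUB_CONNECTION` environment variable is carried over from the Quick Start.

```python
import json
import os

from azure.eventhub import EventData, EventHubProducerClient

# Hypothetical dead-letter hub; failed events are forwarded here instead of being dropped.
deadletter_producer = EventHubProducerClient.from_connection_string(
    os.getenv("EVENT_HUB_CONNECTION"),
    eventhub_name="user-events-deadletter",
)

def handle_event(event: dict) -> None:
    """Process one event; on failure, publish it to the dead-letter hub."""
    try:
        process_event(event)  # assumed application-specific handler
    except Exception as exc:
        # Reuse user_id as the partition key so per-user ordering survives the detour.
        batch = deadletter_producer.create_batch(
            partition_key=str(event.get("user_id", "unknown"))
        )
        batch.add(EventData(json.dumps({"event": event, "error": str(exc)})))
        deadletter_producer.send_batch(batch)
```

A separate consumer can then drain the dead-letter hub for inspection or replay without blocking the main pipeline.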

### Exactly-Once Semantics

4. **Idempotent Processing**
   - Use deterministic keys for deduplication
   - Implement idempotent writes
   - Track processed message IDs
   - Handle retries gracefully
   - Reference: Data Engineer best practices

5. **Checkpointing Strategy**
   - Frequent checkpoints for fault tolerance
   - Store checkpoints in reliable storage
   - Test checkpoint recovery regularly
   - Monitor checkpoint lag
   - Reference: Data Engineer best practices

6. **Transaction Management**
   - Use transactional writes where possible
   - Implement two-phase commit for distributed transactions
   - Handle partial failures gracefully
   - Maintain strong consistency guarantees
   - Reference: System Design best practices
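
One way to make the Delta sink from the Quick Start idempotent is to upsert on a deterministic event ID instead of blindly appending, so micro-batches replayed after a checkpoint recovery do not create duplicates. A minimal sketch with `delta-spark`; the `event_id` column is an assumption, and the gold path is the one used in the Quick Start.

```python
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

GOLD_PATH = "abfss://gold@storage.dfs.core.windows.net/user_metrics"

def upsert_batch(batch_df, batch_id: int) -> None:
    """foreachBatch sink: insert each event at most once, keyed by event_id."""
    spark = SparkSession.getActiveSession()
    deduped = batch_df.dropDuplicates(["event_id"])  # drop retries within the batch
    target = DeltaTable.forPath(spark, GOLD_PATH)
    (
        target.alias("t")
        .merge(deduped.alias("s"), "t.event_id = s.event_id")
        .whenNotMatchedInsertAll()  # IDs already written in earlier batches are skipped
        .execute()
    )

# Usage: processed.writeStream.foreachBatch(upsert_batch).option(...).start()
```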

### Performance Optimization

7. **Throughput Optimization**
   - Batch processing where latency allows
   - Optimize serialization (Avro > JSON)
   - Use compression for network transfer
   - Parallel processing with proper partitioning
   - Reference: Data Engineer de-05 (Performance)

8. **Latency Optimization**
   - Minimize transformation complexity
   - Optimize window sizes for use case
   - Use in-memory state stores
   - Reduce network hops
   - Reference: Data Engineer de-05 (Performance)

9. **State Management**
   - Use RocksDB for large state
   - Implement state compaction
   - Monitor state store size
   - Backup state periodically
   - Reference: Data Engineer best practices

### Cost Optimization (FinOps Integration)

10. **Right-Size Streaming Clusters**
    - Monitor CPU and memory utilization
    - Auto-scale based on lag and throughput
    - Use spot instances for dev/test
    - Consolidate low-volume streams
    - Reference: FinOps fo-06 (Compute Optimization)

11. **Optimize Data Transfer Costs**
    - Compress events before transmission
    - Use regional endpoints to avoid egress
    - Batch small messages
    - Filter early to reduce downstream processing
    - Reference: FinOps fo-05 (Storage Optimization)

12. **Retention and Tiering**
    - Set appropriate retention policies
    - Tier old data to cheaper storage
    - Archive to blob storage for compliance
    - Monitor storage growth
    - Reference: FinOps fo-05 (Storage Optimization)

### Security (Security Architect Integration)

13. **Event Encryption**
    - Encrypt data in transit (TLS)
    - Encrypt data at rest in event store
    - Use managed keys from Key Vault
    - Rotate encryption keys regularly
    - Reference: Security Architect sa-04 (Encryption)

14. **Access Control**
    - Use RBAC for topic access
    - Implement consumer group isolation
    - Audit access to streaming data
    - Managed identities for authentication
    - Reference: Security Architect sa-02 (IAM)

15. **PII in Streaming Data**
    - Detect and mask PII in real-time
    - Implement data retention policies
    - Log PII access for compliance
    - Right-to-erasure for GDPR
    - Reference: Security Architect sa-01 (PII Detection)
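
A minimal sketch of in-flight masking for the PII point above, assuming events are plain dicts before serialization. The regex patterns and placeholder tokens are illustrative; a production pipeline would source its detectors from the sa-01 PII detection skill referenced above.

```python
import re

# Illustrative patterns only; real deployments should rely on the sa-01 tooling.
EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
PHONE_RE = re.compile(r"\+?\d[\d\s().-]{7,}\d")

def mask_pii(event: dict) -> dict:
    """Return a copy of the event with e-mail addresses and phone numbers masked."""
    masked = dict(event)
    for key, value in masked.items():
        if isinstance(value, str):
            value = EMAIL_RE.sub("[EMAIL]", value)
            value = PHONE_RE.sub("[PHONE]", value)
            masked[key] = value
    return masked

# Example: mask before the event ever reaches a sink or a log line.
safe_event = mask_pii({"user_id": 42, "note": "reach me at jane@example.com"})
```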

### Monitoring & Observability (DevOps Integration)

16. **Streaming Metrics**
    - Monitor consumer lag continuously (see the sketch after this list)
    - Track throughput (events/sec)
    - Measure end-to-end latency
    - Alert on processing failures
    - Reference: DevOps do-08 (Monitoring & Observability)

17. **Distributed Tracing**
    - Trace events end-to-end
    - Correlate events across systems
    - Identify bottlenecks
    - Debug processing issues
    - Reference: DevOps do-08 (Monitoring & Observability)
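
The same throughput and batch-duration signals tracked by `stream_monitor.py` can also be read straight from Structured Streaming's progress reports. A minimal polling sketch, assuming `query` is the handle from the Quick Start; the threshold value and the `alert` notification hook are assumptions.

```python
import time

def watch_query(query, max_batch_seconds: float = 5.0, interval: float = 60.0) -> None:
    """Poll Structured Streaming progress and flag slow micro-batches."""
    while query.isActive:
        progress = query.lastProgress  # dict emitted by Spark after each micro-batch
        if progress:
            batch_seconds = progress["durationMs"]["triggerExecution"] / 1000.0
            rows_per_sec = progress.get("processedRowsPerSecond", 0.0)
            print(f"batch={progress['batchId']} rate={rows_per_sec:.0f} rows/s "
                  f"duration={batch_seconds:.1f}s")
            if batch_seconds > max_batch_seconds:
                alert(f"Streaming batch exceeded {max_batch_seconds}s")  # assumed alert hook
        time.sleep(interval)
```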

### Azure-Specific Best Practices

18. **Azure Event Hubs**
    - Use capture for automatic archival
    - Enable auto-inflate for throughput
    - Partition key design for even distribution
    - Monitor namespace metrics
    - Reference: Azure az-03 (Event-Driven Services)

19. **Azure Stream Analytics**
    - Use for simple transformations (no code)
    - Optimize streaming units (SU)
    - Enable diagnostic logs
    - Test queries with sample data
    - Reference: Azure best practices

20. **Databricks Structured Streaming**
    - Use Delta Lake for ACID guarantees
    - Optimize shuffle partitions
    - Monitor streaming query metrics
    - Use optimized writes
    - Reference: Azure az-02 (Synapse/Databricks)
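
A short sketch of the Structured Streaming session tuning suggested in items 9 and 20, assuming an active SparkSession named `spark` (for example a Databricks notebook). The shuffle-partition value is an illustrative starting point, the RocksDB provider class is the open-source Spark 3.2+ one, and the optimized-write flag is the same setting the Terraform job below applies at cluster level.

```python
# Match shuffle parallelism to the cluster rather than the 200-partition default.
spark.conf.set("spark.sql.shuffle.partitions", "64")

# Keep large streaming state in RocksDB instead of the in-memory store.
spark.conf.set(
    "spark.sql.streaming.stateStore.providerClass",
    "org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider",
)

# Optimized writes to Delta sinks (cluster-level equivalent appears in the Terraform example).
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
```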

## 💰 Cost Optimization Examples

### Auto-Scaling Based on Lag
```python
from stream_processor import StreamProcessor
from auto_scaler import StreamAutoScaler

processor = StreamProcessor(source="event_hub")
scaler = StreamAutoScaler(
    min_consumers=2,
    max_consumers=10,
    target_lag_seconds=30
)

# Monitor and auto-scale
@scaler.auto_scale
def process_stream():
    stream = processor.read_stream("user-events")

    # Check current lag
    lag = processor.get_consumer_lag()

    if lag > 60:  # More than 1 minute lag
        scaler.scale_up()
        print(f"Scaling up: lag={lag}s")
    elif lag < 10:  # Very low lag
        scaler.scale_down()
        print(f"Scaling down: lag={lag}s")

    return stream

# Cost tracking
from finops_tracker import StreamingCostTracker

cost_tracker = StreamingCostTracker()
cost_tracker.track_stream(
    stream_name="user-events",
    consumers=scaler.current_consumers,
    throughput_mb=processor.get_throughput(),
    storage_gb=processor.get_storage_usage()
)

# Generate cost report
report = cost_tracker.generate_report(period="daily")
print(f"Compute cost: ${report.compute_cost:.2f}")
print(f"Storage cost: ${report.storage_cost:.2f}")
print(f"Data transfer cost: ${report.transfer_cost:.2f}")
print(f"Total: ${report.total_cost:.2f}")
```

### Optimize Event Serialization
```python
from stream_processor import StreamProcessor
import avro
import json

# Bad: JSON (verbose, slow)
def serialize_json(event: dict) -> bytes:
    return json.dumps(event).encode('utf-8')

# Good: Avro (compact, fast, schema evolution)
def serialize_avro(event: dict, schema: avro.Schema) -> bytes:
    return avro.serialize(event, schema)

# Compare sizes and costs
json_size = len(serialize_json(sample_event))
avro_size = len(serialize_avro(sample_event, event_schema))

print(f"JSON size: {json_size} bytes")
print(f"Avro size: {avro_size} bytes")
print(f"Size reduction: {(1 - avro_size/json_size)*100:.1f}%")

# Cost impact (assuming 1B events/month)
events_per_month = 1_000_000_000
json_transfer_gb = (json_size * events_per_month) / (1024**3)
avro_transfer_gb = (avro_size * events_per_month) / (1024**3)

transfer_cost_per_gb = 0.05  # Example
json_cost = json_transfer_gb * transfer_cost_per_gb
avro_cost = avro_transfer_gb * transfer_cost_per_gb

print(f"\nMonthly data transfer:")
print(f"JSON: {json_transfer_gb:.2f} GB → ${json_cost:.2f}")
print(f"Avro: {avro_transfer_gb:.2f} GB → ${avro_cost:.2f}")
print(f"Monthly savings: ${json_cost - avro_cost:.2f}")
```

### Retention Policy Optimization
```python
from azure.eventhub import EventHubProducerClient
from datetime import datetime, timedelta, timezone

# Configure retention based on use case
retention_policies = {
    "hot_events": {
        "retention_days": 7,    # Recent data in Event Hub
        "tier": "premium"
    },
    "warm_events": {
        "retention_days": 30,   # Move to blob storage
        "tier": "cool"
    },
    "cold_events": {
        "retention_days": 365,  # Archive for compliance
        "tier": "archive"
    }
}

# Implement tiering
def tier_streaming_data():
    # Hot: Keep in Event Hub (7 days)
    event_hub.set_retention(days=7)

    # Warm: Capture to cool blob storage (8-30 days)
    event_hub.enable_capture(
        destination="cool_storage",
        interval_seconds=300,        # 5 minutes
        size_limit_bytes=314572800   # 300 MB
    )

    # Cold: Archive old data (30+ days)
    # Move cool storage to archive tier
    from azure.storage.blob import BlobServiceClient

    blob_client = BlobServiceClient(connection_string=conn_str)
    container = blob_client.get_container_client("warm-events")

    for blob in container.list_blobs():
        age_days = (datetime.now(timezone.utc) - blob.last_modified).days
        if age_days > 30:
            blob_client.get_blob_client(
                container="warm-events",
                blob=blob.name
            ).set_standard_blob_tier("Archive")

# Cost comparison
hot_cost = 7 * 100 * 0.015     # 7 days, 100GB/day, $0.015/GB
warm_cost = 23 * 100 * 0.01    # 23 days, cool tier
cold_cost = 335 * 100 * 0.002  # 335 days, archive tier

print(f"Hot (Event Hub): ${hot_cost:.2f}")
print(f"Warm (Cool Storage): ${warm_cost:.2f}")
print(f"Cold (Archive): ${cold_cost:.2f}")
print(f"Total: ${hot_cost + warm_cost + cold_cost:.2f}")
print(f"vs. all hot: ${365 * 100 * 0.015:.2f}")
```

## 🔒 Security Examples

### Encrypt Streaming Data
```python
import json
import os
from datetime import datetime

from azure.eventhub import EventData, EventHubProducerClient
from azure.identity import DefaultAzureCredential
from cryptography.fernet import Fernet

# Use managed identity
credential = DefaultAzureCredential()

# Initialize Event Hub with encryption
producer = EventHubProducerClient(
    fully_qualified_namespace="mynamespace.servicebus.windows.net",
    eventhub_name="secure-events",
    credential=credential
)

# Encrypt sensitive fields
def encrypt_sensitive_data(event: dict, encryption_key: bytes) -> dict:
    cipher = Fernet(encryption_key)

    # Encrypt PII fields
    if "email" in event:
        event["email"] = cipher.encrypt(event["email"].encode()).decode()
    if "phone" in event:
        event["phone"] = cipher.encrypt(event["phone"].encode()).decode()

    return event

# Send encrypted events
from azure.keyvault.secrets import SecretClient

# Get encryption key from Key Vault
kv_client = SecretClient(
    vault_url="https://my-keyvault.vault.azure.net/",
    credential=credential
)
encryption_key = kv_client.get_secret("stream-encryption-key").value

# Produce events
event_batch = producer.create_batch()
for event in events:
    encrypted_event = encrypt_sensitive_data(event, encryption_key)
    event_batch.add(EventData(json.dumps(encrypted_event)))

producer.send_batch(event_batch)

# Audit
from audit_logger import AuditLogger
audit = AuditLogger()
audit.log_stream_access(
    stream="secure-events",
    action="write",
    user=os.getenv("USER"),
    encrypted=True,
    timestamp=datetime.now()
)
```

## 📊 Enhanced Metrics & Monitoring

| Metric Category | Metric | Target | Tool |
|-----------------|--------|--------|------|
| **Throughput** | Events per second | >10,000 | Azure Monitor |
| | Data throughput (MB/s) | >100 | Stream metrics |
| | Batch processing time | <5s | Custom metrics |
| **Latency** | End-to-end latency (p95) | <1s | Application Insights |
| | Processing latency (p95) | <500ms | Stream processor |
| | Consumer lag | <30s | Event Hub metrics |
| **Reliability** | Processing success rate | >99.9% | Azure Monitor |
| | Exactly-once delivery | 100% | Custom validator |
| | Checkpoint success rate | >99.5% | Stream metrics |
| **Cost** | Cost per million events | <$0.50 | FinOps tracker |
| | Storage cost per GB/day | <$0.02 | Cost Management |
| | Compute utilization | 60-80% | Azure Monitor |
| **Quality** | Schema validation pass rate | >99.9% | Data validator |
| | Late event rate | <1% | Watermark metrics |
| **Security** | Encrypted events | 100% | Security scans |
| | Access violations | 0 | Audit logs |

## 🚀 Deployment Example

### Streaming Infrastructure as Code
```hcl
# terraform/streaming.tf

resource "azurerm_eventhub_namespace" "streaming" {
  name                     = "streaming-${var.environment}"
  location                 = var.location
  resource_group_name      = azurerm_resource_group.main.name
  sku                      = "Standard"
  capacity                 = 2
  auto_inflate_enabled     = true
  maximum_throughput_units = 10

  tags = {
    Environment = var.environment
    CostCenter  = "DataEngineering"
  }
}

resource "azurerm_eventhub" "user_events" {
  name                = "user-events"
  namespace_name      = azurerm_eventhub_namespace.streaming.name
  resource_group_name = azurerm_resource_group.main.name
  partition_count     = 32
  message_retention   = 7

  capture_description {
    enabled             = true
    encoding            = "Avro"
    interval_in_seconds = 300
    size_limit_in_bytes = 314572800

    destination {
      name                = "EventHubArchive.AzureBlockBlob"
      archive_name_format = "{Namespace}/{EventHub}/{PartitionId}/{Year}/{Month}/{Day}/{Hour}/{Minute}/{Second}"
      blob_container_name = "streaming-archive"
      storage_account_id  = azurerm_storage_account.lakehouse.id
    }
  }
}

# Databricks job for stream processing
resource "databricks_job" "stream_processor" {
  name = "user-events-processor"

  new_cluster {
    num_workers   = 4
    spark_version = "13.3.x-scala2.12"
    node_type_id  = "Standard_DS3_v2"

    autoscale {
      min_workers = 2
      max_workers = 10
    }

    spark_conf = {
      "spark.databricks.delta.optimizeWrite.enabled" = "true"
      "spark.databricks.delta.autoCompact.enabled"   = "true"
    }
  }

  spark_python_task {
    python_file = "dbfs:/streaming/process_user_events.py"
  }

  schedule {
    quartz_cron_expression = "0 0/5 * * * ?" # Every 5 minutes
    timezone_id            = "UTC"
  }
}
```

## 🔄 Integration Workflow

### End-to-End Streaming Pipeline
```
1. Event Production (IoT/Apps/CDC)
   ↓
2. Event Hub Ingestion (de-04)
   ↓
3. Schema Validation (de-03)
   ↓
4. PII Detection & Masking (sa-01)
   ↓
5. Stream Processing (de-04)
   - Windowing
   - Aggregations
   - Enrichment
   ↓
6. Multi-Sink Output
   ├── Delta Lake (de-01) → Historical analysis
   ├── Redis → Real-time serving (ml-04)
   ├── Feature Store → ML features (ml-02)
   └── Analytics → Dashboards (ds-01)
   ↓
7. Monitoring (do-08)
   - Lag tracking
   - Throughput monitoring
   - Quality metrics
   ↓
8. Cost Optimization (fo-01)
   - Auto-scaling
   - Retention policies
   - Compression
```

## 🎯 Quick Wins

1. **Enable Event Hub capture** - Automatic archival to blob storage
2. **Implement auto-scaling** - 30-50% cost reduction
3. **Use Avro serialization** - 40-60% bandwidth savings
4. **Set up lag monitoring** - Prevent data delays
5. **Implement checkpointing** - Fault tolerance and recovery
6. **Add schema validation** - Catch bad events early
7. **Enable encryption** - Data security compliance
8. **Optimize partitioning** - Better parallelism and throughput
9. **Set retention policies** - 60-80% storage cost reduction
10. **Use watermarking** - Handle late-arriving data correctly
|
|
1
|
+
# Skill 4: Real-Time Streaming Pipelines
|
|
2
|
+
|
|
3
|
+
## 🎯 Overview
|
|
4
|
+
Build and operate production-grade real-time streaming data pipelines with event processing, stateful transformations, exactly-once semantics, and low-latency analytics.
|
|
5
|
+
|
|
6
|
+
## 🔗 Connections
|
|
7
|
+
- **Data Engineer**: Feeds real-time data to lakehouse (de-01, de-02, de-03)
|
|
8
|
+
- **ML Engineer**: Real-time feature computation (ml-02, ml-04)
|
|
9
|
+
- **MLOps**: Online model serving and monitoring (mo-04, mo-05)
|
|
10
|
+
- **AI Engineer**: Real-time RAG updates (ai-02)
|
|
11
|
+
- **Data Scientist**: Streaming analytics and dashboards (ds-01)
|
|
12
|
+
- **Security Architect**: Event encryption and access control (sa-04, sa-05)
|
|
13
|
+
- **FinOps**: Streaming compute cost optimization (fo-01, fo-06)
|
|
14
|
+
- **DevOps**: Streaming infrastructure and monitoring (do-03, do-08)
|
|
15
|
+
- **System Design**: Event-driven architecture patterns (sd-02, sd-05)
|
|
16
|
+
|
|
17
|
+
## 🛠️ Tools Included
|
|
18
|
+
|
|
19
|
+
### 1. `stream_processor.py`
|
|
20
|
+
Unified streaming processor supporting Kafka, Event Hubs, and Kinesis.
|
|
21
|
+
|
|
22
|
+
### 2. `stateful_transformer.py`
|
|
23
|
+
Windowing, aggregations, and stateful operations for streaming data.
|
|
24
|
+
|
|
25
|
+
### 3. `exactly_once_handler.py`
|
|
26
|
+
Idempotency and exactly-once processing guarantees.
|
|
27
|
+
|
|
28
|
+
### 4. `stream_monitor.py`
|
|
29
|
+
Real-time monitoring of lag, throughput, and data quality.
|
|
30
|
+
|
|
31
|
+
### 5. `stream_schemas.py`
|
|
32
|
+
Schema registry integration and evolution management.
|
|
33
|
+
|
|
34
|
+
## 📊 Architecture
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
Event Sources → Stream Ingestion → Processing → Output Sinks
|
|
38
|
+
↓ ↓ ↓ ↓
|
|
39
|
+
IoT/Apps Kafka/EventHub Transformations Lakehouse
|
|
40
|
+
Webhooks Checkpointing Aggregations Real-time DB
|
|
41
|
+
CDC Partitioning Windowing Analytics
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## 🚀 Quick Start
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from stream_processor import StreamProcessor
|
|
48
|
+
from transformations import window, aggregate
|
|
49
|
+
|
|
50
|
+
# Initialize stream processor
|
|
51
|
+
processor = StreamProcessor(
|
|
52
|
+
source="azure_event_hub",
|
|
53
|
+
connection_string=os.getenv("EVENT_HUB_CONNECTION"),
|
|
54
|
+
consumer_group="streaming-pipeline",
|
|
55
|
+
checkpoint_location="abfss://checkpoints@storage.dfs.core.windows.net"
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
# Define streaming query
|
|
59
|
+
stream = processor.read_stream(
|
|
60
|
+
topic="user-events",
|
|
61
|
+
schema=user_event_schema
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
# Transformations with windowing
|
|
65
|
+
processed = (
|
|
66
|
+
stream
|
|
67
|
+
.withWatermark("timestamp", "10 minutes") # Handle late data
|
|
68
|
+
.groupBy(
|
|
69
|
+
window("timestamp", "5 minutes", "1 minute"), # Sliding window
|
|
70
|
+
"user_id"
|
|
71
|
+
)
|
|
72
|
+
.agg(
|
|
73
|
+
count("*").alias("event_count"),
|
|
74
|
+
countDistinct("session_id").alias("session_count"),
|
|
75
|
+
avg("duration").alias("avg_duration")
|
|
76
|
+
)
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
# Write to multiple sinks
|
|
80
|
+
query = (
|
|
81
|
+
processed.writeStream
|
|
82
|
+
.foreachBatch(lambda batch, batch_id: (
|
|
83
|
+
# Write to Delta Lake for historical analysis
|
|
84
|
+
batch.write.format("delta")
|
|
85
|
+
.mode("append")
|
|
86
|
+
.save("abfss://gold@storage.dfs.core.windows.net/user_metrics"),
|
|
87
|
+
|
|
88
|
+
# Write to Redis for real-time serving
|
|
89
|
+
write_to_redis(batch),
|
|
90
|
+
|
|
91
|
+
# Update feature store for ML
|
|
92
|
+
update_feature_store(batch)
|
|
93
|
+
))
|
|
94
|
+
.option("checkpointLocation", "abfss://checkpoints@storage.dfs.core.windows.net/user_metrics")
|
|
95
|
+
.trigger(processingTime="1 minute")
|
|
96
|
+
.start()
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
# Monitor stream health
|
|
100
|
+
from stream_monitor import StreamMonitor
|
|
101
|
+
monitor = StreamMonitor(query)
|
|
102
|
+
monitor.track_metrics(["lag", "throughput", "latency"])
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## 📚 Best Practices
|
|
106
|
+
|
|
107
|
+
### Streaming Architecture (System Design Integration)
|
|
108
|
+
|
|
109
|
+
1. **Event-Driven Design**
|
|
110
|
+
- Use pub/sub pattern for decoupling
|
|
111
|
+
- Implement event sourcing for auditability
|
|
112
|
+
- Design idempotent consumers
|
|
113
|
+
- Use dead letter queues for failed events
|
|
114
|
+
- Reference: System Design sd-02 (Event-Driven Architecture)
|
|
115
|
+
|
|
116
|
+
2. **Partitioning Strategy**
|
|
117
|
+
- Partition by key for ordered processing
|
|
118
|
+
- Balance partition sizes for even load
|
|
119
|
+
- Monitor partition skew
|
|
120
|
+
- Plan for repartitioning as scale grows
|
|
121
|
+
- Reference: System Design sd-03 (Scalability)
|
|
122
|
+
|
|
123
|
+
3. **Backpressure Handling**
|
|
124
|
+
- Implement rate limiting
|
|
125
|
+
- Use buffering for traffic spikes
|
|
126
|
+
- Auto-scale consumers based on lag
|
|
127
|
+
- Circuit breakers for downstream failures
|
|
128
|
+
- Reference: System Design sd-05 (Resilience Patterns)
|
|
129
|
+
|
|
130
|
+
### Exactly-Once Semantics
|
|
131
|
+
|
|
132
|
+
4. **Idempotent Processing**
|
|
133
|
+
- Use deterministic keys for deduplication
|
|
134
|
+
- Implement idempotent writes
|
|
135
|
+
- Track processed message IDs
|
|
136
|
+
- Handle retries gracefully
|
|
137
|
+
- Reference: Data Engineer best practices
|
|
138
|
+
|
|
139
|
+
5. **Checkpointing Strategy**
|
|
140
|
+
- Frequent checkpoints for fault tolerance
|
|
141
|
+
- Store checkpoints in reliable storage
|
|
142
|
+
- Test checkpoint recovery regularly
|
|
143
|
+
- Monitor checkpoint lag
|
|
144
|
+
- Reference: Data Engineer best practices
|
|
145
|
+
|
|
146
|
+
6. **Transaction Management**
|
|
147
|
+
- Use transactional writes where possible
|
|
148
|
+
- Implement two-phase commit for distributed transactions
|
|
149
|
+
- Handle partial failures gracefully
|
|
150
|
+
- Maintain strong consistency guarantees
|
|
151
|
+
- Reference: System Design best practices
|
|
152
|
+
|
|
153
|
+
### Performance Optimization
|
|
154
|
+
|
|
155
|
+
7. **Throughput Optimization**
|
|
156
|
+
- Batch processing where latency allows
|
|
157
|
+
- Optimize serialization (Avro > JSON)
|
|
158
|
+
- Use compression for network transfer
|
|
159
|
+
- Parallel processing with proper partitioning
|
|
160
|
+
- Reference: Data Engineer de-05 (Performance)
|
|
161
|
+
|
|
162
|
+
8. **Latency Optimization**
|
|
163
|
+
- Minimize transformation complexity
|
|
164
|
+
- Optimize window sizes for use case
|
|
165
|
+
- Use in-memory state stores
|
|
166
|
+
- Reduce network hops
|
|
167
|
+
- Reference: Data Engineer de-05 (Performance)
|
|
168
|
+
|
|
169
|
+
9. **State Management**
|
|
170
|
+
- Use RocksDB for large state
|
|
171
|
+
- Implement state compaction
|
|
172
|
+
- Monitor state store size
|
|
173
|
+
- Backup state periodically
|
|
174
|
+
- Reference: Data Engineer best practices
|
|
175
|
+
|
|
176
|
+
### Cost Optimization (FinOps Integration)
|
|
177
|
+
|
|
178
|
+
10. **Right-Size Streaming Clusters**
|
|
179
|
+
- Monitor CPU and memory utilization
|
|
180
|
+
- Auto-scale based on lag and throughput
|
|
181
|
+
- Use spot instances for dev/test
|
|
182
|
+
- Consolidate low-volume streams
|
|
183
|
+
- Reference: FinOps fo-06 (Compute Optimization)
|
|
184
|
+
|
|
185
|
+
11. **Optimize Data Transfer Costs**
|
|
186
|
+
- Compress events before transmission
|
|
187
|
+
- Use regional endpoints to avoid egress
|
|
188
|
+
- Batch small messages
|
|
189
|
+
- Filter early to reduce downstream processing
|
|
190
|
+
- Reference: FinOps fo-05 (Storage Optimization)
|
|
191
|
+
|
|
192
|
+
12. **Retention and Tiering**
|
|
193
|
+
- Set appropriate retention policies
|
|
194
|
+
- Tier old data to cheaper storage
|
|
195
|
+
- Archive to blob storage for compliance
|
|
196
|
+
- Monitor storage growth
|
|
197
|
+
- Reference: FinOps fo-05 (Storage Optimization)
|
|
198
|
+
|
|
199
|
+
### Security (Security Architect Integration)
|
|
200
|
+
|
|
201
|
+
13. **Event Encryption**
|
|
202
|
+
- Encrypt data in transit (TLS)
|
|
203
|
+
- Encrypt data at rest in event store
|
|
204
|
+
- Use managed keys from Key Vault
|
|
205
|
+
- Rotate encryption keys regularly
|
|
206
|
+
- Reference: Security Architect sa-04 (Encryption)
|
|
207
|
+
|
|
208
|
+
14. **Access Control**
|
|
209
|
+
- Use RBAC for topic access
|
|
210
|
+
- Implement consumer group isolation
|
|
211
|
+
- Audit access to streaming data
|
|
212
|
+
- Managed identities for authentication
|
|
213
|
+
- Reference: Security Architect sa-02 (IAM)
|
|
214
|
+
|
|
215
|
+
15. **PII in Streaming Data**
|
|
216
|
+
- Detect and mask PII in real-time
|
|
217
|
+
- Implement data retention policies
|
|
218
|
+
- Log PII access for compliance
|
|
219
|
+
- Right-to-erasure for GDPR
|
|
220
|
+
- Reference: Security Architect sa-01 (PII Detection)
|
|
221
|
+
|
|
222
|
+
### Monitoring & Observability (DevOps Integration)
|
|
223
|
+
|
|
224
|
+
16. **Streaming Metrics**
|
|
225
|
+
- Monitor consumer lag continuously
|
|
226
|
+
- Track throughput (events/sec)
|
|
227
|
+
- Measure end-to-end latency
|
|
228
|
+
- Alert on processing failures
|
|
229
|
+
- Reference: DevOps do-08 (Monitoring & Observability)
|
|
230
|
+
|
|
231
|
+
17. **Distributed Tracing**
|
|
232
|
+
- Trace events end-to-end
|
|
233
|
+
- Correlate events across systems
|
|
234
|
+
- Identify bottlenecks
|
|
235
|
+
- Debug processing issues
|
|
236
|
+
- Reference: DevOps do-08 (Monitoring & Observability)
|
|
237
|
+
|
|
238
|
+
### Azure-Specific Best Practices
|
|
239
|
+
|
|
240
|
+
18. **Azure Event Hubs**
|
|
241
|
+
- Use capture for automatic archival
|
|
242
|
+
- Enable auto-inflate for throughput
|
|
243
|
+
- Partition key design for even distribution
|
|
244
|
+
- Monitor namespace metrics
|
|
245
|
+
- Reference: Azure az-03 (Event-Driven Services)
|
|
246
|
+
|
|
247
|
+
19. **Azure Stream Analytics**
|
|
248
|
+
- Use for simple transformations (no code)
|
|
249
|
+
- Optimize streaming units (SU)
|
|
250
|
+
- Enable diagnostic logs
|
|
251
|
+
- Test queries with sample data
|
|
252
|
+
- Reference: Azure best practices
|
|
253
|
+
|
|
254
|
+
20. **Databricks Structured Streaming**
|
|
255
|
+
- Use Delta Lake for ACID guarantees
|
|
256
|
+
- Optimize shuffle partitions
|
|
257
|
+
- Monitor streaming query metrics
|
|
258
|
+
- Use optimized writes
|
|
259
|
+
- Reference: Azure az-02 (Synapse/Databricks)
|
|
260
|
+
|
|
261
|
+
## 💰 Cost Optimization Examples
|
|
262
|
+
|
|
263
|
+
### Auto-Scaling Based on Lag
|
|
264
|
+
```python
|
|
265
|
+
from stream_processor import StreamProcessor
|
|
266
|
+
from auto_scaler import StreamAutoScaler
|
|
267
|
+
|
|
268
|
+
processor = StreamProcessor(source="event_hub")
|
|
269
|
+
scaler = StreamAutoScaler(
|
|
270
|
+
min_consumers=2,
|
|
271
|
+
max_consumers=10,
|
|
272
|
+
target_lag_seconds=30
|
|
273
|
+
)
|
|
274
|
+
|
|
275
|
+
# Monitor and auto-scale
|
|
276
|
+
@scaler.auto_scale
|
|
277
|
+
def process_stream():
|
|
278
|
+
stream = processor.read_stream("user-events")
|
|
279
|
+
|
|
280
|
+
# Check current lag
|
|
281
|
+
lag = processor.get_consumer_lag()
|
|
282
|
+
|
|
283
|
+
if lag > 60: # More than 1 minute lag
|
|
284
|
+
scaler.scale_up()
|
|
285
|
+
print(f"Scaling up: lag={lag}s")
|
|
286
|
+
elif lag < 10: # Very low lag
|
|
287
|
+
scaler.scale_down()
|
|
288
|
+
print(f"Scaling down: lag={lag}s")
|
|
289
|
+
|
|
290
|
+
return stream
|
|
291
|
+
|
|
292
|
+
# Cost tracking
|
|
293
|
+
from finops_tracker import StreamingCostTracker
|
|
294
|
+
|
|
295
|
+
cost_tracker = StreamingCostTracker()
|
|
296
|
+
cost_tracker.track_stream(
|
|
297
|
+
stream_name="user-events",
|
|
298
|
+
consumers=scaler.current_consumers,
|
|
299
|
+
throughput_mb=processor.get_throughput(),
|
|
300
|
+
storage_gb=processor.get_storage_usage()
|
|
301
|
+
)
|
|
302
|
+
|
|
303
|
+
# Generate cost report
|
|
304
|
+
report = cost_tracker.generate_report(period="daily")
|
|
305
|
+
print(f"Compute cost: ${report.compute_cost:.2f}")
|
|
306
|
+
print(f"Storage cost: ${report.storage_cost:.2f}")
|
|
307
|
+
print(f"Data transfer cost: ${report.transfer_cost:.2f}")
|
|
308
|
+
print(f"Total: ${report.total_cost:.2f}")
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
### Optimize Event Serialization
|
|
312
|
+
```python
|
|
313
|
+
from stream_processor import StreamProcessor
|
|
314
|
+
import avro
|
|
315
|
+
import json
|
|
316
|
+
|
|
317
|
+
# Bad: JSON (verbose, slow)
|
|
318
|
+
def serialize_json(event: dict) -> bytes:
|
|
319
|
+
return json.dumps(event).encode('utf-8')
|
|
320
|
+
|
|
321
|
+
# Good: Avro (compact, fast, schema evolution)
|
|
322
|
+
def serialize_avro(event: dict, schema: avro.Schema) -> bytes:
|
|
323
|
+
return avro.serialize(event, schema)
|
|
324
|
+
|
|
325
|
+
# Compare sizes and costs
|
|
326
|
+
json_size = len(serialize_json(sample_event))
|
|
327
|
+
avro_size = len(serialize_avro(sample_event, event_schema))
|
|
328
|
+
|
|
329
|
+
print(f"JSON size: {json_size} bytes")
|
|
330
|
+
print(f"Avro size: {avro_size} bytes")
|
|
331
|
+
print(f"Size reduction: {(1 - avro_size/json_size)*100:.1f}%")
|
|
332
|
+
|
|
333
|
+
# Cost impact (assuming 1B events/month)
|
|
334
|
+
events_per_month = 1_000_000_000
|
|
335
|
+
json_transfer_gb = (json_size * events_per_month) / (1024**3)
|
|
336
|
+
avro_transfer_gb = (avro_size * events_per_month) / (1024**3)
|
|
337
|
+
|
|
338
|
+
transfer_cost_per_gb = 0.05 # Example
|
|
339
|
+
json_cost = json_transfer_gb * transfer_cost_per_gb
|
|
340
|
+
avro_cost = avro_transfer_gb * transfer_cost_per_gb
|
|
341
|
+
|
|
342
|
+
print(f"\nMonthly data transfer:")
|
|
343
|
+
print(f"JSON: {json_transfer_gb:.2f} GB → ${json_cost:.2f}")
|
|
344
|
+
print(f"Avro: {avro_transfer_gb:.2f} GB → ${avro_cost:.2f}")
|
|
345
|
+
print(f"Monthly savings: ${json_cost - avro_cost:.2f}")
|
|
346
|
+
```

### Retention Policy Optimization
```python
from datetime import datetime, timezone

from azure.storage.blob import BlobServiceClient

# Configure retention based on use case
retention_policies = {
    "hot_events": {
        "retention_days": 7,    # Recent data in Event Hub
        "tier": "premium"
    },
    "warm_events": {
        "retention_days": 30,   # Move to blob storage
        "tier": "cool"
    },
    "cold_events": {
        "retention_days": 365,  # Archive for compliance
        "tier": "archive"
    }
}

# Implement tiering (event_hub wraps the Event Hub management calls and
# conn_str is the storage account connection string - both defined elsewhere)
def tier_streaming_data():
    # Hot: Keep in Event Hub (7 days)
    event_hub.set_retention(days=7)

    # Warm: Capture to cool blob storage (8-30 days)
    event_hub.enable_capture(
        destination="cool_storage",
        interval_seconds=300,        # 5 minutes
        size_limit_bytes=314572800   # 300 MB
    )

    # Cold: Archive old data (30+ days)
    # Move cool storage to archive tier
    blob_client = BlobServiceClient.from_connection_string(conn_str)
    container = blob_client.get_container_client("warm-events")

    for blob in container.list_blobs():
        age_days = (datetime.now(timezone.utc) - blob.last_modified).days
        if age_days > 30:
            blob_client.get_blob_client(
                container="warm-events",
                blob=blob.name
            ).set_standard_blob_tier("Archive")

# Cost comparison
hot_cost = 7 * 100 * 0.015      # 7 days, 100 GB/day, $0.015/GB
warm_cost = 23 * 100 * 0.01     # 23 days, cool tier
cold_cost = 335 * 100 * 0.002   # 335 days, archive tier

print(f"Hot (Event Hub): ${hot_cost:.2f}")
print(f"Warm (Cool Storage): ${warm_cost:.2f}")
print(f"Cold (Archive): ${cold_cost:.2f}")
print(f"Total: ${hot_cost + warm_cost + cold_cost:.2f}")
print(f"vs. all hot: ${365 * 100 * 0.015:.2f}")
```

## 🔒 Security Examples

### Encrypt Streaming Data
```python
import json
import os
from datetime import datetime

from azure.eventhub import EventHubProducerClient, EventData
from azure.identity import DefaultAzureCredential
from cryptography.fernet import Fernet

# Use managed identity
credential = DefaultAzureCredential()

# Initialize Event Hub producer with Azure AD auth (TLS in transit)
producer = EventHubProducerClient(
    fully_qualified_namespace="mynamespace.servicebus.windows.net",
    eventhub_name="secure-events",
    credential=credential
)

# Encrypt sensitive fields
def encrypt_sensitive_data(event: dict, encryption_key: bytes) -> dict:
    cipher = Fernet(encryption_key)

    # Encrypt PII fields
    if "email" in event:
        event["email"] = cipher.encrypt(event["email"].encode()).decode()
    if "phone" in event:
        event["phone"] = cipher.encrypt(event["phone"].encode()).decode()

    return event

# Send encrypted events
from azure.keyvault.secrets import SecretClient

# Get encryption key from Key Vault
kv_client = SecretClient(
    vault_url="https://my-keyvault.vault.azure.net/",
    credential=credential
)
encryption_key = kv_client.get_secret("stream-encryption-key").value.encode()

# Produce events (events is the batch of raw dicts built upstream)
event_batch = producer.create_batch()
for event in events:
    encrypted_event = encrypt_sensitive_data(event, encryption_key)
    event_batch.add(EventData(json.dumps(encrypted_event)))

producer.send_batch(event_batch)

# Audit
from audit_logger import AuditLogger
audit = AuditLogger()
audit.log_stream_access(
    stream="secure-events",
    action="write",
    user=os.getenv("USER"),
    encrypted=True,
    timestamp=datetime.now()
)
```

## 📊 Enhanced Metrics & Monitoring

| Metric Category | Metric | Target | Tool |
|-----------------|--------|--------|------|
| **Throughput** | Events per second | >10,000 | Azure Monitor |
| | Data throughput (MB/s) | >100 | Stream metrics |
| | Batch processing time | <5s | Custom metrics |
| **Latency** | End-to-end latency (p95) | <1s | Application Insights |
| | Processing latency (p95) | <500ms | Stream processor |
| | Consumer lag | <30s | Event Hub metrics |
| **Reliability** | Processing success rate | >99.9% | Azure Monitor |
| | Exactly-once delivery | 100% | Custom validator |
| | Checkpoint success rate | >99.5% | Stream metrics |
| **Cost** | Cost per million events | <$0.50 | FinOps tracker |
| | Storage cost per GB/day | <$0.02 | Cost Management |
| | Compute utilization | 60-80% | Azure Monitor |
| **Quality** | Schema validation pass rate | >99.9% | Data validator |
| | Late event rate | <1% | Watermark metrics |
| **Security** | Encrypted events | 100% | Security scans |
| | Access violations | 0 | Audit logs |
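
Consumer lag is usually the first of these metrics worth alerting on. Below is a minimal sketch of a lag probe with the `azure-eventhub` SDK; it assumes the namespace and hub names used earlier, and `get_checkpointed_sequence_number()` is a hypothetical helper that reads your own checkpoint store.

```python
from azure.eventhub import EventHubConsumerClient
from azure.identity import DefaultAzureCredential

client = EventHubConsumerClient(
    fully_qualified_namespace="mynamespace.servicebus.windows.net",
    eventhub_name="user-events",
    consumer_group="$Default",
    credential=DefaultAzureCredential(),
)

with client:
    for partition_id in client.get_partition_ids():
        props = client.get_partition_properties(partition_id)
        head = props["last_enqueued_sequence_number"]
        # Hypothetical helper: latest sequence number your consumers have checkpointed
        checkpointed = get_checkpointed_sequence_number(partition_id)
        print(f"partition {partition_id}: {head - checkpointed} events behind")
```

Emitting this per-partition lag as a custom metric makes the <30s target in the table enforceable with a standard alert rule.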

## 🚀 Deployment Example

### Streaming Infrastructure as Code
```hcl
# terraform/streaming.tf

resource "azurerm_eventhub_namespace" "streaming" {
  name                     = "streaming-${var.environment}"
  location                 = var.location
  resource_group_name      = azurerm_resource_group.main.name
  sku                      = "Standard"
  capacity                 = 2
  auto_inflate_enabled     = true
  maximum_throughput_units = 10

  tags = {
    Environment = var.environment
    CostCenter  = "DataEngineering"
  }
}

resource "azurerm_eventhub" "user_events" {
  name                = "user-events"
  namespace_name      = azurerm_eventhub_namespace.streaming.name
  resource_group_name = azurerm_resource_group.main.name
  partition_count     = 32
  message_retention   = 7

  capture_description {
    enabled             = true
    encoding            = "Avro"
    interval_in_seconds = 300
    size_limit_in_bytes = 314572800

    destination {
      name                = "EventHubArchive.AzureBlockBlob"
      archive_name_format = "{Namespace}/{EventHub}/{PartitionId}/{Year}/{Month}/{Day}/{Hour}/{Minute}/{Second}"
      blob_container_name = "streaming-archive"
      storage_account_id  = azurerm_storage_account.lakehouse.id
    }
  }
}

# Databricks job for stream processing
resource "databricks_job" "stream_processor" {
  name = "user-events-processor"

  new_cluster {
    spark_version = "13.3.x-scala2.12"
    node_type_id  = "Standard_DS3_v2"

    # autoscale replaces a fixed num_workers; the two settings conflict
    autoscale {
      min_workers = 2
      max_workers = 10
    }

    spark_conf = {
      "spark.databricks.delta.optimizeWrite.enabled" = "true"
      "spark.databricks.delta.autoCompact.enabled"   = "true"
    }
  }

  spark_python_task {
    python_file = "dbfs:/streaming/process_user_events.py"
  }

  schedule {
    quartz_cron_expression = "0 0/5 * * * ?" # Every 5 minutes
    timezone_id            = "UTC"
  }
}
```

## 🔄 Integration Workflow

### End-to-End Streaming Pipeline
```
1. Event Production (IoT/Apps/CDC)
        ↓
2. Event Hub Ingestion (de-04)
        ↓
3. Schema Validation (de-03)
        ↓
4. PII Detection & Masking (sa-01)
        ↓
5. Stream Processing (de-04)
   - Windowing
   - Aggregations
   - Enrichment
        ↓
6. Multi-Sink Output
   ├── Delta Lake (de-01) → Historical analysis
   ├── Redis → Real-time serving (ml-04)
   ├── Feature Store → ML features (ml-02)
   └── Analytics → Dashboards (ds-01)
        ↓
7. Monitoring (do-08)
   - Lag tracking
   - Throughput monitoring
   - Quality metrics
        ↓
8. Cost Optimization (fo-01)
   - Auto-scaling
   - Retention policies
   - Compression
```
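
Steps 5-6 map naturally onto Spark Structured Streaming. A minimal sketch under stated assumptions: the Delta-formatted input/output paths are placeholders, the events carry `event_time` (timestamp) and `event_type` columns, and the additional sinks from step 6 would hang off the same `foreachBatch`.

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.appName("user-events-processor").getOrCreate()

# Read the validated event stream (placeholder path)
events = spark.readStream.format("delta").load("/mnt/bronze/user_events")

# Step 5: windowed aggregation with a watermark for late-arriving data
aggregated = (
    events
    .withWatermark("event_time", "10 minutes")
    .groupBy(F.window("event_time", "5 minutes"), "event_type")
    .count()
)

# Step 6: multi-sink output via foreachBatch
def write_sinks(batch_df, batch_id):
    # Delta Lake for historical analysis (placeholder path)
    batch_df.write.format("delta").mode("append").save("/mnt/silver/event_counts")
    # Redis / feature store / dashboard writes would use the same batch_df here

query = (
    aggregated.writeStream
    .foreachBatch(write_sinks)
    .option("checkpointLocation", "/mnt/checkpoints/user_events")
    .start()
)
```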

## 🎯 Quick Wins

1. **Enable Event Hub capture** - Automatic archival to blob storage
2. **Implement auto-scaling** - 30-50% cost reduction
3. **Use Avro serialization** - 40-60% bandwidth savings
4. **Set up lag monitoring** - Prevent data delays
5. **Implement checkpointing** - Fault tolerance and recovery
6. **Add schema validation** - Catch bad events early
7. **Enable encryption** - Data security compliance
8. **Optimize partitioning** - Better parallelism and throughput (see the partition-key sketch below)
9. **Set retention policies** - 60-80% storage cost reduction
10. **Use watermarking** - Handle late-arriving data correctly
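
For quick win #8, the partition key is the main lever in Event Hubs. A minimal sketch, assuming the `user-events` hub defined above and a `user_id` field on each event (both illustrative): routing by a high-cardinality key preserves per-key ordering while spreading load across the 32 partitions.

```python
import json

from azure.eventhub import EventHubProducerClient, EventData
from azure.identity import DefaultAzureCredential

producer = EventHubProducerClient(
    fully_qualified_namespace="mynamespace.servicebus.windows.net",
    eventhub_name="user-events",
    credential=DefaultAzureCredential(),
)

# All events for one user hash to the same partition, so per-user ordering holds
def send_user_events(user_id: str, events: list[dict]) -> None:
    batch = producer.create_batch(partition_key=user_id)
    for event in events:
        batch.add(EventData(json.dumps(event)))  # raises ValueError once the batch is full
    producer.send_batch(batch)
```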