tech-hub-skills 1.5.1 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197)
  1. package/.claude/LICENSE +21 -21
  2. package/.claude/README.md +291 -291
  3. package/.claude/bin/cli.js +266 -266
  4. package/.claude/bin/copilot.js +182 -182
  5. package/.claude/bin/postinstall.js +42 -42
  6. package/.claude/commands/README.md +336 -336
  7. package/.claude/commands/ai-engineer.md +104 -104
  8. package/.claude/commands/aws.md +143 -143
  9. package/.claude/commands/azure.md +149 -149
  10. package/.claude/commands/backend-developer.md +108 -108
  11. package/.claude/commands/code-review.md +399 -399
  12. package/.claude/commands/compliance-automation.md +747 -747
  13. package/.claude/commands/compliance-officer.md +108 -108
  14. package/.claude/commands/data-engineer.md +113 -113
  15. package/.claude/commands/data-governance.md +102 -102
  16. package/.claude/commands/data-scientist.md +123 -123
  17. package/.claude/commands/database-admin.md +109 -109
  18. package/.claude/commands/devops.md +160 -160
  19. package/.claude/commands/docker.md +160 -160
  20. package/.claude/commands/enterprise-dashboard.md +613 -613
  21. package/.claude/commands/finops.md +184 -184
  22. package/.claude/commands/frontend-developer.md +108 -108
  23. package/.claude/commands/gcp.md +143 -143
  24. package/.claude/commands/ml-engineer.md +115 -115
  25. package/.claude/commands/mlops.md +187 -187
  26. package/.claude/commands/network-engineer.md +109 -109
  27. package/.claude/commands/optimization-advisor.md +329 -329
  28. package/.claude/commands/orchestrator.md +623 -623
  29. package/.claude/commands/platform-engineer.md +102 -102
  30. package/.claude/commands/process-automation.md +226 -226
  31. package/.claude/commands/process-changelog.md +184 -184
  32. package/.claude/commands/process-documentation.md +484 -484
  33. package/.claude/commands/process-kanban.md +324 -324
  34. package/.claude/commands/process-versioning.md +214 -214
  35. package/.claude/commands/product-designer.md +104 -104
  36. package/.claude/commands/project-starter.md +443 -443
  37. package/.claude/commands/qa-engineer.md +109 -109
  38. package/.claude/commands/security-architect.md +135 -135
  39. package/.claude/commands/sre.md +109 -109
  40. package/.claude/commands/system-design.md +126 -126
  41. package/.claude/commands/technical-writer.md +101 -101
  42. package/.claude/package.json +46 -46
  43. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
  44. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -356
  45. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -274
  46. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -324
  47. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
  48. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -336
  49. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -213
  50. package/.claude/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
  51. package/.claude/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
  52. package/.claude/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
  53. package/.claude/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
  54. package/.claude/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
  55. package/.claude/roles/azure/skills/02-data-factory/README.md +264 -264
  56. package/.claude/roles/azure/skills/03-synapse-analytics/README.md +264 -264
  57. package/.claude/roles/azure/skills/04-databricks/README.md +264 -264
  58. package/.claude/roles/azure/skills/05-functions/README.md +264 -264
  59. package/.claude/roles/azure/skills/06-kubernetes-service/README.md +264 -264
  60. package/.claude/roles/azure/skills/07-openai-service/README.md +264 -264
  61. package/.claude/roles/azure/skills/08-machine-learning/README.md +264 -264
  62. package/.claude/roles/azure/skills/09-storage-adls/README.md +264 -264
  63. package/.claude/roles/azure/skills/10-networking/README.md +264 -264
  64. package/.claude/roles/azure/skills/11-sql-cosmos/README.md +264 -264
  65. package/.claude/roles/azure/skills/12-event-hubs/README.md +264 -264
  66. package/.claude/roles/code-review/skills/01-automated-code-review/README.md +394 -394
  67. package/.claude/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
  68. package/.claude/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
  69. package/.claude/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
  70. package/.claude/roles/code-review/skills/05-review-analytics/README.md +540 -540
  71. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
  72. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -337
  73. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -300
  74. package/.claude/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
  75. package/.claude/roles/data-engineer/skills/03-data-quality/README.md +579 -579
  76. package/.claude/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
  77. package/.claude/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
  78. package/.claude/roles/data-governance/skills/01-data-catalog/README.md +112 -112
  79. package/.claude/roles/data-governance/skills/02-data-lineage/README.md +129 -129
  80. package/.claude/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
  81. package/.claude/roles/data-governance/skills/04-access-control/README.md +39 -39
  82. package/.claude/roles/data-governance/skills/05-master-data-management/README.md +40 -40
  83. package/.claude/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
  84. package/.claude/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
  85. package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -446
  86. package/.claude/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
  87. package/.claude/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
  88. package/.claude/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
  89. package/.claude/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
  90. package/.claude/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
  91. package/.claude/roles/data-scientist/skills/07-experimentation/README.md +264 -264
  92. package/.claude/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
  93. package/.claude/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
  94. package/.claude/roles/devops/skills/02-container-orchestration/README.md +264 -264
  95. package/.claude/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
  96. package/.claude/roles/devops/skills/04-gitops/README.md +264 -264
  97. package/.claude/roles/devops/skills/05-environment-management/README.md +264 -264
  98. package/.claude/roles/devops/skills/06-automated-testing/README.md +264 -264
  99. package/.claude/roles/devops/skills/07-release-management/README.md +264 -264
  100. package/.claude/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
  101. package/.claude/roles/devops/skills/09-devsecops/README.md +265 -265
  102. package/.claude/roles/finops/skills/01-cost-visibility/README.md +264 -264
  103. package/.claude/roles/finops/skills/02-resource-tagging/README.md +264 -264
  104. package/.claude/roles/finops/skills/03-budget-management/README.md +264 -264
  105. package/.claude/roles/finops/skills/04-reserved-instances/README.md +264 -264
  106. package/.claude/roles/finops/skills/05-spot-optimization/README.md +264 -264
  107. package/.claude/roles/finops/skills/06-storage-tiering/README.md +264 -264
  108. package/.claude/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
  109. package/.claude/roles/finops/skills/08-chargeback/README.md +264 -264
  110. package/.claude/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
  111. package/.claude/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
  112. package/.claude/roles/ml-engineer/skills/03-model-training/README.md +704 -704
  113. package/.claude/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
  114. package/.claude/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
  115. package/.claude/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
  116. package/.claude/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
  117. package/.claude/roles/mlops/skills/03-model-registry/README.md +264 -264
  118. package/.claude/roles/mlops/skills/04-feature-store/README.md +264 -264
  119. package/.claude/roles/mlops/skills/05-model-deployment/README.md +264 -264
  120. package/.claude/roles/mlops/skills/06-model-observability/README.md +264 -264
  121. package/.claude/roles/mlops/skills/07-data-versioning/README.md +264 -264
  122. package/.claude/roles/mlops/skills/08-ab-testing/README.md +264 -264
  123. package/.claude/roles/mlops/skills/09-automated-retraining/README.md +264 -264
  124. package/.claude/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
  125. package/.claude/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
  126. package/.claude/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
  127. package/.claude/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
  128. package/.claude/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
  129. package/.claude/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
  130. package/.claude/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
  131. package/.claude/roles/product-designer/skills/02-user-research/README.md +382 -382
  132. package/.claude/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
  133. package/.claude/roles/product-designer/skills/04-ux-design/README.md +496 -496
  134. package/.claude/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
  135. package/.claude/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
  136. package/.claude/roles/security-architect/skills/01-pii-detection/README.md +319 -319
  137. package/.claude/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
  138. package/.claude/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
  139. package/.claude/roles/security-architect/skills/04-iam/README.md +264 -264
  140. package/.claude/roles/security-architect/skills/05-application-security/README.md +264 -264
  141. package/.claude/roles/security-architect/skills/06-secrets-management/README.md +264 -264
  142. package/.claude/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
  143. package/.claude/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
  144. package/.claude/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
  145. package/.claude/roles/system-design/skills/03-scalability/README.md +264 -264
  146. package/.claude/roles/system-design/skills/04-high-availability/README.md +264 -264
  147. package/.claude/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
  148. package/.claude/roles/system-design/skills/06-api-design/README.md +264 -264
  149. package/.claude/roles/system-design/skills/07-observability-architecture/README.md +264 -264
  150. package/.claude/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
  151. package/.claude/roles/system-design/skills/08-process-automation/README.md +521 -521
  152. package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -744
  153. package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -688
  154. package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -679
  155. package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -528
  156. package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -684
  157. package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -615
  158. package/.claude/skills/README.md +336 -336
  159. package/.claude/skills/ai-engineer.md +104 -104
  160. package/.claude/skills/aws.md +143 -143
  161. package/.claude/skills/azure.md +149 -149
  162. package/.claude/skills/backend-developer.md +108 -108
  163. package/.claude/skills/code-review.md +399 -399
  164. package/.claude/skills/compliance-automation.md +747 -747
  165. package/.claude/skills/compliance-officer.md +108 -108
  166. package/.claude/skills/data-engineer.md +113 -113
  167. package/.claude/skills/data-governance.md +102 -102
  168. package/.claude/skills/data-scientist.md +123 -123
  169. package/.claude/skills/database-admin.md +109 -109
  170. package/.claude/skills/devops.md +160 -160
  171. package/.claude/skills/docker.md +160 -160
  172. package/.claude/skills/enterprise-dashboard.md +613 -613
  173. package/.claude/skills/finops.md +184 -184
  174. package/.claude/skills/frontend-developer.md +108 -108
  175. package/.claude/skills/gcp.md +143 -143
  176. package/.claude/skills/ml-engineer.md +115 -115
  177. package/.claude/skills/mlops.md +187 -187
  178. package/.claude/skills/network-engineer.md +109 -109
  179. package/.claude/skills/optimization-advisor.md +329 -329
  180. package/.claude/skills/orchestrator.md +623 -623
  181. package/.claude/skills/platform-engineer.md +102 -102
  182. package/.claude/skills/process-automation.md +226 -226
  183. package/.claude/skills/process-changelog.md +184 -184
  184. package/.claude/skills/process-documentation.md +484 -484
  185. package/.claude/skills/process-kanban.md +324 -324
  186. package/.claude/skills/process-versioning.md +214 -214
  187. package/.claude/skills/product-designer.md +104 -104
  188. package/.claude/skills/project-starter.md +443 -443
  189. package/.claude/skills/qa-engineer.md +109 -109
  190. package/.claude/skills/security-architect.md +135 -135
  191. package/.claude/skills/sre.md +109 -109
  192. package/.claude/skills/system-design.md +126 -126
  193. package/.claude/skills/technical-writer.md +101 -101
  194. package/.gitattributes +2 -2
  195. package/GITHUB_COPILOT.md +106 -106
  196. package/README.md +192 -184
  197. package/package.json +16 -8
package/.claude/roles/data-engineer/skills/02-etl-pipeline/README.md
@@ -1,580 +1,580 @@
# Skill 2: ETL/ELT Pipeline Orchestration

## 🎯 Overview
Build and orchestrate production-grade ETL/ELT pipelines with scheduling, monitoring, error handling, and dependency management for scalable data workflows.

## 🔗 Connections
- **Data Engineer**: Feeds lakehouse architecture layers (de-01, de-03)
- **ML Engineer**: Provides training data pipelines (ml-01, ml-02)
- **MLOps**: Data versioning and feature pipelines (mo-02, mo-06)
- **AI Engineer**: Prepares data for RAG systems (ai-02)
- **Data Scientist**: Delivers clean data for analysis (ds-01, ds-02)
- **Security Architect**: Secure data movement, encryption in transit (sa-04, sa-05)
- **FinOps**: Pipeline cost optimization and monitoring (fo-01, fo-06)
- **DevOps**: CI/CD for pipeline deployment, IaC (do-01, do-04, do-08)

## 🛠️ Tools Included

### 1. `orchestrator.py`
Pipeline orchestration with DAG-based workflow management using Airflow/Prefect.

### 2. `connector_factory.py`
Universal connectors for databases, APIs, files, and cloud storage.

### 3. `transformation_engine.py`
Scalable data transformations with schema validation and error handling.

### 4. `incremental_loader.py`
Change Data Capture (CDC) and incremental loading strategies.

### 5. `pipeline_monitor.py`
Real-time pipeline monitoring, alerting, and SLA tracking.

## 📊 Architecture

```
Source Systems → Extract → Transform → Load → Target Systems
      ↓            ↓           ↓          ↓          ↓
  Connectors    Staging    Validation  Loading    Lakehouse
  API/DB/File    Area       Quality    Strategy     Layers
                             Checks    (Full/Inc)
```

## 🚀 Quick Start

```python
import os

from orchestrator import Pipeline, Task
from connector_factory import ConnectorFactory

# Define pipeline
pipeline = Pipeline(
    name="customer_etl",
    schedule="0 2 * * *",  # Daily at 2 AM
    max_retries=3
)

# Extract task
@pipeline.task(name="extract_customers")
def extract():
    connector = ConnectorFactory.create(
        source_type="postgres",
        connection_string=os.getenv("DB_CONNECTION")
    )
    return connector.extract(
        query="SELECT * FROM customers WHERE updated_at > :last_run",
        params={"last_run": pipeline.last_run_time}
    )

# Transform task
@pipeline.task(name="transform_customers", depends_on=["extract_customers"])
def transform(data):
    from transformation_engine import TransformationEngine

    engine = TransformationEngine()
    return engine.apply([
        engine.clean_nulls(),
        engine.standardize_phone_numbers(),
        engine.deduplicate(key="customer_id"),
        engine.validate_schema(schema="customer_schema.json")
    ], data)

# Load task
@pipeline.task(name="load_customers", depends_on=["transform_customers"])
def load(data):
    connector = ConnectorFactory.create(
        target_type="delta_lake",
        path="abfss://silver@lakehouse.dfs.core.windows.net/customers"
    )
    connector.load(data, mode="append", partition_by="ingestion_date")

# Execute
pipeline.run()
```

## 📚 Best Practices

### Cost Optimization (FinOps Integration)

1. **Optimize Compute for Pipeline Execution**
   - Use serverless compute for sporadic pipelines
   - Right-size dedicated compute pools for regular jobs
   - Implement auto-scaling based on queue depth (see the sketch after this list)
   - Schedule resource-intensive jobs during off-peak hours
   - Reference: FinOps fo-06 (Compute Optimization)

2. **Incremental Loading to Reduce Costs**
   - Implement CDC to process only changed records
   - Use watermarks for incremental extraction
   - Partition data by date for efficient processing
   - Track last successful run timestamps
   - Reference: Data Engineer best practices

3. **Pipeline Cost Monitoring**
   - Track costs per pipeline execution
   - Monitor data transfer costs between regions
   - Alert on cost anomalies
   - Generate cost attribution reports by team/project
   - Reference: FinOps fo-01 (Cost Monitoring), fo-03 (Budget Management)

4. **Data Transfer Cost Optimization**
   - Minimize cross-region data movement
   - Compress data before transfer
   - Use Azure Private Link to avoid egress charges
   - Batch small files to reduce transaction costs
   - Reference: FinOps fo-05 (Storage Optimization)
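
The queue-depth auto-scaling recommendation in practice 1 reduces to a small policy function. Below is a minimal, illustrative sketch; `recommend_worker_count` and its thresholds are hypothetical and not part of this package's tools.

```python
def recommend_worker_count(queue_depth: int,
                           current_workers: int,
                           tasks_per_worker: int = 20,
                           min_workers: int = 1,
                           max_workers: int = 16) -> int:
    """Suggest a worker count from the current queue depth (illustrative only)."""
    # Target enough workers to drain the queue at roughly tasks_per_worker each.
    desired = max(min_workers, -(-queue_depth // tasks_per_worker))  # ceiling division
    # Clamp to the allowed pool size.
    desired = min(desired, max_workers)
    # Avoid flapping: ignore changes of a single worker.
    if abs(desired - current_workers) <= 1:
        return current_workers
    return desired

# Example: 180 queued tasks with 4 workers -> scale to 9
print(recommend_worker_count(queue_depth=180, current_workers=4))
```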

### Infrastructure as Code (DevOps Integration)

5. **Deploy Pipelines as Code**
   - Version control all pipeline definitions
   - Use IaC for pipeline infrastructure (Terraform/Bicep)
   - Implement GitOps for pipeline deployment
   - Maintain separate environments (dev/staging/prod)
   - Reference: DevOps do-04 (IaC), do-05 (GitOps)

6. **CI/CD for Data Pipelines**
   - Automated testing for pipeline logic
   - Schema validation tests
   - Data quality tests in CI
   - Blue-green deployments for critical pipelines
   - Reference: DevOps do-01 (CI/CD), do-02 (Testing)

7. **Containerize Pipeline Components**
   - Package transformations in Docker containers
   - Use Kubernetes for orchestration at scale
   - Implement health checks and readiness probes (see the sketch after this list)
   - Version container images alongside code
   - Reference: DevOps do-03 (Containerization)
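
For practice 7, a containerized transformation worker can expose a liveness/readiness endpoint with nothing but the standard library. A minimal sketch; the port and path are arbitrary choices for illustration, not a convention defined by this package.

```python
import threading
from http.server import BaseHTTPRequestHandler, HTTPServer

class HealthHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        # Kubernetes liveness/readiness probes hit this path.
        if self.path == "/healthz":
            self.send_response(200)
            self.end_headers()
            self.wfile.write(b"ok")
        else:
            self.send_response(404)
            self.end_headers()

    def log_message(self, fmt, *args):
        pass  # keep probe traffic out of the task logs

def start_health_server(port: int = 8080) -> HTTPServer:
    server = HTTPServer(("0.0.0.0", port), HealthHandler)
    threading.Thread(target=server.serve_forever, daemon=True).start()
    return server

# Start the probe endpoint, then run the long-lived transformation loop.
start_health_server()
```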

### Security & Compliance (Security Architect Integration)

8. **Secure Credential Management**
   - Store credentials in Azure Key Vault
   - Use managed identities where possible
   - Rotate credentials regularly
   - Never hardcode secrets in pipeline code
   - Reference: Security Architect sa-03 (Secrets Management)

9. **Encrypt Data in Transit**
   - Use TLS for all data transfers
   - Implement VPN/Private Link for sensitive data
   - Validate SSL certificates (see the sketch after this list)
   - Monitor for unencrypted connections
   - Reference: Security Architect sa-04 (Encryption)

10. **Audit Trail for Data Movement**
    - Log all pipeline executions with metadata
    - Track data lineage from source to target
    - Implement compliance auditing
    - Retain audit logs per regulatory requirements
    - Reference: Security Architect sa-06 (Data Governance)
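
Practice 9's "validate SSL certificates" point amounts to refusing any extract whose TLS handshake cannot be verified. A minimal standard-library sketch; the endpoint URL and CA bundle path are placeholders.

```python
import ssl
import urllib.request

# Build a TLS context that verifies the server certificate and hostname.
context = ssl.create_default_context(cafile="/certs/ca-bundle.pem")
context.check_hostname = True
context.verify_mode = ssl.CERT_REQUIRED

# Any HTTPS extract should fail loudly instead of silently downgrading.
try:
    with urllib.request.urlopen("https://api.example.com/v1/export",
                                context=context, timeout=30) as resp:
        payload = resp.read()
except ssl.SSLError as exc:
    raise RuntimeError(f"TLS verification failed; refusing to extract: {exc}") from exc
```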

### Data Quality (Data Engineer Integration)

11. **Schema Validation**
    - Validate schemas before transformation
    - Detect schema drift automatically
    - Alert on breaking schema changes
    - Maintain schema registry
    - Reference: Data Engineer de-03 (Data Quality)

12. **Data Quality Gates**
    - Implement quality checks at each stage
    - Quarantine bad data for review
    - Alert on quality threshold violations
    - Track data quality metrics over time
    - Reference: Data Engineer de-03 (Data Quality)

13. **Error Handling & Recovery**
    - Implement circuit breakers for failing sources
    - Dead letter queues for failed records
    - Automatic retry with exponential backoff (see the sketch after this list)
    - Manual review process for persistent failures
    - Reference: System Design sd-05 (Resilience Patterns)
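
Practice 13's retry and dead-letter behaviour can be expressed as a small decorator. The sketch below is illustrative: `send_to_dead_letter` stands in for whatever queue or quarantine table the pipeline actually uses.

```python
import random
import time
from functools import wraps

def retry_with_backoff(max_retries=3, base_delay=1.0, max_delay=60.0, dead_letter=None):
    """Retry a task with exponential backoff; hand the payload to a dead-letter sink when exhausted."""
    def decorator(func):
        @wraps(func)
        def wrapper(record, *args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    return func(record, *args, **kwargs)
                except Exception as exc:
                    if attempt == max_retries:
                        if dead_letter is not None:
                            dead_letter(record, exc)  # park the record for manual review
                        raise
                    # Exponential backoff with jitter to avoid retry storms.
                    delay = min(max_delay, base_delay * (2 ** attempt))
                    time.sleep(delay + random.uniform(0, delay / 10))
        return wrapper
    return decorator

def send_to_dead_letter(record, exc):
    # Placeholder: a real pipeline would write to a DLQ or quarantine table.
    print(f"Dead-lettered record {record!r}: {exc}")

@retry_with_backoff(max_retries=3, dead_letter=send_to_dead_letter)
def load_record(record):
    ...  # call the flaky source or target here
```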

### Monitoring & Observability (DevOps Integration)

14. **Pipeline Monitoring**
    - Track execution time, success rate, data volume
    - Set up SLA alerts for critical pipelines
    - Monitor resource utilization (CPU, memory, I/O)
    - Implement distributed tracing for complex workflows
    - Reference: DevOps do-08 (Monitoring & Observability)

15. **Alerting Strategy**
    - Alert on pipeline failures with context
    - Warning alerts for performance degradation
    - SLA breach notifications (see the sketch after this list)
    - Cost spike alerts
    - Reference: DevOps do-08, FinOps fo-03
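
An SLA breach notification (practice 15) is, at its simplest, a comparison between a run's duration and its budget. A minimal sketch, with a hypothetical `notify` callable standing in for PagerDuty, Teams, or email.

```python
from dataclasses import dataclass
from datetime import datetime, timedelta

@dataclass
class SlaPolicy:
    pipeline_name: str
    deadline: timedelta      # e.g. must finish within 30 minutes of start
    warn_at: float = 0.8     # warn when 80% of the budget is consumed

def check_sla(policy: SlaPolicy, started_at: datetime, finished_at: datetime, notify=print):
    elapsed = finished_at - started_at
    if elapsed > policy.deadline:
        notify(f"[SLA BREACH] {policy.pipeline_name} took {elapsed}, budget {policy.deadline}")
    elif elapsed > policy.deadline * policy.warn_at:
        notify(f"[SLA WARNING] {policy.pipeline_name} used {elapsed} of {policy.deadline}")

# Example usage
check_sla(
    SlaPolicy("customer_etl", deadline=timedelta(minutes=30)),
    started_at=datetime(2024, 1, 1, 2, 0),
    finished_at=datetime(2024, 1, 1, 2, 41),
)
```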

### Azure-Specific Best Practices

16. **Azure Data Factory**
    - Use managed VNet for secure connectivity
    - Implement parameterized pipelines
    - Enable git integration for version control
    - Use mapping data flows for complex transformations
    - Reference: Azure az-01 (Data Factory)

17. **Azure Synapse Pipelines**
    - Leverage Spark pools for big data processing
    - Use serverless SQL for ad-hoc transformations
    - Implement workload management
    - Monitor pipeline costs via Synapse Studio
    - Reference: Azure az-02 (Synapse Analytics)

18. **Integration Runtime Optimization**
    - Use Azure IR for Azure-to-Azure transfers
    - Self-hosted IR for on-premises connectivity
    - Right-size IR compute units
    - Enable VNet integration for security
    - Reference: Azure az-01

## 💰 Cost Optimization Examples

### Incremental Loading with CDC
```python
from incremental_loader import IncrementalLoader
from datetime import datetime, timedelta

# Initialize incremental loader
loader = IncrementalLoader(
    source="sales_db",
    target="bronze.sales",
    watermark_column="updated_at"
)

# Load only changed records
result = loader.load_incremental(
    query="""
        SELECT * FROM sales
        WHERE updated_at > :watermark
          AND updated_at <= :current_time
    """,
    merge_key="sale_id"
)

print(f"Records processed: {result.records_processed}")
print(f"Records skipped: {result.records_skipped}")
print(f"Cost saved vs full load: ${result.cost_savings:.2f}")

# Track incremental loading efficiency
from finops_tracker import PipelineCostTracker

cost_tracker = PipelineCostTracker()
cost_tracker.log_execution(
    pipeline_name="sales_incremental",
    execution_time=result.execution_time,
    data_processed_gb=result.data_size_gb,
    compute_cost=result.compute_cost,
    storage_cost=result.storage_cost
)
```

### Pipeline Cost Monitoring
```python
from orchestrator import Pipeline
from pipeline_monitor import PipelineMonitor
from finops_tracker import PipelineCostTracker

monitor = PipelineMonitor()
cost_tracker = PipelineCostTracker()

# Track pipeline execution costs
@cost_tracker.track_costs
def run_pipeline(pipeline_name: str):
    pipeline = Pipeline.get(pipeline_name)
    result = pipeline.run()

    # Log detailed cost breakdown
    cost_tracker.log_execution(
        pipeline_name=pipeline_name,
        execution_time=result.duration,
        data_processed_gb=result.data_volume,
        compute_cost=result.compute_cost,
        data_transfer_cost=result.transfer_cost,
        storage_cost=result.storage_cost
    )

    return result

# Generate cost report
report = cost_tracker.generate_report(
    period="monthly",
    group_by=["pipeline_name", "environment"]
)

print(f"Total pipeline costs: ${report.total_cost:.2f}")
print("Top 5 expensive pipelines:")
for pipeline in report.top_pipelines:
    print(f"  {pipeline.name}: ${pipeline.cost:.2f}")

# Set budget alerts
cost_tracker.set_budget_alert(
    pipeline_name="customer_etl",
    monthly_budget=500.00,
    alert_threshold=0.8
)
```

### Optimize Data Transfer Costs
```python
import os

from connector_factory import ConnectorFactory
from compression_utils import compress_data

# Extract with compression
source_connector = ConnectorFactory.create(
    source_type="postgres",
    connection_string=os.getenv("DB_CONNECTION")
)

data = source_connector.extract(query="SELECT * FROM large_table")

# Compress before transfer
compressed_data = compress_data(
    data,
    algorithm="zstd",       # Better compression than gzip
    compression_level=3     # Balance speed vs size
)

print(f"Original size: {data.size_mb:.2f} MB")
print(f"Compressed size: {compressed_data.size_mb:.2f} MB")
print(f"Compression ratio: {compressed_data.ratio:.2f}x")
print(f"Transfer cost saved: ${compressed_data.cost_savings:.2f}")

# Load to target
target_connector = ConnectorFactory.create(
    target_type="delta_lake",
    path="abfss://bronze@lakehouse.dfs.core.windows.net/data"
)

target_connector.load(
    compressed_data,
    decompress=True,
    partition_by="ingestion_date"
)
```

## 🔒 Security Best Practices Examples

### Secure Credential Management
```python
import os
from datetime import datetime

from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from connector_factory import ConnectorFactory

# Use managed identity to access Key Vault
credential = DefaultAzureCredential()
key_vault_client = SecretClient(
    vault_url="https://my-keyvault.vault.azure.net/",
    credential=credential
)

# Retrieve connection string securely
db_connection = key_vault_client.get_secret("postgres-connection-string").value

# Create connector with secure credentials
connector = ConnectorFactory.create(
    source_type="postgres",
    connection_string=db_connection,
    ssl_mode="require",  # Enforce SSL
    ssl_cert_path="/certs/postgres.crt"
)

# Audit credential access
from audit_logger import AuditLogger

audit_logger = AuditLogger()
audit_logger.log_credential_access(
    secret_name="postgres-connection-string",
    accessed_by=os.getenv("USER"),
    pipeline="customer_etl",
    timestamp=datetime.now()
)
```

### Data Lineage Tracking
```python
from datetime import datetime

from data_lineage import LineageTracker

tracker = LineageTracker()

# Track data movement
@tracker.track_lineage
def customer_etl_pipeline():
    # Extract
    source_data = extract_from_postgres(
        table="customers",
        database="prod_crm"
    )

    tracker.log_source(
        dataset="prod_crm.customers",
        record_count=len(source_data),
        extraction_time=datetime.now()
    )

    # Transform
    transformed_data = apply_transformations(source_data)

    tracker.log_transformation(
        operation="clean_and_standardize",
        input_records=len(source_data),
        output_records=len(transformed_data),
        rules_applied=["deduplicate", "validate_email", "standardize_phone"]
    )

    # Load
    load_to_lakehouse(
        data=transformed_data,
        target="silver.customers"
    )

    tracker.log_target(
        dataset="silver.customers",
        record_count=len(transformed_data),
        load_time=datetime.now()
    )

# Query lineage
lineage = tracker.get_lineage(dataset="silver.customers")
print(f"Source: {lineage.source}")
print(f"Transformations: {lineage.transformations}")
print(f"Load timestamp: {lineage.loaded_at}")
```

## 📊 Enhanced Metrics & Monitoring

| Metric Category | Metric | Target | Tool |
|-----------------|--------|--------|------|
| **Performance** | Pipeline execution time (p95) | <30 min | Azure Monitor |
| | Data throughput | >10 GB/hour | Pipeline metrics |
| | Task success rate | >99% | Airflow/Prefect |
| **Cost** | Cost per pipeline run | <$5 | FinOps tracker |
| | Data transfer cost | <$0.10/GB | Cost Management |
| | Compute cost per GB processed | <$0.50 | Custom tracker |
| **Quality** | Schema validation pass rate | 100% | DQ framework |
| | Data completeness | >98% | DQ checks |
| | Duplicate rate | <1% | DQ checks |
| **Reliability** | Pipeline availability | >99.5% | Azure Monitor |
| | Mean Time To Recovery (MTTR) | <15 min | Incident tracker |
| | Failed task retry success rate | >80% | Pipeline logs |
| **Security** | Encrypted connections | 100% | Security scans |
| | Credential rotation compliance | 100% | Compliance dashboard |
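
Several targets in this table (task success rate, p95 execution time) can be derived directly from run history. A minimal sketch using only the standard library; the `runs` records are hypothetical examples of what a pipeline run-log query might return.

```python
from statistics import quantiles

runs = [
    # (succeeded, duration_seconds) as returned by a run-log query (example values)
    (True, 540), (True, 610), (False, 1800), (True, 660), (True, 700),
]

success_rate = sum(1 for ok, _ in runs if ok) / len(runs)

durations = sorted(d for _, d in runs)
# 95th percentile of execution time (index 94 of the 99 percentile cut points).
p95 = quantiles(durations, n=100)[94]

print(f"Task success rate: {success_rate:.1%}")
print(f"Pipeline execution time (p95): {p95 / 60:.1f} min")
```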

## 🚀 Deployment Pipeline

### CI/CD for ETL Pipelines
```yaml
# .github/workflows/deploy-etl-pipeline.yml
name: Deploy ETL Pipeline

on:
  push:
    paths:
      - 'pipelines/**'
      - 'transformations/**'
    branches:
      - main

jobs:
  test-and-deploy:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install pytest pytest-cov

      - name: Run unit tests
        run: pytest tests/unit/ --cov=pipelines

      - name: Validate pipeline definitions
        run: python scripts/validate_pipelines.py

      - name: Test schema validation
        run: pytest tests/schema_validation/

      - name: Run data quality tests
        run: pytest tests/data_quality/

      - name: Security scan
        run: |
          pip install bandit safety
          bandit -r pipelines/
          safety check

      - name: Deploy to staging
        run: |
          python scripts/deploy_pipeline.py \
            --environment staging \
            --pipeline customer_etl

      - name: Run integration tests
        run: pytest tests/integration/ --env staging

      - name: Deploy to production
        if: success()
        run: |
          python scripts/deploy_pipeline.py \
            --environment production \
            --pipeline customer_etl

      - name: Monitor pipeline health
        run: python scripts/monitor_pipeline.py --duration 1h

      - name: Generate deployment report
        run: python scripts/generate_deployment_report.py
```
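
The "Validate pipeline definitions" step above calls `scripts/validate_pipelines.py`, which is not reproduced here. A minimal sketch of what such a check could look like, assuming (purely for illustration) that pipeline definitions are YAML files with `name`, `schedule`, and `tasks` keys and that PyYAML is available.

```python
import sys
from pathlib import Path

import yaml  # PyYAML, assumed to be in requirements.txt

REQUIRED_KEYS = {"name", "schedule", "tasks"}

def validate_definition(path: Path) -> list[str]:
    """Return a list of validation errors for one pipeline definition file."""
    spec = yaml.safe_load(path.read_text())
    if not isinstance(spec, dict):
        return [f"{path}: definition is not a mapping"]
    errors = []
    missing = REQUIRED_KEYS - spec.keys()
    if missing:
        errors.append(f"{path}: missing keys {sorted(missing)}")
    if len(str(spec.get("schedule", "")).split()) != 5:
        errors.append(f"{path}: schedule is not a 5-field cron expression")
    return errors

if __name__ == "__main__":
    all_errors = [e for p in Path("pipelines").glob("**/*.yml") for e in validate_definition(p)]
    for error in all_errors:
        print(error)
    sys.exit(1 if all_errors else 0)
```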

## 🔄 Integration Workflow

### End-to-End Pipeline Workflow
```
1. Schedule Trigger (Airflow/Prefect/ADF)

2. Extract from Sources (API/DB/Files)

3. Validate Schema (de-03)

4. Security Scan & PII Detection (sa-01)

5. Transform Data (de-02)

6. Data Quality Checks (de-03)

7. Load to Lakehouse (de-01)
   ├── Bronze Layer (raw)
   ├── Silver Layer (cleaned)
   └── Gold Layer (business-ready)

8. Trigger Downstream Pipelines
   ├── Feature Store Update (ml-02)
   ├── RAG Knowledge Base (ai-02)
   ├── Analytics Refresh (ds-01)
   └── Model Retraining (ml-01)

9. Monitor & Alert (do-08)

10. Cost Tracking & Optimization (fo-01)
```
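
The downstream fan-out in step 8 maps naturally onto the task-dependency API shown in the Quick Start. The fragment below sketches that wiring with the same `depends_on` mechanism; the downstream task names and bodies are illustrative placeholders, and `transform_customers`/`load_customers` are the tasks defined in the Quick Start example.

```python
from orchestrator import Pipeline

pipeline = Pipeline(name="customer_etl", schedule="0 2 * * *")

@pipeline.task(name="load_customers", depends_on=["transform_customers"])
def load(data):
    ...  # step 7: write to the lakehouse

# Step 8: fan out to downstream consumers once the load has succeeded.
@pipeline.task(name="refresh_feature_store", depends_on=["load_customers"])
def refresh_feature_store():
    ...  # ml-02: push updated features

@pipeline.task(name="update_rag_knowledge_base", depends_on=["load_customers"])
def update_rag_knowledge_base():
    ...  # ai-02: re-index documents for retrieval
```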

## 🎯 Quick Wins

1. **Implement incremental loading** - 60-80% cost reduction vs full loads
2. **Add schema validation** - Catch data issues early
3. **Set up pipeline monitoring** - Reduce MTTR by 50%
4. **Use managed identities** - Eliminate credential management overhead
5. **Enable auto-scaling** - 30-50% compute cost savings
6. **Implement retry logic** - Improve reliability to 99%+
7. **Add data quality checks** - Prevent downstream issues
8. **Set up cost alerts** - Avoid budget overruns
9. **Containerize transformations** - Portable, reproducible pipelines
10. **Enable diagnostic logging** - Full observability and debugging