tech-hub-skills 1.2.0 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/{LICENSE → .claude/LICENSE} +21 -21
- package/.claude/README.md +291 -0
- package/.claude/bin/cli.js +266 -0
- package/{bin → .claude/bin}/copilot.js +182 -182
- package/{bin → .claude/bin}/postinstall.js +42 -42
- package/{tech_hub_skills/skills → .claude/commands}/README.md +336 -336
- package/{tech_hub_skills/skills → .claude/commands}/ai-engineer.md +104 -104
- package/{tech_hub_skills/skills → .claude/commands}/aws.md +143 -143
- package/{tech_hub_skills/skills → .claude/commands}/azure.md +149 -149
- package/{tech_hub_skills/skills → .claude/commands}/backend-developer.md +108 -108
- package/{tech_hub_skills/skills → .claude/commands}/code-review.md +399 -399
- package/{tech_hub_skills/skills → .claude/commands}/compliance-automation.md +747 -747
- package/{tech_hub_skills/skills → .claude/commands}/compliance-officer.md +108 -108
- package/{tech_hub_skills/skills → .claude/commands}/data-engineer.md +113 -113
- package/{tech_hub_skills/skills → .claude/commands}/data-governance.md +102 -102
- package/{tech_hub_skills/skills → .claude/commands}/data-scientist.md +123 -123
- package/{tech_hub_skills/skills → .claude/commands}/database-admin.md +109 -109
- package/{tech_hub_skills/skills → .claude/commands}/devops.md +160 -160
- package/{tech_hub_skills/skills → .claude/commands}/docker.md +160 -160
- package/{tech_hub_skills/skills → .claude/commands}/enterprise-dashboard.md +613 -613
- package/{tech_hub_skills/skills → .claude/commands}/finops.md +184 -184
- package/{tech_hub_skills/skills → .claude/commands}/frontend-developer.md +108 -108
- package/{tech_hub_skills/skills → .claude/commands}/gcp.md +143 -143
- package/{tech_hub_skills/skills → .claude/commands}/ml-engineer.md +115 -115
- package/{tech_hub_skills/skills → .claude/commands}/mlops.md +187 -187
- package/{tech_hub_skills/skills → .claude/commands}/network-engineer.md +109 -109
- package/{tech_hub_skills/skills → .claude/commands}/optimization-advisor.md +329 -329
- package/{tech_hub_skills/skills → .claude/commands}/orchestrator.md +623 -623
- package/{tech_hub_skills/skills → .claude/commands}/platform-engineer.md +102 -102
- package/{tech_hub_skills/skills → .claude/commands}/process-automation.md +226 -226
- package/{tech_hub_skills/skills → .claude/commands}/process-changelog.md +184 -184
- package/{tech_hub_skills/skills → .claude/commands}/process-documentation.md +484 -484
- package/{tech_hub_skills/skills → .claude/commands}/process-kanban.md +324 -324
- package/{tech_hub_skills/skills → .claude/commands}/process-versioning.md +214 -214
- package/{tech_hub_skills/skills → .claude/commands}/product-designer.md +104 -104
- package/{tech_hub_skills/skills → .claude/commands}/project-starter.md +443 -443
- package/{tech_hub_skills/skills → .claude/commands}/qa-engineer.md +109 -109
- package/{tech_hub_skills/skills → .claude/commands}/security-architect.md +135 -135
- package/{tech_hub_skills/skills → .claude/commands}/sre.md +109 -109
- package/{tech_hub_skills/skills → .claude/commands}/system-design.md +126 -126
- package/{tech_hub_skills/skills → .claude/commands}/technical-writer.md +101 -101
- package/.claude/package.json +46 -0
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -0
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -0
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -0
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
- package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -0
- package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -0
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
- package/{tech_hub_skills → .claude}/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/02-data-factory/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/03-synapse-analytics/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/04-databricks/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/05-functions/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/06-kubernetes-service/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/07-openai-service/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/08-machine-learning/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/09-storage-adls/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/10-networking/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/11-sql-cosmos/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/12-event-hubs/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/code-review/skills/01-automated-code-review/README.md +394 -394
- package/{tech_hub_skills → .claude}/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
- package/{tech_hub_skills → .claude}/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
- package/{tech_hub_skills → .claude}/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
- package/{tech_hub_skills → .claude}/roles/code-review/skills/05-review-analytics/README.md +540 -540
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
- package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -0
- package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -0
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/03-data-quality/README.md +579 -579
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/01-data-catalog/README.md +112 -112
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/02-data-lineage/README.md +129 -129
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/04-access-control/README.md +39 -39
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/05-master-data-management/README.md +40 -40
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
- package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -0
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/07-experimentation/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/02-container-orchestration/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/04-gitops/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/05-environment-management/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/06-automated-testing/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/07-release-management/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/09-devsecops/README.md +265 -265
- package/{tech_hub_skills → .claude}/roles/finops/skills/01-cost-visibility/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/02-resource-tagging/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/03-budget-management/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/04-reserved-instances/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/05-spot-optimization/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/06-storage-tiering/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/08-chargeback/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/03-model-training/README.md +704 -704
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
- package/{tech_hub_skills → .claude}/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/03-model-registry/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/04-feature-store/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/05-model-deployment/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/06-model-observability/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/07-data-versioning/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/08-ab-testing/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/09-automated-retraining/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/02-user-research/README.md +382 -382
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/04-ux-design/README.md +496 -496
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/01-pii-detection/README.md +319 -319
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/04-iam/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/05-application-security/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/06-secrets-management/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
- package/{tech_hub_skills → .claude}/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/03-scalability/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/04-high-availability/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/06-api-design/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/07-observability-architecture/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
- package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/README.md +521 -521
- package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -0
- package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -0
- package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -0
- package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -0
- package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -0
- package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -0
- package/.claude/skills/README.md +336 -0
- package/.claude/skills/ai-engineer.md +104 -0
- package/.claude/skills/aws.md +143 -0
- package/.claude/skills/azure.md +149 -0
- package/.claude/skills/backend-developer.md +108 -0
- package/.claude/skills/code-review.md +399 -0
- package/.claude/skills/compliance-automation.md +747 -0
- package/.claude/skills/compliance-officer.md +108 -0
- package/.claude/skills/data-engineer.md +113 -0
- package/.claude/skills/data-governance.md +102 -0
- package/.claude/skills/data-scientist.md +123 -0
- package/.claude/skills/database-admin.md +109 -0
- package/.claude/skills/devops.md +160 -0
- package/.claude/skills/docker.md +160 -0
- package/.claude/skills/enterprise-dashboard.md +613 -0
- package/.claude/skills/finops.md +184 -0
- package/.claude/skills/frontend-developer.md +108 -0
- package/.claude/skills/gcp.md +143 -0
- package/.claude/skills/ml-engineer.md +115 -0
- package/.claude/skills/mlops.md +187 -0
- package/.claude/skills/network-engineer.md +109 -0
- package/.claude/skills/optimization-advisor.md +329 -0
- package/.claude/skills/orchestrator.md +623 -0
- package/.claude/skills/platform-engineer.md +102 -0
- package/.claude/skills/process-automation.md +226 -0
- package/.claude/skills/process-changelog.md +184 -0
- package/.claude/skills/process-documentation.md +484 -0
- package/.claude/skills/process-kanban.md +324 -0
- package/.claude/skills/process-versioning.md +214 -0
- package/.claude/skills/product-designer.md +104 -0
- package/.claude/skills/project-starter.md +443 -0
- package/.claude/skills/qa-engineer.md +109 -0
- package/.claude/skills/security-architect.md +135 -0
- package/.claude/skills/sre.md +109 -0
- package/.claude/skills/system-design.md +126 -0
- package/.claude/skills/technical-writer.md +101 -0
- package/.gitattributes +2 -0
- package/GITHUB_COPILOT.md +106 -0
- package/README.md +192 -291
- package/package.json +16 -46
- package/bin/cli.js +0 -241
|
@@ -1,112 +1,112 @@
|
|
|
1
|
-
# dg-01: Data Catalog
|
|
2
|
-
|
|
3
|
-
## Overview
|
|
4
|
-
|
|
5
|
-
Build enterprise data catalogs for asset discovery, metadata management, and data classification.
|
|
6
|
-
|
|
7
|
-
## Key Capabilities
|
|
8
|
-
|
|
9
|
-
- **Asset Registration**: Automated discovery and registration of data assets
|
|
10
|
-
- **Metadata Management**: Technical, business, and operational metadata
|
|
11
|
-
- **Data Classification**: Automatic classification (PII, confidential, public)
|
|
12
|
-
- **Search & Discovery**: Powerful search capabilities for data consumers
|
|
13
|
-
- **Business Glossary**: Standardized business terminology
|
|
14
|
-
|
|
15
|
-
## Tools & Technologies
|
|
16
|
-
|
|
17
|
-
- **Azure Purview**: Enterprise data catalog
|
|
18
|
-
- **DataHub**: Open-source metadata platform
|
|
19
|
-
- **Amundsen**: Lyft's data discovery platform
|
|
20
|
-
- **Collibra**: Data governance platform
|
|
21
|
-
|
|
22
|
-
## Implementation
|
|
23
|
-
|
|
24
|
-
### 1. Asset Registration
|
|
25
|
-
|
|
26
|
-
```python
|
|
27
|
-
# Automated asset registration
|
|
28
|
-
from azure.purview.catalog import PurviewCatalogClient
|
|
29
|
-
|
|
30
|
-
def register_data_asset(asset_name, asset_type, location):
|
|
31
|
-
"""Register data asset in catalog"""
|
|
32
|
-
client = PurviewCatalogClient()
|
|
33
|
-
|
|
34
|
-
asset = {
|
|
35
|
-
"typeName": asset_type,
|
|
36
|
-
"attributes": {
|
|
37
|
-
"name": asset_name,
|
|
38
|
-
"qualifiedName": f"{location}/{asset_name}",
|
|
39
|
-
"location": location
|
|
40
|
-
}
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
return client.entity.create_or_update(entity=asset)
|
|
44
|
-
```
|
|
45
|
-
|
|
46
|
-
### 2. Metadata Management
|
|
47
|
-
|
|
48
|
-
```python
|
|
49
|
-
# Add business metadata
|
|
50
|
-
def add_business_metadata(asset_id, owner, description, tags):
|
|
51
|
-
"""Enrich asset with business context"""
|
|
52
|
-
metadata = {
|
|
53
|
-
"businessOwner": owner,
|
|
54
|
-
"description": description,
|
|
55
|
-
"tags": tags,
|
|
56
|
-
"certification": "certified"
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
return client.entity.add_business_metadata(
|
|
60
|
-
guid=asset_id,
|
|
61
|
-
business_metadata=metadata
|
|
62
|
-
)
|
|
63
|
-
```
|
|
64
|
-
|
|
65
|
-
### 3. Data Classification
|
|
66
|
-
|
|
67
|
-
```python
|
|
68
|
-
# Automatic classification
|
|
69
|
-
def classify_data(asset_id):
|
|
70
|
-
"""Apply automatic classification based on content"""
|
|
71
|
-
classifications = []
|
|
72
|
-
|
|
73
|
-
# Scan for PII
|
|
74
|
-
if contains_pii(asset_id):
|
|
75
|
-
classifications.append("PII")
|
|
76
|
-
|
|
77
|
-
# Scan for confidential data
|
|
78
|
-
if contains_confidential(asset_id):
|
|
79
|
-
classifications.append("Confidential")
|
|
80
|
-
|
|
81
|
-
return client.entity.add_classifications(
|
|
82
|
-
guid=asset_id,
|
|
83
|
-
classifications=classifications
|
|
84
|
-
)
|
|
85
|
-
```
|
|
86
|
-
|
|
87
|
-
## Best Practices
|
|
88
|
-
|
|
89
|
-
1. **Automate Discovery** - Use scanners to auto-discover assets
|
|
90
|
-
2. **Enrich Metadata** - Add business context, not just technical
|
|
91
|
-
3. **Clear Ownership** - Every asset needs a business owner
|
|
92
|
-
4. **Regular Updates** - Keep metadata fresh and relevant
|
|
93
|
-
5. **User Training** - Train users on search capabilities
|
|
94
|
-
|
|
95
|
-
## Cost Optimization
|
|
96
|
-
|
|
97
|
-
- Use Azure Purview Standard tier for < 100k assets
|
|
98
|
-
- Schedule scans during off-peak hours
|
|
99
|
-
- Use incremental scans instead of full scans
|
|
100
|
-
- Archive unused asset metadata
|
|
101
|
-
|
|
102
|
-
## Integration
|
|
103
|
-
|
|
104
|
-
**Connects with:**
|
|
105
|
-
- de-01 (Lakehouse): Catalog lakehouse tables
|
|
106
|
-
- sa-01 (PII Detection): Auto-classify PII data
|
|
107
|
-
- dg-02 (Lineage): Link to lineage tracking
|
|
108
|
-
- dg-03 (Quality): Link quality scores
|
|
109
|
-
|
|
110
|
-
## Quick Win
|
|
111
|
-
|
|
112
|
-
Start with top 10 critical datasets, manually catalog them with rich metadata, then expand automated discovery.
|
|
1
|
+
# dg-01: Data Catalog
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Build enterprise data catalogs for asset discovery, metadata management, and data classification.
|
|
6
|
+
|
|
7
|
+
## Key Capabilities
|
|
8
|
+
|
|
9
|
+
- **Asset Registration**: Automated discovery and registration of data assets
|
|
10
|
+
- **Metadata Management**: Technical, business, and operational metadata
|
|
11
|
+
- **Data Classification**: Automatic classification (PII, confidential, public)
|
|
12
|
+
- **Search & Discovery**: Powerful search capabilities for data consumers
|
|
13
|
+
- **Business Glossary**: Standardized business terminology
|
|
14
|
+
|
|
15
|
+
## Tools & Technologies
|
|
16
|
+
|
|
17
|
+
- **Azure Purview**: Enterprise data catalog
|
|
18
|
+
- **DataHub**: Open-source metadata platform
|
|
19
|
+
- **Amundsen**: Lyft's data discovery platform
|
|
20
|
+
- **Collibra**: Data governance platform
|
|
21
|
+
|
|
22
|
+
## Implementation
|
|
23
|
+
|
|
24
|
+
### 1. Asset Registration
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
# Automated asset registration
|
|
28
|
+
from azure.purview.catalog import PurviewCatalogClient
|
|
29
|
+
|
|
30
|
+
def register_data_asset(asset_name, asset_type, location):
|
|
31
|
+
"""Register data asset in catalog"""
|
|
32
|
+
client = PurviewCatalogClient()
|
|
33
|
+
|
|
34
|
+
asset = {
|
|
35
|
+
"typeName": asset_type,
|
|
36
|
+
"attributes": {
|
|
37
|
+
"name": asset_name,
|
|
38
|
+
"qualifiedName": f"{location}/{asset_name}",
|
|
39
|
+
"location": location
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
return client.entity.create_or_update(entity=asset)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### 2. Metadata Management
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
# Add business metadata
|
|
50
|
+
def add_business_metadata(asset_id, owner, description, tags):
|
|
51
|
+
"""Enrich asset with business context"""
|
|
52
|
+
metadata = {
|
|
53
|
+
"businessOwner": owner,
|
|
54
|
+
"description": description,
|
|
55
|
+
"tags": tags,
|
|
56
|
+
"certification": "certified"
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
return client.entity.add_business_metadata(
|
|
60
|
+
guid=asset_id,
|
|
61
|
+
business_metadata=metadata
|
|
62
|
+
)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### 3. Data Classification
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
# Automatic classification
|
|
69
|
+
def classify_data(asset_id):
|
|
70
|
+
"""Apply automatic classification based on content"""
|
|
71
|
+
classifications = []
|
|
72
|
+
|
|
73
|
+
# Scan for PII
|
|
74
|
+
if contains_pii(asset_id):
|
|
75
|
+
classifications.append("PII")
|
|
76
|
+
|
|
77
|
+
# Scan for confidential data
|
|
78
|
+
if contains_confidential(asset_id):
|
|
79
|
+
classifications.append("Confidential")
|
|
80
|
+
|
|
81
|
+
return client.entity.add_classifications(
|
|
82
|
+
guid=asset_id,
|
|
83
|
+
classifications=classifications
|
|
84
|
+
)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Best Practices
|
|
88
|
+
|
|
89
|
+
1. **Automate Discovery** - Use scanners to auto-discover assets
|
|
90
|
+
2. **Enrich Metadata** - Add business context, not just technical details
|
|
91
|
+
3. **Clear Ownership** - Every asset needs a business owner
|
|
92
|
+
4. **Regular Updates** - Keep metadata fresh and relevant
|
|
93
|
+
5. **User Training** - Train users on search capabilities
|
|
94
|
+
|
|
95
|
+
## Cost Optimization
|
|
96
|
+
|
|
97
|
+
- Use Azure Purview Standard tier for < 100k assets
|
|
98
|
+
- Schedule scans during off-peak hours
|
|
99
|
+
- Use incremental scans instead of full scans
|
|
100
|
+
- Archive unused asset metadata
|
|
101
|
+
|
|
102
|
+
## Integration
|
|
103
|
+
|
|
104
|
+
**Connects with:**
|
|
105
|
+
- de-01 (Lakehouse): Catalog lakehouse tables
|
|
106
|
+
- sa-01 (PII Detection): Auto-classify PII data
|
|
107
|
+
- dg-02 (Lineage): Link to lineage tracking
|
|
108
|
+
- dg-03 (Quality): Link quality scores
|
|
109
|
+
|
|
110
|
+
## Quick Win
|
|
111
|
+
|
|
112
|
+
Start with top 10 critical datasets, manually catalog them with rich metadata, then expand automated discovery.
|
|
@@ -1,129 +1,129 @@
|
|
|
1
|
-
# dg-02: Data Lineage
|
|
2
|
-
|
|
3
|
-
## Overview
|
|
4
|
-
|
|
5
|
-
Track end-to-end data lineage for impact analysis, root cause analysis, and regulatory compliance.
|
|
6
|
-
|
|
7
|
-
## Key Capabilities
|
|
8
|
-
|
|
9
|
-
- **End-to-End Lineage**: From source to consumption
|
|
10
|
-
- **Impact Analysis**: Understand downstream impacts
|
|
11
|
-
- **Root Cause Analysis**: Trace issues to source
|
|
12
|
-
- **Column-Level Lineage**: Field-level tracking
|
|
13
|
-
- **Transformation Documentation**: Track data transformations
|
|
14
|
-
|
|
15
|
-
## Tools & Technologies
|
|
16
|
-
|
|
17
|
-
- **Azure Purview**: Native lineage tracking
|
|
18
|
-
- **OpenLineage**: Open standard for lineage
|
|
19
|
-
- **Marquez**: Metadata service for lineage
|
|
20
|
-
- **Spline**: Spark lineage tracking
|
|
21
|
-
|
|
22
|
-
## Implementation
|
|
23
|
-
|
|
24
|
-
### 1. Lineage Extraction
|
|
25
|
-
|
|
26
|
-
```python
|
|
27
|
-
# Extract lineage from Spark jobs
|
|
28
|
-
from spline import SplineAgent
|
|
29
|
-
|
|
30
|
-
def track_spark_lineage(spark_session):
|
|
31
|
-
"""Enable lineage tracking for Spark"""
|
|
32
|
-
spark_session.sparkContext.setLogLevel("INFO")
|
|
33
|
-
|
|
34
|
-
# Initialize Spline agent
|
|
35
|
-
SplineAgent.builder() \
|
|
36
|
-
.appName("data-pipeline") \
|
|
37
|
-
.mode("REQUIRED") \
|
|
38
|
-
.url("http://spline-server:9090") \
|
|
39
|
-
.build()
|
|
40
|
-
```
|
|
41
|
-
|
|
42
|
-
### 2. Column-Level Lineage
|
|
43
|
-
|
|
44
|
-
```sql
|
|
45
|
-
-- Azure Purview automatically tracks column lineage
|
|
46
|
-
-- Example transformation with lineage
|
|
47
|
-
CREATE VIEW customer_360 AS
|
|
48
|
-
SELECT
|
|
49
|
-
c.customer_id,
|
|
50
|
-
c.first_name || ' ' || c.last_name as full_name, -- Lineage: derived
|
|
51
|
-
o.total_orders,
|
|
52
|
-
p.total_payments
|
|
53
|
-
FROM customers c
|
|
54
|
-
LEFT JOIN order_summary o ON c.customer_id = o.customer_id
|
|
55
|
-
LEFT JOIN payment_summary p ON c.customer_id = p.customer_id;
|
|
56
|
-
```
|
|
57
|
-
|
|
58
|
-
### 3. Impact Analysis
|
|
59
|
-
|
|
60
|
-
```python
|
|
61
|
-
# Find downstream dependencies
|
|
62
|
-
def get_downstream_impact(asset_id):
|
|
63
|
-
"""Find all downstream assets affected by changes"""
|
|
64
|
-
lineage = client.lineage.get_lineage(
|
|
65
|
-
guid=asset_id,
|
|
66
|
-
direction="OUTPUT",
|
|
67
|
-
depth=10
|
|
68
|
-
)
|
|
69
|
-
|
|
70
|
-
downstream_assets = []
|
|
71
|
-
for entity in lineage['guidEntityMap'].values():
|
|
72
|
-
downstream_assets.append({
|
|
73
|
-
'name': entity['attributes']['name'],
|
|
74
|
-
'type': entity['typeName'],
|
|
75
|
-
'owner': entity.get('attributes', {}).get('owner')
|
|
76
|
-
})
|
|
77
|
-
|
|
78
|
-
return downstream_assets
|
|
79
|
-
```
|
|
80
|
-
|
|
81
|
-
### 4. OpenLineage Integration
|
|
82
|
-
|
|
83
|
-
```python
|
|
84
|
-
# Emit lineage events using OpenLineage
|
|
85
|
-
from openlineage.client import OpenLineageClient
|
|
86
|
-
from openlineage.client.run import RunEvent, RunState, Run, Job
|
|
87
|
-
|
|
88
|
-
def emit_lineage_event(job_name, inputs, outputs):
|
|
89
|
-
"""Emit lineage event to OpenLineage"""
|
|
90
|
-
client = OpenLineageClient(url="http://lineage-api:5000")
|
|
91
|
-
|
|
92
|
-
event = RunEvent(
|
|
93
|
-
eventType=RunState.COMPLETE,
|
|
94
|
-
eventTime="2025-01-01T00:00:00Z",
|
|
95
|
-
run=Run(runId=str(uuid.uuid4())),
|
|
96
|
-
job=Job(namespace="production", name=job_name),
|
|
97
|
-
inputs=inputs,
|
|
98
|
-
outputs=outputs
|
|
99
|
-
)
|
|
100
|
-
|
|
101
|
-
client.emit(event)
|
|
102
|
-
```
|
|
103
|
-
|
|
104
|
-
## Best Practices
|
|
105
|
-
|
|
106
|
-
1. **Automate Collection** - Manual lineage doesn't scale
|
|
107
|
-
2. **Column-Level Tracking** - For sensitive data, track field-level
|
|
108
|
-
3. **Version Control** - Track lineage changes over time
|
|
109
|
-
4. **Clear Visualization** - Make lineage easy to understand
|
|
110
|
-
5. **Regular Validation** - Verify lineage accuracy
|
|
111
|
-
|
|
112
|
-
## Cost Optimization
|
|
113
|
-
|
|
114
|
-
- Use incremental lineage updates
|
|
115
|
-
- Archive old lineage data after retention period
|
|
116
|
-
- Cache frequently accessed lineage queries
|
|
117
|
-
- Use materialized views for complex lineage
|
|
118
|
-
|
|
119
|
-
## Integration
|
|
120
|
-
|
|
121
|
-
**Connects with:**
|
|
122
|
-
- de-02 (ETL): Track pipeline lineage
|
|
123
|
-
- dg-01 (Catalog): Link assets to lineage
|
|
124
|
-
- ml-02 (Feature Engineering): Track feature lineage
|
|
125
|
-
- ai-02 (RAG): Track document lineage
|
|
126
|
-
|
|
127
|
-
## Quick Win
|
|
128
|
-
|
|
129
|
-
Start with 1 critical data pipeline, manually document lineage, validate accuracy, then automate extraction.
|
|
1
|
+
# dg-02: Data Lineage
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Track end-to-end data lineage for impact analysis, root cause analysis, and regulatory compliance.
|
|
6
|
+
|
|
7
|
+
## Key Capabilities
|
|
8
|
+
|
|
9
|
+
- **End-to-End Lineage**: From source to consumption
|
|
10
|
+
- **Impact Analysis**: Understand downstream impacts
|
|
11
|
+
- **Root Cause Analysis**: Trace issues to source
|
|
12
|
+
- **Column-Level Lineage**: Field-level tracking
|
|
13
|
+
- **Transformation Documentation**: Track data transformations
|
|
14
|
+
|
|
15
|
+
## Tools & Technologies
|
|
16
|
+
|
|
17
|
+
- **Azure Purview**: Native lineage tracking
|
|
18
|
+
- **OpenLineage**: Open standard for lineage
|
|
19
|
+
- **Marquez**: Metadata service for lineage
|
|
20
|
+
- **Spline**: Spark lineage tracking
|
|
21
|
+
|
|
22
|
+
## Implementation
|
|
23
|
+
|
|
24
|
+
### 1. Lineage Extraction
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
# Extract lineage from Spark jobs
|
|
28
|
+
from spline import SplineAgent
|
|
29
|
+
|
|
30
|
+
def track_spark_lineage(spark_session):
|
|
31
|
+
"""Enable lineage tracking for Spark"""
|
|
32
|
+
spark_session.sparkContext.setLogLevel("INFO")
|
|
33
|
+
|
|
34
|
+
# Initialize Spline agent
|
|
35
|
+
SplineAgent.builder() \
|
|
36
|
+
.appName("data-pipeline") \
|
|
37
|
+
.mode("REQUIRED") \
|
|
38
|
+
.url("http://spline-server:9090") \
|
|
39
|
+
.build()
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### 2. Column-Level Lineage
|
|
43
|
+
|
|
44
|
+
```sql
|
|
45
|
+
-- Azure Purview automatically tracks column lineage
|
|
46
|
+
-- Example transformation with lineage
|
|
47
|
+
CREATE VIEW customer_360 AS
|
|
48
|
+
SELECT
|
|
49
|
+
c.customer_id,
|
|
50
|
+
c.first_name || ' ' || c.last_name as full_name, -- Lineage: derived
|
|
51
|
+
o.total_orders,
|
|
52
|
+
p.total_payments
|
|
53
|
+
FROM customers c
|
|
54
|
+
LEFT JOIN order_summary o ON c.customer_id = o.customer_id
|
|
55
|
+
LEFT JOIN payment_summary p ON c.customer_id = p.customer_id;
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### 3. Impact Analysis
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
# Find downstream dependencies
|
|
62
|
+
def get_downstream_impact(asset_id):
|
|
63
|
+
"""Find all downstream assets affected by changes"""
|
|
64
|
+
lineage = client.lineage.get_lineage(
|
|
65
|
+
guid=asset_id,
|
|
66
|
+
direction="OUTPUT",
|
|
67
|
+
depth=10
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
downstream_assets = []
|
|
71
|
+
for entity in lineage['guidEntityMap'].values():
|
|
72
|
+
downstream_assets.append({
|
|
73
|
+
'name': entity['attributes']['name'],
|
|
74
|
+
'type': entity['typeName'],
|
|
75
|
+
'owner': entity.get('attributes', {}).get('owner')
|
|
76
|
+
})
|
|
77
|
+
|
|
78
|
+
return downstream_assets
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### 4. OpenLineage Integration
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
# Emit lineage events using OpenLineage
|
|
85
|
+
from openlineage.client import OpenLineageClient
|
|
86
|
+
from openlineage.client.run import RunEvent, RunState, Run, Job
|
|
87
|
+
|
|
88
|
+
def emit_lineage_event(job_name, inputs, outputs):
|
|
89
|
+
"""Emit lineage event to OpenLineage"""
|
|
90
|
+
client = OpenLineageClient(url="http://lineage-api:5000")
|
|
91
|
+
|
|
92
|
+
event = RunEvent(
|
|
93
|
+
eventType=RunState.COMPLETE,
|
|
94
|
+
eventTime="2025-01-01T00:00:00Z",
|
|
95
|
+
run=Run(runId=str(uuid.uuid4())),
|
|
96
|
+
job=Job(namespace="production", name=job_name),
|
|
97
|
+
inputs=inputs,
|
|
98
|
+
outputs=outputs
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
client.emit(event)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Best Practices
|
|
105
|
+
|
|
106
|
+
1. **Automate Collection** - Manual lineage doesn't scale
|
|
107
|
+
2. **Column-Level Tracking** - For sensitive data, track lineage at the field level
|
|
108
|
+
3. **Version Control** - Track lineage changes over time
|
|
109
|
+
4. **Clear Visualization** - Make lineage easy to understand
|
|
110
|
+
5. **Regular Validation** - Verify lineage accuracy
|
|
111
|
+
|
|
112
|
+
## Cost Optimization
|
|
113
|
+
|
|
114
|
+
- Use incremental lineage updates
|
|
115
|
+
- Archive old lineage data after retention period
|
|
116
|
+
- Cache frequently accessed lineage queries
|
|
117
|
+
- Use materialized views for complex lineage
|
|
118
|
+
|
|
119
|
+
## Integration
|
|
120
|
+
|
|
121
|
+
**Connects with:**
|
|
122
|
+
- de-02 (ETL): Track pipeline lineage
|
|
123
|
+
- dg-01 (Catalog): Link assets to lineage
|
|
124
|
+
- ml-02 (Feature Engineering): Track feature lineage
|
|
125
|
+
- ai-02 (RAG): Track document lineage
|
|
126
|
+
|
|
127
|
+
## Quick Win
|
|
128
|
+
|
|
129
|
+
Start with 1 critical data pipeline, manually document lineage, validate accuracy, then automate extraction.
|