tech-hub-skills 1.2.0 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. package/{LICENSE → .claude/LICENSE} +21 -21
  2. package/.claude/README.md +291 -0
  3. package/.claude/bin/cli.js +266 -0
  4. package/{bin → .claude/bin}/copilot.js +182 -182
  5. package/{bin → .claude/bin}/postinstall.js +42 -42
  6. package/{tech_hub_skills/skills → .claude/commands}/README.md +336 -336
  7. package/{tech_hub_skills/skills → .claude/commands}/ai-engineer.md +104 -104
  8. package/{tech_hub_skills/skills → .claude/commands}/aws.md +143 -143
  9. package/{tech_hub_skills/skills → .claude/commands}/azure.md +149 -149
  10. package/{tech_hub_skills/skills → .claude/commands}/backend-developer.md +108 -108
  11. package/{tech_hub_skills/skills → .claude/commands}/code-review.md +399 -399
  12. package/{tech_hub_skills/skills → .claude/commands}/compliance-automation.md +747 -747
  13. package/{tech_hub_skills/skills → .claude/commands}/compliance-officer.md +108 -108
  14. package/{tech_hub_skills/skills → .claude/commands}/data-engineer.md +113 -113
  15. package/{tech_hub_skills/skills → .claude/commands}/data-governance.md +102 -102
  16. package/{tech_hub_skills/skills → .claude/commands}/data-scientist.md +123 -123
  17. package/{tech_hub_skills/skills → .claude/commands}/database-admin.md +109 -109
  18. package/{tech_hub_skills/skills → .claude/commands}/devops.md +160 -160
  19. package/{tech_hub_skills/skills → .claude/commands}/docker.md +160 -160
  20. package/{tech_hub_skills/skills → .claude/commands}/enterprise-dashboard.md +613 -613
  21. package/{tech_hub_skills/skills → .claude/commands}/finops.md +184 -184
  22. package/{tech_hub_skills/skills → .claude/commands}/frontend-developer.md +108 -108
  23. package/{tech_hub_skills/skills → .claude/commands}/gcp.md +143 -143
  24. package/{tech_hub_skills/skills → .claude/commands}/ml-engineer.md +115 -115
  25. package/{tech_hub_skills/skills → .claude/commands}/mlops.md +187 -187
  26. package/{tech_hub_skills/skills → .claude/commands}/network-engineer.md +109 -109
  27. package/{tech_hub_skills/skills → .claude/commands}/optimization-advisor.md +329 -329
  28. package/{tech_hub_skills/skills → .claude/commands}/orchestrator.md +623 -623
  29. package/{tech_hub_skills/skills → .claude/commands}/platform-engineer.md +102 -102
  30. package/{tech_hub_skills/skills → .claude/commands}/process-automation.md +226 -226
  31. package/{tech_hub_skills/skills → .claude/commands}/process-changelog.md +184 -184
  32. package/{tech_hub_skills/skills → .claude/commands}/process-documentation.md +484 -484
  33. package/{tech_hub_skills/skills → .claude/commands}/process-kanban.md +324 -324
  34. package/{tech_hub_skills/skills → .claude/commands}/process-versioning.md +214 -214
  35. package/{tech_hub_skills/skills → .claude/commands}/product-designer.md +104 -104
  36. package/{tech_hub_skills/skills → .claude/commands}/project-starter.md +443 -443
  37. package/{tech_hub_skills/skills → .claude/commands}/qa-engineer.md +109 -109
  38. package/{tech_hub_skills/skills → .claude/commands}/security-architect.md +135 -135
  39. package/{tech_hub_skills/skills → .claude/commands}/sre.md +109 -109
  40. package/{tech_hub_skills/skills → .claude/commands}/system-design.md +126 -126
  41. package/{tech_hub_skills/skills → .claude/commands}/technical-writer.md +101 -101
  42. package/.claude/package.json +46 -0
  43. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
  44. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -0
  45. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -0
  46. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -0
  47. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
  48. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -0
  49. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -0
  50. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
  51. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
  52. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
  53. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
  54. package/{tech_hub_skills → .claude}/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
  55. package/{tech_hub_skills → .claude}/roles/azure/skills/02-data-factory/README.md +264 -264
  56. package/{tech_hub_skills → .claude}/roles/azure/skills/03-synapse-analytics/README.md +264 -264
  57. package/{tech_hub_skills → .claude}/roles/azure/skills/04-databricks/README.md +264 -264
  58. package/{tech_hub_skills → .claude}/roles/azure/skills/05-functions/README.md +264 -264
  59. package/{tech_hub_skills → .claude}/roles/azure/skills/06-kubernetes-service/README.md +264 -264
  60. package/{tech_hub_skills → .claude}/roles/azure/skills/07-openai-service/README.md +264 -264
  61. package/{tech_hub_skills → .claude}/roles/azure/skills/08-machine-learning/README.md +264 -264
  62. package/{tech_hub_skills → .claude}/roles/azure/skills/09-storage-adls/README.md +264 -264
  63. package/{tech_hub_skills → .claude}/roles/azure/skills/10-networking/README.md +264 -264
  64. package/{tech_hub_skills → .claude}/roles/azure/skills/11-sql-cosmos/README.md +264 -264
  65. package/{tech_hub_skills → .claude}/roles/azure/skills/12-event-hubs/README.md +264 -264
  66. package/{tech_hub_skills → .claude}/roles/code-review/skills/01-automated-code-review/README.md +394 -394
  67. package/{tech_hub_skills → .claude}/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
  68. package/{tech_hub_skills → .claude}/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
  69. package/{tech_hub_skills → .claude}/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
  70. package/{tech_hub_skills → .claude}/roles/code-review/skills/05-review-analytics/README.md +540 -540
  71. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
  72. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -0
  73. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -0
  74. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
  75. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/03-data-quality/README.md +579 -579
  76. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
  77. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
  78. package/{tech_hub_skills → .claude}/roles/data-governance/skills/01-data-catalog/README.md +112 -112
  79. package/{tech_hub_skills → .claude}/roles/data-governance/skills/02-data-lineage/README.md +129 -129
  80. package/{tech_hub_skills → .claude}/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
  81. package/{tech_hub_skills → .claude}/roles/data-governance/skills/04-access-control/README.md +39 -39
  82. package/{tech_hub_skills → .claude}/roles/data-governance/skills/05-master-data-management/README.md +40 -40
  83. package/{tech_hub_skills → .claude}/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
  84. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
  85. package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -0
  86. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
  87. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
  88. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
  89. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
  90. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
  91. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/07-experimentation/README.md +264 -264
  92. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
  93. package/{tech_hub_skills → .claude}/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
  94. package/{tech_hub_skills → .claude}/roles/devops/skills/02-container-orchestration/README.md +264 -264
  95. package/{tech_hub_skills → .claude}/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
  96. package/{tech_hub_skills → .claude}/roles/devops/skills/04-gitops/README.md +264 -264
  97. package/{tech_hub_skills → .claude}/roles/devops/skills/05-environment-management/README.md +264 -264
  98. package/{tech_hub_skills → .claude}/roles/devops/skills/06-automated-testing/README.md +264 -264
  99. package/{tech_hub_skills → .claude}/roles/devops/skills/07-release-management/README.md +264 -264
  100. package/{tech_hub_skills → .claude}/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
  101. package/{tech_hub_skills → .claude}/roles/devops/skills/09-devsecops/README.md +265 -265
  102. package/{tech_hub_skills → .claude}/roles/finops/skills/01-cost-visibility/README.md +264 -264
  103. package/{tech_hub_skills → .claude}/roles/finops/skills/02-resource-tagging/README.md +264 -264
  104. package/{tech_hub_skills → .claude}/roles/finops/skills/03-budget-management/README.md +264 -264
  105. package/{tech_hub_skills → .claude}/roles/finops/skills/04-reserved-instances/README.md +264 -264
  106. package/{tech_hub_skills → .claude}/roles/finops/skills/05-spot-optimization/README.md +264 -264
  107. package/{tech_hub_skills → .claude}/roles/finops/skills/06-storage-tiering/README.md +264 -264
  108. package/{tech_hub_skills → .claude}/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
  109. package/{tech_hub_skills → .claude}/roles/finops/skills/08-chargeback/README.md +264 -264
  110. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
  111. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
  112. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/03-model-training/README.md +704 -704
  113. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
  114. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
  115. package/{tech_hub_skills → .claude}/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
  116. package/{tech_hub_skills → .claude}/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
  117. package/{tech_hub_skills → .claude}/roles/mlops/skills/03-model-registry/README.md +264 -264
  118. package/{tech_hub_skills → .claude}/roles/mlops/skills/04-feature-store/README.md +264 -264
  119. package/{tech_hub_skills → .claude}/roles/mlops/skills/05-model-deployment/README.md +264 -264
  120. package/{tech_hub_skills → .claude}/roles/mlops/skills/06-model-observability/README.md +264 -264
  121. package/{tech_hub_skills → .claude}/roles/mlops/skills/07-data-versioning/README.md +264 -264
  122. package/{tech_hub_skills → .claude}/roles/mlops/skills/08-ab-testing/README.md +264 -264
  123. package/{tech_hub_skills → .claude}/roles/mlops/skills/09-automated-retraining/README.md +264 -264
  124. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
  125. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
  126. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
  127. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
  128. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
  129. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
  130. package/{tech_hub_skills → .claude}/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
  131. package/{tech_hub_skills → .claude}/roles/product-designer/skills/02-user-research/README.md +382 -382
  132. package/{tech_hub_skills → .claude}/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
  133. package/{tech_hub_skills → .claude}/roles/product-designer/skills/04-ux-design/README.md +496 -496
  134. package/{tech_hub_skills → .claude}/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
  135. package/{tech_hub_skills → .claude}/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
  136. package/{tech_hub_skills → .claude}/roles/security-architect/skills/01-pii-detection/README.md +319 -319
  137. package/{tech_hub_skills → .claude}/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
  138. package/{tech_hub_skills → .claude}/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
  139. package/{tech_hub_skills → .claude}/roles/security-architect/skills/04-iam/README.md +264 -264
  140. package/{tech_hub_skills → .claude}/roles/security-architect/skills/05-application-security/README.md +264 -264
  141. package/{tech_hub_skills → .claude}/roles/security-architect/skills/06-secrets-management/README.md +264 -264
  142. package/{tech_hub_skills → .claude}/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
  143. package/{tech_hub_skills → .claude}/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
  144. package/{tech_hub_skills → .claude}/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
  145. package/{tech_hub_skills → .claude}/roles/system-design/skills/03-scalability/README.md +264 -264
  146. package/{tech_hub_skills → .claude}/roles/system-design/skills/04-high-availability/README.md +264 -264
  147. package/{tech_hub_skills → .claude}/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
  148. package/{tech_hub_skills → .claude}/roles/system-design/skills/06-api-design/README.md +264 -264
  149. package/{tech_hub_skills → .claude}/roles/system-design/skills/07-observability-architecture/README.md +264 -264
  150. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
  151. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/README.md +521 -521
  152. package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -0
  153. package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -0
  154. package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -0
  155. package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -0
  156. package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -0
  157. package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -0
  158. package/.claude/skills/README.md +336 -0
  159. package/.claude/skills/ai-engineer.md +104 -0
  160. package/.claude/skills/aws.md +143 -0
  161. package/.claude/skills/azure.md +149 -0
  162. package/.claude/skills/backend-developer.md +108 -0
  163. package/.claude/skills/code-review.md +399 -0
  164. package/.claude/skills/compliance-automation.md +747 -0
  165. package/.claude/skills/compliance-officer.md +108 -0
  166. package/.claude/skills/data-engineer.md +113 -0
  167. package/.claude/skills/data-governance.md +102 -0
  168. package/.claude/skills/data-scientist.md +123 -0
  169. package/.claude/skills/database-admin.md +109 -0
  170. package/.claude/skills/devops.md +160 -0
  171. package/.claude/skills/docker.md +160 -0
  172. package/.claude/skills/enterprise-dashboard.md +613 -0
  173. package/.claude/skills/finops.md +184 -0
  174. package/.claude/skills/frontend-developer.md +108 -0
  175. package/.claude/skills/gcp.md +143 -0
  176. package/.claude/skills/ml-engineer.md +115 -0
  177. package/.claude/skills/mlops.md +187 -0
  178. package/.claude/skills/network-engineer.md +109 -0
  179. package/.claude/skills/optimization-advisor.md +329 -0
  180. package/.claude/skills/orchestrator.md +623 -0
  181. package/.claude/skills/platform-engineer.md +102 -0
  182. package/.claude/skills/process-automation.md +226 -0
  183. package/.claude/skills/process-changelog.md +184 -0
  184. package/.claude/skills/process-documentation.md +484 -0
  185. package/.claude/skills/process-kanban.md +324 -0
  186. package/.claude/skills/process-versioning.md +214 -0
  187. package/.claude/skills/product-designer.md +104 -0
  188. package/.claude/skills/project-starter.md +443 -0
  189. package/.claude/skills/qa-engineer.md +109 -0
  190. package/.claude/skills/security-architect.md +135 -0
  191. package/.claude/skills/sre.md +109 -0
  192. package/.claude/skills/system-design.md +126 -0
  193. package/.claude/skills/technical-writer.md +101 -0
  194. package/.gitattributes +2 -0
  195. package/GITHUB_COPILOT.md +106 -0
  196. package/README.md +192 -291
  197. package/package.json +16 -46
  198. package/bin/cli.js +0 -241
@@ -1,112 +1,112 @@
1
- # dg-01: Data Catalog
2
-
3
- ## Overview
4
-
5
- Build enterprise data catalogs for asset discovery, metadata management, and data classification.
6
-
7
- ## Key Capabilities
8
-
9
- - **Asset Registration**: Automated discovery and registration of data assets
10
- - **Metadata Management**: Technical, business, and operational metadata
11
- - **Data Classification**: Automatic classification (PII, confidential, public)
12
- - **Search & Discovery**: Powerful search capabilities for data consumers
13
- - **Business Glossary**: Standardized business terminology
14
-
15
- ## Tools & Technologies
16
-
17
- - **Azure Purview**: Enterprise data catalog
18
- - **DataHub**: Open-source metadata platform
19
- - **Amundsen**: Lyft's data discovery platform
20
- - **Collibra**: Data governance platform
21
-
22
- ## Implementation
23
-
24
- ### 1. Asset Registration
25
-
26
- ```python
27
- # Automated asset registration
28
- from azure.purview.catalog import PurviewCatalogClient
29
-
30
- def register_data_asset(asset_name, asset_type, location):
31
- """Register data asset in catalog"""
32
- client = PurviewCatalogClient()
33
-
34
- asset = {
35
- "typeName": asset_type,
36
- "attributes": {
37
- "name": asset_name,
38
- "qualifiedName": f"{location}/{asset_name}",
39
- "location": location
40
- }
41
- }
42
-
43
- return client.entity.create_or_update(entity=asset)
44
- ```
45
-
46
- ### 2. Metadata Management
47
-
48
- ```python
49
- # Add business metadata
50
- def add_business_metadata(asset_id, owner, description, tags):
51
- """Enrich asset with business context"""
52
- metadata = {
53
- "businessOwner": owner,
54
- "description": description,
55
- "tags": tags,
56
- "certification": "certified"
57
- }
58
-
59
- return client.entity.add_business_metadata(
60
- guid=asset_id,
61
- business_metadata=metadata
62
- )
63
- ```
64
-
65
- ### 3. Data Classification
66
-
67
- ```python
68
- # Automatic classification
69
- def classify_data(asset_id):
70
- """Apply automatic classification based on content"""
71
- classifications = []
72
-
73
- # Scan for PII
74
- if contains_pii(asset_id):
75
- classifications.append("PII")
76
-
77
- # Scan for confidential data
78
- if contains_confidential(asset_id):
79
- classifications.append("Confidential")
80
-
81
- return client.entity.add_classifications(
82
- guid=asset_id,
83
- classifications=classifications
84
- )
85
- ```
86
-
87
- ## Best Practices
88
-
89
- 1. **Automate Discovery** - Use scanners to auto-discover assets
90
- 2. **Enrich Metadata** - Add business context, not just technical
91
- 3. **Clear Ownership** - Every asset needs a business owner
92
- 4. **Regular Updates** - Keep metadata fresh and relevant
93
- 5. **User Training** - Train users on search capabilities
94
-
95
- ## Cost Optimization
96
-
97
- - Use Azure Purview Standard tier for < 100k assets
98
- - Schedule scans during off-peak hours
99
- - Use incremental scans instead of full scans
100
- - Archive unused asset metadata
101
-
102
- ## Integration
103
-
104
- **Connects with:**
105
- - de-01 (Lakehouse): Catalog lakehouse tables
106
- - sa-01 (PII Detection): Auto-classify PII data
107
- - dg-02 (Lineage): Link to lineage tracking
108
- - dg-03 (Quality): Link quality scores
109
-
110
- ## Quick Win
111
-
112
- Start with top 10 critical datasets, manually catalog them with rich metadata, then expand automated discovery.
1
+ # dg-01: Data Catalog
2
+
3
+ ## Overview
4
+
5
+ Build enterprise data catalogs for asset discovery, metadata management, and data classification.
6
+
7
+ ## Key Capabilities
8
+
9
+ - **Asset Registration**: Automated discovery and registration of data assets
10
+ - **Metadata Management**: Technical, business, and operational metadata
11
+ - **Data Classification**: Automatic classification (PII, confidential, public)
12
+ - **Search & Discovery**: Powerful search capabilities for data consumers
13
+ - **Business Glossary**: Standardized business terminology
14
+
15
+ ## Tools & Technologies
16
+
17
+ - **Azure Purview**: Enterprise data catalog
18
+ - **DataHub**: Open-source metadata platform
19
+ - **Amundsen**: Lyft's data discovery platform
20
+ - **Collibra**: Data governance platform
21
+
22
+ ## Implementation
23
+
24
+ ### 1. Asset Registration
25
+
26
+ ```python
27
+ # Automated asset registration
28
+ from azure.purview.catalog import PurviewCatalogClient
29
+
30
+ def register_data_asset(asset_name, asset_type, location):
31
+ """Register data asset in catalog"""
32
+ client = PurviewCatalogClient()
33
+
34
+ asset = {
35
+ "typeName": asset_type,
36
+ "attributes": {
37
+ "name": asset_name,
38
+ "qualifiedName": f"{location}/{asset_name}",
39
+ "location": location
40
+ }
41
+ }
42
+
43
+ return client.entity.create_or_update(entity=asset)
44
+ ```
45
+
46
+ ### 2. Metadata Management
47
+
48
+ ```python
49
+ # Add business metadata
50
+ def add_business_metadata(asset_id, owner, description, tags):
51
+ """Enrich asset with business context"""
52
+ metadata = {
53
+ "businessOwner": owner,
54
+ "description": description,
55
+ "tags": tags,
56
+ "certification": "certified"
57
+ }
58
+
59
+ return client.entity.add_business_metadata(
60
+ guid=asset_id,
61
+ business_metadata=metadata
62
+ )
63
+ ```
64
+
65
+ ### 3. Data Classification
66
+
67
+ ```python
68
+ # Automatic classification
69
+ def classify_data(asset_id):
70
+ """Apply automatic classification based on content"""
71
+ classifications = []
72
+
73
+ # Scan for PII
74
+ if contains_pii(asset_id):
75
+ classifications.append("PII")
76
+
77
+ # Scan for confidential data
78
+ if contains_confidential(asset_id):
79
+ classifications.append("Confidential")
80
+
81
+ return client.entity.add_classifications(
82
+ guid=asset_id,
83
+ classifications=classifications
84
+ )
85
+ ```
86
+
87
+ ## Best Practices
88
+
89
+ 1. **Automate Discovery** - Use scanners to auto-discover assets
90
+ 2. **Enrich Metadata** - Add business context, not just technical
91
+ 3. **Clear Ownership** - Every asset needs a business owner
92
+ 4. **Regular Updates** - Keep metadata fresh and relevant
93
+ 5. **User Training** - Train users on search capabilities
94
+
95
+ ## Cost Optimization
96
+
97
+ - Use Azure Purview Standard tier for < 100k assets
98
+ - Schedule scans during off-peak hours
99
+ - Use incremental scans instead of full scans
100
+ - Archive unused asset metadata
101
+
102
+ ## Integration
103
+
104
+ **Connects with:**
105
+ - de-01 (Lakehouse): Catalog lakehouse tables
106
+ - sa-01 (PII Detection): Auto-classify PII data
107
+ - dg-02 (Lineage): Link to lineage tracking
108
+ - dg-03 (Quality): Link quality scores
109
+
110
+ ## Quick Win
111
+
112
+ Start with top 10 critical datasets, manually catalog them with rich metadata, then expand automated discovery.
@@ -1,129 +1,129 @@
1
- # dg-02: Data Lineage
2
-
3
- ## Overview
4
-
5
- Track end-to-end data lineage for impact analysis, root cause analysis, and regulatory compliance.
6
-
7
- ## Key Capabilities
8
-
9
- - **End-to-End Lineage**: From source to consumption
10
- - **Impact Analysis**: Understand downstream impacts
11
- - **Root Cause Analysis**: Trace issues to source
12
- - **Column-Level Lineage**: Field-level tracking
13
- - **Transformation Documentation**: Track data transformations
14
-
15
- ## Tools & Technologies
16
-
17
- - **Azure Purview**: Native lineage tracking
18
- - **OpenLineage**: Open standard for lineage
19
- - **Marquez**: Metadata service for lineage
20
- - **Spline**: Spark lineage tracking
21
-
22
- ## Implementation
23
-
24
- ### 1. Lineage Extraction
25
-
26
- ```python
27
- # Extract lineage from Spark jobs
28
- from spline import SplineAgent
29
-
30
- def track_spark_lineage(spark_session):
31
- """Enable lineage tracking for Spark"""
32
- spark_session.sparkContext.setLogLevel("INFO")
33
-
34
- # Initialize Spline agent
35
- SplineAgent.builder() \
36
- .appName("data-pipeline") \
37
- .mode("REQUIRED") \
38
- .url("http://spline-server:9090") \
39
- .build()
40
- ```
41
-
42
- ### 2. Column-Level Lineage
43
-
44
- ```sql
45
- -- Azure Purview automatically tracks column lineage
46
- -- Example transformation with lineage
47
- CREATE VIEW customer_360 AS
48
- SELECT
49
- c.customer_id,
50
- c.first_name || ' ' || c.last_name as full_name, -- Lineage: derived
51
- o.total_orders,
52
- p.total_payments
53
- FROM customers c
54
- LEFT JOIN order_summary o ON c.customer_id = o.customer_id
55
- LEFT JOIN payment_summary p ON c.customer_id = p.customer_id;
56
- ```
57
-
58
- ### 3. Impact Analysis
59
-
60
- ```python
61
- # Find downstream dependencies
62
- def get_downstream_impact(asset_id):
63
- """Find all downstream assets affected by changes"""
64
- lineage = client.lineage.get_lineage(
65
- guid=asset_id,
66
- direction="OUTPUT",
67
- depth=10
68
- )
69
-
70
- downstream_assets = []
71
- for entity in lineage['guidEntityMap'].values():
72
- downstream_assets.append({
73
- 'name': entity['attributes']['name'],
74
- 'type': entity['typeName'],
75
- 'owner': entity.get('attributes', {}).get('owner')
76
- })
77
-
78
- return downstream_assets
79
- ```
80
-
81
- ### 4. OpenLineage Integration
82
-
83
- ```python
84
- # Emit lineage events using OpenLineage
85
- from openlineage.client import OpenLineageClient
86
- from openlineage.client.run import RunEvent, RunState, Run, Job
87
-
88
- def emit_lineage_event(job_name, inputs, outputs):
89
- """Emit lineage event to OpenLineage"""
90
- client = OpenLineageClient(url="http://lineage-api:5000")
91
-
92
- event = RunEvent(
93
- eventType=RunState.COMPLETE,
94
- eventTime="2025-01-01T00:00:00Z",
95
- run=Run(runId=str(uuid.uuid4())),
96
- job=Job(namespace="production", name=job_name),
97
- inputs=inputs,
98
- outputs=outputs
99
- )
100
-
101
- client.emit(event)
102
- ```
103
-
104
- ## Best Practices
105
-
106
- 1. **Automate Collection** - Manual lineage doesn't scale
107
- 2. **Column-Level Tracking** - For sensitive data, track field-level
108
- 3. **Version Control** - Track lineage changes over time
109
- 4. **Clear Visualization** - Make lineage easy to understand
110
- 5. **Regular Validation** - Verify lineage accuracy
111
-
112
- ## Cost Optimization
113
-
114
- - Use incremental lineage updates
115
- - Archive old lineage data after retention period
116
- - Cache frequently accessed lineage queries
117
- - Use materialized views for complex lineage
118
-
119
- ## Integration
120
-
121
- **Connects with:**
122
- - de-02 (ETL): Track pipeline lineage
123
- - dg-01 (Catalog): Link assets to lineage
124
- - ml-02 (Feature Engineering): Track feature lineage
125
- - ai-02 (RAG): Track document lineage
126
-
127
- ## Quick Win
128
-
129
- Start with 1 critical data pipeline, manually document lineage, validate accuracy, then automate extraction.
1
+ # dg-02: Data Lineage
2
+
3
+ ## Overview
4
+
5
+ Track end-to-end data lineage for impact analysis, root cause analysis, and regulatory compliance.
6
+
7
+ ## Key Capabilities
8
+
9
+ - **End-to-End Lineage**: From source to consumption
10
+ - **Impact Analysis**: Understand downstream impacts
11
+ - **Root Cause Analysis**: Trace issues to source
12
+ - **Column-Level Lineage**: Field-level tracking
13
+ - **Transformation Documentation**: Track data transformations
14
+
15
+ ## Tools & Technologies
16
+
17
+ - **Azure Purview**: Native lineage tracking
18
+ - **OpenLineage**: Open standard for lineage
19
+ - **Marquez**: Metadata service for lineage
20
+ - **Spline**: Spark lineage tracking
21
+
22
+ ## Implementation
23
+
24
+ ### 1. Lineage Extraction
25
+
26
+ ```python
27
+ # Extract lineage from Spark jobs
28
+ from spline import SplineAgent
29
+
30
+ def track_spark_lineage(spark_session):
31
+ """Enable lineage tracking for Spark"""
32
+ spark_session.sparkContext.setLogLevel("INFO")
33
+
34
+ # Initialize Spline agent
35
+ SplineAgent.builder() \
36
+ .appName("data-pipeline") \
37
+ .mode("REQUIRED") \
38
+ .url("http://spline-server:9090") \
39
+ .build()
40
+ ```
41
+
42
+ ### 2. Column-Level Lineage
43
+
44
+ ```sql
45
+ -- Azure Purview automatically tracks column lineage
46
+ -- Example transformation with lineage
47
+ CREATE VIEW customer_360 AS
48
+ SELECT
49
+ c.customer_id,
50
+ c.first_name || ' ' || c.last_name as full_name, -- Lineage: derived
51
+ o.total_orders,
52
+ p.total_payments
53
+ FROM customers c
54
+ LEFT JOIN order_summary o ON c.customer_id = o.customer_id
55
+ LEFT JOIN payment_summary p ON c.customer_id = p.customer_id;
56
+ ```
57
+
58
+ ### 3. Impact Analysis
59
+
60
+ ```python
61
+ # Find downstream dependencies
62
+ def get_downstream_impact(asset_id):
63
+ """Find all downstream assets affected by changes"""
64
+ lineage = client.lineage.get_lineage(
65
+ guid=asset_id,
66
+ direction="OUTPUT",
67
+ depth=10
68
+ )
69
+
70
+ downstream_assets = []
71
+ for entity in lineage['guidEntityMap'].values():
72
+ downstream_assets.append({
73
+ 'name': entity['attributes']['name'],
74
+ 'type': entity['typeName'],
75
+ 'owner': entity.get('attributes', {}).get('owner')
76
+ })
77
+
78
+ return downstream_assets
79
+ ```
80
+
81
+ ### 4. OpenLineage Integration
82
+
83
+ ```python
84
+ # Emit lineage events using OpenLineage
85
+ from openlineage.client import OpenLineageClient
86
+ from openlineage.client.run import RunEvent, RunState, Run, Job
87
+
88
+ def emit_lineage_event(job_name, inputs, outputs):
89
+ """Emit lineage event to OpenLineage"""
90
+ client = OpenLineageClient(url="http://lineage-api:5000")
91
+
92
+ event = RunEvent(
93
+ eventType=RunState.COMPLETE,
94
+ eventTime="2025-01-01T00:00:00Z",
95
+ run=Run(runId=str(uuid.uuid4())),
96
+ job=Job(namespace="production", name=job_name),
97
+ inputs=inputs,
98
+ outputs=outputs
99
+ )
100
+
101
+ client.emit(event)
102
+ ```
103
+
104
+ ## Best Practices
105
+
106
+ 1. **Automate Collection** - Manual lineage doesn't scale
107
+ 2. **Column-Level Tracking** - For sensitive data, track field-level
108
+ 3. **Version Control** - Track lineage changes over time
109
+ 4. **Clear Visualization** - Make lineage easy to understand
110
+ 5. **Regular Validation** - Verify lineage accuracy
111
+
112
+ ## Cost Optimization
113
+
114
+ - Use incremental lineage updates
115
+ - Archive old lineage data after retention period
116
+ - Cache frequently accessed lineage queries
117
+ - Use materialized views for complex lineage
118
+
119
+ ## Integration
120
+
121
+ **Connects with:**
122
+ - de-02 (ETL): Track pipeline lineage
123
+ - dg-01 (Catalog): Link assets to lineage
124
+ - ml-02 (Feature Engineering): Track feature lineage
125
+ - ai-02 (RAG): Track document lineage
126
+
127
+ ## Quick Win
128
+
129
+ Start with 1 critical data pipeline, manually document lineage, validate accuracy, then automate extraction.