tech-hub-skills 1.2.0 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. package/{LICENSE → .claude/LICENSE} +21 -21
  2. package/.claude/README.md +291 -0
  3. package/.claude/bin/cli.js +266 -0
  4. package/{bin → .claude/bin}/copilot.js +182 -182
  5. package/{bin → .claude/bin}/postinstall.js +42 -42
  6. package/{tech_hub_skills/skills → .claude/commands}/README.md +336 -336
  7. package/{tech_hub_skills/skills → .claude/commands}/ai-engineer.md +104 -104
  8. package/{tech_hub_skills/skills → .claude/commands}/aws.md +143 -143
  9. package/{tech_hub_skills/skills → .claude/commands}/azure.md +149 -149
  10. package/{tech_hub_skills/skills → .claude/commands}/backend-developer.md +108 -108
  11. package/{tech_hub_skills/skills → .claude/commands}/code-review.md +399 -399
  12. package/{tech_hub_skills/skills → .claude/commands}/compliance-automation.md +747 -747
  13. package/{tech_hub_skills/skills → .claude/commands}/compliance-officer.md +108 -108
  14. package/{tech_hub_skills/skills → .claude/commands}/data-engineer.md +113 -113
  15. package/{tech_hub_skills/skills → .claude/commands}/data-governance.md +102 -102
  16. package/{tech_hub_skills/skills → .claude/commands}/data-scientist.md +123 -123
  17. package/{tech_hub_skills/skills → .claude/commands}/database-admin.md +109 -109
  18. package/{tech_hub_skills/skills → .claude/commands}/devops.md +160 -160
  19. package/{tech_hub_skills/skills → .claude/commands}/docker.md +160 -160
  20. package/{tech_hub_skills/skills → .claude/commands}/enterprise-dashboard.md +613 -613
  21. package/{tech_hub_skills/skills → .claude/commands}/finops.md +184 -184
  22. package/{tech_hub_skills/skills → .claude/commands}/frontend-developer.md +108 -108
  23. package/{tech_hub_skills/skills → .claude/commands}/gcp.md +143 -143
  24. package/{tech_hub_skills/skills → .claude/commands}/ml-engineer.md +115 -115
  25. package/{tech_hub_skills/skills → .claude/commands}/mlops.md +187 -187
  26. package/{tech_hub_skills/skills → .claude/commands}/network-engineer.md +109 -109
  27. package/{tech_hub_skills/skills → .claude/commands}/optimization-advisor.md +329 -329
  28. package/{tech_hub_skills/skills → .claude/commands}/orchestrator.md +623 -623
  29. package/{tech_hub_skills/skills → .claude/commands}/platform-engineer.md +102 -102
  30. package/{tech_hub_skills/skills → .claude/commands}/process-automation.md +226 -226
  31. package/{tech_hub_skills/skills → .claude/commands}/process-changelog.md +184 -184
  32. package/{tech_hub_skills/skills → .claude/commands}/process-documentation.md +484 -484
  33. package/{tech_hub_skills/skills → .claude/commands}/process-kanban.md +324 -324
  34. package/{tech_hub_skills/skills → .claude/commands}/process-versioning.md +214 -214
  35. package/{tech_hub_skills/skills → .claude/commands}/product-designer.md +104 -104
  36. package/{tech_hub_skills/skills → .claude/commands}/project-starter.md +443 -443
  37. package/{tech_hub_skills/skills → .claude/commands}/qa-engineer.md +109 -109
  38. package/{tech_hub_skills/skills → .claude/commands}/security-architect.md +135 -135
  39. package/{tech_hub_skills/skills → .claude/commands}/sre.md +109 -109
  40. package/{tech_hub_skills/skills → .claude/commands}/system-design.md +126 -126
  41. package/{tech_hub_skills/skills → .claude/commands}/technical-writer.md +101 -101
  42. package/.claude/package.json +46 -0
  43. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
  44. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -0
  45. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -0
  46. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -0
  47. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
  48. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -0
  49. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -0
  50. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
  51. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
  52. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
  53. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
  54. package/{tech_hub_skills → .claude}/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
  55. package/{tech_hub_skills → .claude}/roles/azure/skills/02-data-factory/README.md +264 -264
  56. package/{tech_hub_skills → .claude}/roles/azure/skills/03-synapse-analytics/README.md +264 -264
  57. package/{tech_hub_skills → .claude}/roles/azure/skills/04-databricks/README.md +264 -264
  58. package/{tech_hub_skills → .claude}/roles/azure/skills/05-functions/README.md +264 -264
  59. package/{tech_hub_skills → .claude}/roles/azure/skills/06-kubernetes-service/README.md +264 -264
  60. package/{tech_hub_skills → .claude}/roles/azure/skills/07-openai-service/README.md +264 -264
  61. package/{tech_hub_skills → .claude}/roles/azure/skills/08-machine-learning/README.md +264 -264
  62. package/{tech_hub_skills → .claude}/roles/azure/skills/09-storage-adls/README.md +264 -264
  63. package/{tech_hub_skills → .claude}/roles/azure/skills/10-networking/README.md +264 -264
  64. package/{tech_hub_skills → .claude}/roles/azure/skills/11-sql-cosmos/README.md +264 -264
  65. package/{tech_hub_skills → .claude}/roles/azure/skills/12-event-hubs/README.md +264 -264
  66. package/{tech_hub_skills → .claude}/roles/code-review/skills/01-automated-code-review/README.md +394 -394
  67. package/{tech_hub_skills → .claude}/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
  68. package/{tech_hub_skills → .claude}/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
  69. package/{tech_hub_skills → .claude}/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
  70. package/{tech_hub_skills → .claude}/roles/code-review/skills/05-review-analytics/README.md +540 -540
  71. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
  72. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -0
  73. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -0
  74. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
  75. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/03-data-quality/README.md +579 -579
  76. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
  77. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
  78. package/{tech_hub_skills → .claude}/roles/data-governance/skills/01-data-catalog/README.md +112 -112
  79. package/{tech_hub_skills → .claude}/roles/data-governance/skills/02-data-lineage/README.md +129 -129
  80. package/{tech_hub_skills → .claude}/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
  81. package/{tech_hub_skills → .claude}/roles/data-governance/skills/04-access-control/README.md +39 -39
  82. package/{tech_hub_skills → .claude}/roles/data-governance/skills/05-master-data-management/README.md +40 -40
  83. package/{tech_hub_skills → .claude}/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
  84. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
  85. package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -0
  86. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
  87. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
  88. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
  89. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
  90. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
  91. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/07-experimentation/README.md +264 -264
  92. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
  93. package/{tech_hub_skills → .claude}/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
  94. package/{tech_hub_skills → .claude}/roles/devops/skills/02-container-orchestration/README.md +264 -264
  95. package/{tech_hub_skills → .claude}/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
  96. package/{tech_hub_skills → .claude}/roles/devops/skills/04-gitops/README.md +264 -264
  97. package/{tech_hub_skills → .claude}/roles/devops/skills/05-environment-management/README.md +264 -264
  98. package/{tech_hub_skills → .claude}/roles/devops/skills/06-automated-testing/README.md +264 -264
  99. package/{tech_hub_skills → .claude}/roles/devops/skills/07-release-management/README.md +264 -264
  100. package/{tech_hub_skills → .claude}/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
  101. package/{tech_hub_skills → .claude}/roles/devops/skills/09-devsecops/README.md +265 -265
  102. package/{tech_hub_skills → .claude}/roles/finops/skills/01-cost-visibility/README.md +264 -264
  103. package/{tech_hub_skills → .claude}/roles/finops/skills/02-resource-tagging/README.md +264 -264
  104. package/{tech_hub_skills → .claude}/roles/finops/skills/03-budget-management/README.md +264 -264
  105. package/{tech_hub_skills → .claude}/roles/finops/skills/04-reserved-instances/README.md +264 -264
  106. package/{tech_hub_skills → .claude}/roles/finops/skills/05-spot-optimization/README.md +264 -264
  107. package/{tech_hub_skills → .claude}/roles/finops/skills/06-storage-tiering/README.md +264 -264
  108. package/{tech_hub_skills → .claude}/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
  109. package/{tech_hub_skills → .claude}/roles/finops/skills/08-chargeback/README.md +264 -264
  110. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
  111. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
  112. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/03-model-training/README.md +704 -704
  113. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
  114. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
  115. package/{tech_hub_skills → .claude}/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
  116. package/{tech_hub_skills → .claude}/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
  117. package/{tech_hub_skills → .claude}/roles/mlops/skills/03-model-registry/README.md +264 -264
  118. package/{tech_hub_skills → .claude}/roles/mlops/skills/04-feature-store/README.md +264 -264
  119. package/{tech_hub_skills → .claude}/roles/mlops/skills/05-model-deployment/README.md +264 -264
  120. package/{tech_hub_skills → .claude}/roles/mlops/skills/06-model-observability/README.md +264 -264
  121. package/{tech_hub_skills → .claude}/roles/mlops/skills/07-data-versioning/README.md +264 -264
  122. package/{tech_hub_skills → .claude}/roles/mlops/skills/08-ab-testing/README.md +264 -264
  123. package/{tech_hub_skills → .claude}/roles/mlops/skills/09-automated-retraining/README.md +264 -264
  124. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
  125. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
  126. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
  127. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
  128. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
  129. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
  130. package/{tech_hub_skills → .claude}/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
  131. package/{tech_hub_skills → .claude}/roles/product-designer/skills/02-user-research/README.md +382 -382
  132. package/{tech_hub_skills → .claude}/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
  133. package/{tech_hub_skills → .claude}/roles/product-designer/skills/04-ux-design/README.md +496 -496
  134. package/{tech_hub_skills → .claude}/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
  135. package/{tech_hub_skills → .claude}/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
  136. package/{tech_hub_skills → .claude}/roles/security-architect/skills/01-pii-detection/README.md +319 -319
  137. package/{tech_hub_skills → .claude}/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
  138. package/{tech_hub_skills → .claude}/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
  139. package/{tech_hub_skills → .claude}/roles/security-architect/skills/04-iam/README.md +264 -264
  140. package/{tech_hub_skills → .claude}/roles/security-architect/skills/05-application-security/README.md +264 -264
  141. package/{tech_hub_skills → .claude}/roles/security-architect/skills/06-secrets-management/README.md +264 -264
  142. package/{tech_hub_skills → .claude}/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
  143. package/{tech_hub_skills → .claude}/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
  144. package/{tech_hub_skills → .claude}/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
  145. package/{tech_hub_skills → .claude}/roles/system-design/skills/03-scalability/README.md +264 -264
  146. package/{tech_hub_skills → .claude}/roles/system-design/skills/04-high-availability/README.md +264 -264
  147. package/{tech_hub_skills → .claude}/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
  148. package/{tech_hub_skills → .claude}/roles/system-design/skills/06-api-design/README.md +264 -264
  149. package/{tech_hub_skills → .claude}/roles/system-design/skills/07-observability-architecture/README.md +264 -264
  150. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
  151. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/README.md +521 -521
  152. package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -0
  153. package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -0
  154. package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -0
  155. package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -0
  156. package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -0
  157. package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -0
  158. package/.claude/skills/README.md +336 -0
  159. package/.claude/skills/ai-engineer.md +104 -0
  160. package/.claude/skills/aws.md +143 -0
  161. package/.claude/skills/azure.md +149 -0
  162. package/.claude/skills/backend-developer.md +108 -0
  163. package/.claude/skills/code-review.md +399 -0
  164. package/.claude/skills/compliance-automation.md +747 -0
  165. package/.claude/skills/compliance-officer.md +108 -0
  166. package/.claude/skills/data-engineer.md +113 -0
  167. package/.claude/skills/data-governance.md +102 -0
  168. package/.claude/skills/data-scientist.md +123 -0
  169. package/.claude/skills/database-admin.md +109 -0
  170. package/.claude/skills/devops.md +160 -0
  171. package/.claude/skills/docker.md +160 -0
  172. package/.claude/skills/enterprise-dashboard.md +613 -0
  173. package/.claude/skills/finops.md +184 -0
  174. package/.claude/skills/frontend-developer.md +108 -0
  175. package/.claude/skills/gcp.md +143 -0
  176. package/.claude/skills/ml-engineer.md +115 -0
  177. package/.claude/skills/mlops.md +187 -0
  178. package/.claude/skills/network-engineer.md +109 -0
  179. package/.claude/skills/optimization-advisor.md +329 -0
  180. package/.claude/skills/orchestrator.md +623 -0
  181. package/.claude/skills/platform-engineer.md +102 -0
  182. package/.claude/skills/process-automation.md +226 -0
  183. package/.claude/skills/process-changelog.md +184 -0
  184. package/.claude/skills/process-documentation.md +484 -0
  185. package/.claude/skills/process-kanban.md +324 -0
  186. package/.claude/skills/process-versioning.md +214 -0
  187. package/.claude/skills/product-designer.md +104 -0
  188. package/.claude/skills/project-starter.md +443 -0
  189. package/.claude/skills/qa-engineer.md +109 -0
  190. package/.claude/skills/security-architect.md +135 -0
  191. package/.claude/skills/sre.md +109 -0
  192. package/.claude/skills/system-design.md +126 -0
  193. package/.claude/skills/technical-writer.md +101 -0
  194. package/.gitattributes +2 -0
  195. package/GITHUB_COPILOT.md +106 -0
  196. package/README.md +192 -291
  197. package/package.json +16 -46
  198. package/bin/cli.js +0 -241
@@ -1,182 +1,182 @@
1
- # dg-03: Data Quality Framework
2
-
3
- ## Overview
4
-
5
- Implement automated data quality validation, scoring, monitoring, and issue remediation workflows.
6
-
7
- ## Key Capabilities
8
-
9
- - **Quality Rules Definition**: Completeness, accuracy, consistency
10
- - **Automated Validation**: Real-time quality checks
11
- - **Quality Scoring**: Quantifiable quality metrics
12
- - **Quality Monitoring**: Continuous quality tracking
13
- - **Issue Remediation**: Workflows for quality issues
14
-
15
- ## Tools & Technologies
16
-
17
- - **Great Expectations**: Python data validation
18
- - **Soda**: Data quality as code
19
- - **dbt tests**: Quality tests in dbt
20
- - **Azure Data Quality**: Native Azure solution
21
-
22
- ## Implementation
23
-
24
- ### 1. Quality Rules with Great Expectations
25
-
26
- ```python
27
- # Define quality expectations
28
- import great_expectations as gx
29
-
30
- def create_quality_suite(context, table_name):
31
- """Create data quality test suite"""
32
- suite = context.add_expectation_suite(
33
- expectation_suite_name=f"{table_name}_quality_suite"
34
- )
35
-
36
- validator = context.get_validator(
37
- batch_request=batch_request,
38
- expectation_suite_name=suite.expectation_suite_name
39
- )
40
-
41
- # Completeness checks
42
- validator.expect_column_values_to_not_be_null(column="customer_id")
43
- validator.expect_column_values_to_not_be_null(column="order_date")
44
-
45
- # Accuracy checks
46
- validator.expect_column_values_to_be_between(
47
- column="age",
48
- min_value=0,
49
- max_value=120
50
- )
51
-
52
- # Consistency checks
53
- validator.expect_column_values_to_match_regex(
54
- column="email",
55
- regex=r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
56
- )
57
-
58
- validator.save_expectation_suite()
59
- return validator
60
- ```
61
-
62
- ### 2. Quality Scoring
63
-
64
- ```python
65
- # Calculate quality score
66
- def calculate_quality_score(validation_results):
67
- """Calculate overall quality score"""
68
- total_checks = validation_results.statistics['evaluated_expectations']
69
- successful_checks = validation_results.statistics['successful_expectations']
70
-
71
- score = (successful_checks / total_checks) * 100
72
-
73
- # Categorize quality
74
- if score >= 95:
75
- quality_level = "Excellent"
76
- elif score >= 85:
77
- quality_level = "Good"
78
- elif score >= 70:
79
- quality_level = "Acceptable"
80
- else:
81
- quality_level = "Poor"
82
-
83
- return {
84
- 'score': score,
85
- 'level': quality_level,
86
- 'total_checks': total_checks,
87
- 'passed_checks': successful_checks
88
- }
89
- ```
90
-
91
- ### 3. Automated Monitoring
92
-
93
- ```python
94
- # Set up quality monitoring
95
- def setup_quality_monitoring(checkpoint_name):
96
- """Configure automated quality monitoring"""
97
- checkpoint_config = {
98
- "name": checkpoint_name,
99
- "config_version": 1.0,
100
- "template_name": "default",
101
- "run_name_template": "%Y%m%d-%H%M%S",
102
- "validations": [
103
- {
104
- "batch_request": {
105
- "datasource_name": "production_data",
106
- "data_connector_name": "default_inferred_data_connector_name",
107
- "data_asset_name": "customers"
108
- },
109
- "expectation_suite_name": "customers_quality_suite"
110
- }
111
- ],
112
- "action_list": [
113
- {
114
- "name": "store_validation_result",
115
- "action": {"class_name": "StoreValidationResultAction"}
116
- },
117
- {
118
- "name": "send_slack_notification",
119
- "action": {
120
- "class_name": "SlackNotificationAction",
121
- "slack_webhook": "${SLACK_WEBHOOK}",
122
- "notify_on": "failure"
123
- }
124
- }
125
- ]
126
- }
127
-
128
- context.add_checkpoint(**checkpoint_config)
129
- ```
130
-
131
- ### 4. Issue Remediation Workflow
132
-
133
- ```python
134
- # Create remediation workflow
135
- def create_remediation_workflow(quality_issues):
136
- """Create tickets for quality issues"""
137
- from azure.devops import AzureDevOpsClient
138
-
139
- client = AzureDevOpsClient()
140
-
141
- for issue in quality_issues:
142
- work_item = {
143
- 'title': f"Data Quality Issue: {issue['column']}",
144
- 'description': issue['description'],
145
- 'priority': issue['severity'],
146
- 'assigned_to': issue['data_owner'],
147
- 'tags': ['data-quality', issue['table']]
148
- }
149
-
150
- client.create_work_item(
151
- project='DataGovernance',
152
- work_item_type='Bug',
153
- fields=work_item
154
- )
155
- ```
156
-
157
- ## Best Practices
158
-
159
- 1. **Start Simple** - Begin with critical fields, expand coverage
160
- 2. **Automate Everything** - Manual checks don't scale
161
- 3. **Clear Ownership** - Assign quality issues to data owners
162
- 4. **Threshold Alerts** - Alert on quality score drops
163
- 5. **Historical Tracking** - Monitor quality trends over time
164
-
165
- ## Cost Optimization
166
-
167
- - Run quality checks incrementally (only new/changed data)
168
- - Use sampling for large datasets
169
- - Cache validation results
170
- - Right-size validation compute
171
-
172
- ## Integration
173
-
174
- **Connects with:**
175
- - de-01 (Lakehouse): Validate lakehouse data
176
- - de-03 (Data Quality): Engineering quality checks
177
- - dg-01 (Catalog): Link quality scores to assets
178
- - dg-02 (Lineage): Trace quality issues to source
179
-
180
- ## Quick Win
181
-
182
- Implement completeness checks on 5 critical fields in your most important table. Show before/after quality scores.
1
+ # dg-03: Data Quality Framework
2
+
3
+ ## Overview
4
+
5
+ Implement automated data quality validation, scoring, monitoring, and issue remediation workflows.
6
+
7
+ ## Key Capabilities
8
+
9
+ - **Quality Rules Definition**: Completeness, accuracy, consistency
10
+ - **Automated Validation**: Real-time quality checks
11
+ - **Quality Scoring**: Quantifiable quality metrics
12
+ - **Quality Monitoring**: Continuous quality tracking
13
+ - **Issue Remediation**: Workflows for quality issues
14
+
15
+ ## Tools & Technologies
16
+
17
+ - **Great Expectations**: Python data validation
18
+ - **Soda**: Data quality as code
19
+ - **dbt tests**: Quality tests in dbt
20
+ - **Azure Data Quality**: Native Azure solution
21
+
22
+ ## Implementation
23
+
24
+ ### 1. Quality Rules with Great Expectations
25
+
26
+ ```python
27
+ # Define quality expectations
28
+ import great_expectations as gx
29
+
30
+ def create_quality_suite(context, table_name):
31
+ """Create data quality test suite"""
32
+ suite = context.add_expectation_suite(
33
+ expectation_suite_name=f"{table_name}_quality_suite"
34
+ )
35
+
36
+ validator = context.get_validator(
37
+ batch_request=batch_request,
38
+ expectation_suite_name=suite.expectation_suite_name
39
+ )
40
+
41
+ # Completeness checks
42
+ validator.expect_column_values_to_not_be_null(column="customer_id")
43
+ validator.expect_column_values_to_not_be_null(column="order_date")
44
+
45
+ # Accuracy checks
46
+ validator.expect_column_values_to_be_between(
47
+ column="age",
48
+ min_value=0,
49
+ max_value=120
50
+ )
51
+
52
+ # Consistency checks
53
+ validator.expect_column_values_to_match_regex(
54
+ column="email",
55
+ regex=r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
56
+ )
57
+
58
+ validator.save_expectation_suite()
59
+ return validator
60
+ ```
61
+
62
+ ### 2. Quality Scoring
63
+
64
+ ```python
65
+ # Calculate quality score
66
+ def calculate_quality_score(validation_results):
67
+ """Calculate overall quality score"""
68
+ total_checks = validation_results.statistics['evaluated_expectations']
69
+ successful_checks = validation_results.statistics['successful_expectations']
70
+
71
+ score = (successful_checks / total_checks) * 100
72
+
73
+ # Categorize quality
74
+ if score >= 95:
75
+ quality_level = "Excellent"
76
+ elif score >= 85:
77
+ quality_level = "Good"
78
+ elif score >= 70:
79
+ quality_level = "Acceptable"
80
+ else:
81
+ quality_level = "Poor"
82
+
83
+ return {
84
+ 'score': score,
85
+ 'level': quality_level,
86
+ 'total_checks': total_checks,
87
+ 'passed_checks': successful_checks
88
+ }
89
+ ```
90
+
91
+ ### 3. Automated Monitoring
92
+
93
+ ```python
94
+ # Set up quality monitoring
95
+ def setup_quality_monitoring(checkpoint_name):
96
+ """Configure automated quality monitoring"""
97
+ checkpoint_config = {
98
+ "name": checkpoint_name,
99
+ "config_version": 1.0,
100
+ "template_name": "default",
101
+ "run_name_template": "%Y%m%d-%H%M%S",
102
+ "validations": [
103
+ {
104
+ "batch_request": {
105
+ "datasource_name": "production_data",
106
+ "data_connector_name": "default_inferred_data_connector_name",
107
+ "data_asset_name": "customers"
108
+ },
109
+ "expectation_suite_name": "customers_quality_suite"
110
+ }
111
+ ],
112
+ "action_list": [
113
+ {
114
+ "name": "store_validation_result",
115
+ "action": {"class_name": "StoreValidationResultAction"}
116
+ },
117
+ {
118
+ "name": "send_slack_notification",
119
+ "action": {
120
+ "class_name": "SlackNotificationAction",
121
+ "slack_webhook": "${SLACK_WEBHOOK}",
122
+ "notify_on": "failure"
123
+ }
124
+ }
125
+ ]
126
+ }
127
+
128
+ context.add_checkpoint(**checkpoint_config)
129
+ ```
130
+
131
+ ### 4. Issue Remediation Workflow
132
+
133
+ ```python
134
+ # Create remediation workflow
135
+ def create_remediation_workflow(quality_issues):
136
+ """Create tickets for quality issues"""
137
+ from azure.devops import AzureDevOpsClient
138
+
139
+ client = AzureDevOpsClient()
140
+
141
+ for issue in quality_issues:
142
+ work_item = {
143
+ 'title': f"Data Quality Issue: {issue['column']}",
144
+ 'description': issue['description'],
145
+ 'priority': issue['severity'],
146
+ 'assigned_to': issue['data_owner'],
147
+ 'tags': ['data-quality', issue['table']]
148
+ }
149
+
150
+ client.create_work_item(
151
+ project='DataGovernance',
152
+ work_item_type='Bug',
153
+ fields=work_item
154
+ )
155
+ ```
156
+
157
+ ## Best Practices
158
+
159
+ 1. **Start Simple** - Begin with critical fields, expand coverage
160
+ 2. **Automate Everything** - Manual checks don't scale
161
+ 3. **Clear Ownership** - Assign quality issues to data owners
162
+ 4. **Threshold Alerts** - Alert on quality score drops
163
+ 5. **Historical Tracking** - Monitor quality trends over time
164
+
165
+ ## Cost Optimization
166
+
167
+ - Run quality checks incrementally (only new/changed data)
168
+ - Use sampling for large datasets
169
+ - Cache validation results
170
+ - Right-size validation compute
171
+
172
+ ## Integration
173
+
174
+ **Connects with:**
175
+ - de-01 (Lakehouse): Validate lakehouse data
176
+ - de-03 (Data Quality): Engineering quality checks
177
+ - dg-01 (Catalog): Link quality scores to assets
178
+ - dg-02 (Lineage): Trace quality issues to source
179
+
180
+ ## Quick Win
181
+
182
+ Implement completeness checks on 5 critical fields in your most important table. Show before/after quality scores.
@@ -1,39 +1,39 @@
1
- # dg-04: Access Control & Policies
2
-
3
- ## Overview
4
-
5
- Implement role-based access control, column/row-level security, dynamic data masking, and access audit logging.
6
-
7
- ## Key Capabilities
8
-
9
- - **RBAC**: Role-based access control
10
- - **Column-Level Security**: Restrict sensitive columns
11
- - **Row-Level Security**: Filter data by user context
12
- - **Dynamic Data Masking**: Auto-mask sensitive data
13
- - **Access Audit Logging**: Track all data access
14
-
15
- ## Implementation
16
-
17
- ```sql
18
- -- Column-level security
19
- CREATE VIEW customer_secure AS
20
- SELECT
21
- customer_id,
22
- CASE
23
- WHEN CURRENT_USER() IN (SELECT user FROM admin_users)
24
- THEN email -- Show full email to admins
25
- ELSE CONCAT(LEFT(email, 3), '***@', SPLIT_PART(email, '@', 2)) -- Mask for others
26
- END as email,
27
- first_name,
28
- last_name
29
- FROM customers;
30
-
31
- -- Row-level security
32
- CREATE POLICY customer_region_policy ON customers
33
- FOR SELECT
34
- USING (region = current_setting('app.user_region'));
35
- ```
36
-
37
- ## Integration
38
-
39
- **Connects with:** sa-01 (PII Detection), sa-04 (IAM), dg-01 (Catalog)
1
+ # dg-04: Access Control & Policies
2
+
3
+ ## Overview
4
+
5
+ Implement role-based access control, column/row-level security, dynamic data masking, and access audit logging.
6
+
7
+ ## Key Capabilities
8
+
9
+ - **RBAC**: Role-based access control
10
+ - **Column-Level Security**: Restrict sensitive columns
11
+ - **Row-Level Security**: Filter data by user context
12
+ - **Dynamic Data Masking**: Auto-mask sensitive data
13
+ - **Access Audit Logging**: Track all data access
14
+
15
+ ## Implementation
16
+
17
+ ```sql
18
+ -- Column-level security
19
+ CREATE VIEW customer_secure AS
20
+ SELECT
21
+ customer_id,
22
+ CASE
23
+ WHEN CURRENT_USER() IN (SELECT user FROM admin_users)
24
+ THEN email -- Show full email to admins
25
+ ELSE CONCAT(LEFT(email, 3), '***@', SPLIT_PART(email, '@', 2)) -- Mask for others
26
+ END as email,
27
+ first_name,
28
+ last_name
29
+ FROM customers;
30
+
31
+ -- Row-level security
32
+ CREATE POLICY customer_region_policy ON customers
33
+ FOR SELECT
34
+ USING (region = current_setting('app.user_region'));
35
+ ```
36
+
37
+ ## Integration
38
+
39
+ **Connects with:** sa-01 (PII Detection), sa-04 (IAM), dg-01 (Catalog)
@@ -1,40 +1,40 @@
1
- # dg-05: Master Data Management
2
-
3
- ## Overview
4
-
5
- Entity resolution, golden record creation, data stewardship, and hierarchy management for critical business entities.
6
-
7
- ## Key Capabilities
8
-
9
- - **Entity Resolution**: Match and merge duplicate entities
10
- - **Golden Record**: Single source of truth
11
- - **Data Stewardship**: Workflows for data quality
12
- - **Cross-Reference**: Link entities across systems
13
- - **Hierarchy Management**: Organizational structures
14
-
15
- ## Implementation
16
-
17
- ```python
18
- # Entity resolution
19
- from recordlinkage import Index, Compare
20
-
21
- def match_customers(df1, df2):
22
- """Match customer records across systems"""
23
- indexer = Index()
24
- indexer.block('last_name')
25
- candidate_pairs = indexer.index(df1, df2)
26
-
27
- compare = Compare()
28
- compare.exact('first_name', 'first_name')
29
- compare.string('email', 'email', method='jarowinkler', threshold=0.85)
30
- compare.numeric('age', 'age', method='linear', offset=2)
31
-
32
- features = compare.compute(candidate_pairs, df1, df2)
33
- matches = features[features.sum(axis=1) > 2.5]
34
-
35
- return matches
36
- ```
37
-
38
- ## Integration
39
-
40
- **Connects with:** dg-01 (Catalog), dg-03 (Quality), de-02 (ETL)
1
+ # dg-05: Master Data Management
2
+
3
+ ## Overview
4
+
5
+ Entity resolution, golden record creation, data stewardship, and hierarchy management for critical business entities.
6
+
7
+ ## Key Capabilities
8
+
9
+ - **Entity Resolution**: Match and merge duplicate entities
10
+ - **Golden Record**: Single source of truth
11
+ - **Data Stewardship**: Workflows for data quality
12
+ - **Cross-Reference**: Link entities across systems
13
+ - **Hierarchy Management**: Organizational structures
14
+
15
+ ## Implementation
16
+
17
+ ```python
18
+ # Entity resolution
19
+ from recordlinkage import Index, Compare
20
+
21
+ def match_customers(df1, df2):
22
+ """Match customer records across systems"""
23
+ indexer = Index()
24
+ indexer.block('last_name')
25
+ candidate_pairs = indexer.index(df1, df2)
26
+
27
+ compare = Compare()
28
+ compare.exact('first_name', 'first_name')
29
+ compare.string('email', 'email', method='jarowinkler', threshold=0.85)
30
+ compare.numeric('age', 'age', method='linear', offset=2)
31
+
32
+ features = compare.compute(candidate_pairs, df1, df2)
33
+ matches = features[features.sum(axis=1) > 2.5]
34
+
35
+ return matches
36
+ ```
37
+
38
+ ## Integration
39
+
40
+ **Connects with:** dg-01 (Catalog), dg-03 (Quality), de-02 (ETL)
@@ -1,46 +1,46 @@
1
- # dg-06: Compliance & Privacy
2
-
3
- ## Overview
4
-
5
- GDPR compliance automation, data retention policies, right to be forgotten, consent management, and privacy impact assessments.
6
-
7
- ## Key Capabilities
8
-
9
- - **GDPR Automation**: Automated compliance checks
10
- - **Data Retention**: Automated data lifecycle
11
- - **Right to be Forgotten**: Delete personal data on request
12
- - **Consent Management**: Track user consent
13
- - **Privacy Impact Assessments**: Risk assessment
14
-
15
- ## Implementation
16
-
17
- ```python
18
- # Right to be forgotten
19
- def delete_user_data(user_id):
20
- """Delete all personal data for a user"""
21
- tables = [
22
- 'customers', 'orders', 'payments',
23
- 'preferences', 'analytics_events'
24
- ]
25
-
26
- for table in tables:
27
- spark.sql(f"""
28
- DELETE FROM {table}
29
- WHERE user_id = '{user_id}'
30
- """)
31
-
32
- # Log deletion for audit
33
- log_gdpr_deletion(user_id, tables)
34
-
35
- # Data retention policy
36
- def apply_retention_policy():
37
- """Delete data past retention period"""
38
- spark.sql("""
39
- DELETE FROM customer_events
40
- WHERE event_date < DATE_SUB(CURRENT_DATE(), 730) -- 2 years
41
- """)
42
- ```
43
-
44
- ## Integration
45
-
46
- **Connects with:** sa-01 (PII Detection), dg-01 (Catalog), dg-04 (Access Control)
1
+ # dg-06: Compliance & Privacy
2
+
3
+ ## Overview
4
+
5
+ GDPR compliance automation, data retention policies, right to be forgotten, consent management, and privacy impact assessments.
6
+
7
+ ## Key Capabilities
8
+
9
+ - **GDPR Automation**: Automated compliance checks
10
+ - **Data Retention**: Automated data lifecycle
11
+ - **Right to be Forgotten**: Delete personal data on request
12
+ - **Consent Management**: Track user consent
13
+ - **Privacy Impact Assessments**: Risk assessment
14
+
15
+ ## Implementation
16
+
17
+ ```python
18
+ # Right to be forgotten
19
+ def delete_user_data(user_id):
20
+ """Delete all personal data for a user"""
21
+ tables = [
22
+ 'customers', 'orders', 'payments',
23
+ 'preferences', 'analytics_events'
24
+ ]
25
+
26
+ for table in tables:
27
+ spark.sql(f"""
28
+ DELETE FROM {table}
29
+ WHERE user_id = '{user_id}'
30
+ """)
31
+
32
+ # Log deletion for audit
33
+ log_gdpr_deletion(user_id, tables)
34
+
35
+ # Data retention policy
36
+ def apply_retention_policy():
37
+ """Delete data past retention period"""
38
+ spark.sql("""
39
+ DELETE FROM customer_events
40
+ WHERE event_date < DATE_SUB(CURRENT_DATE(), 730) -- 2 years
41
+ """)
42
+ ```
43
+
44
+ ## Integration
45
+
46
+ **Connects with:** sa-01 (PII Detection), dg-01 (Catalog), dg-04 (Access Control)