tech-hub-skills 1.2.0 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/{LICENSE → .claude/LICENSE} +21 -21
- package/.claude/README.md +291 -0
- package/.claude/bin/cli.js +266 -0
- package/{bin → .claude/bin}/copilot.js +182 -182
- package/{bin → .claude/bin}/postinstall.js +42 -42
- package/{tech_hub_skills/skills → .claude/commands}/README.md +336 -336
- package/{tech_hub_skills/skills → .claude/commands}/ai-engineer.md +104 -104
- package/{tech_hub_skills/skills → .claude/commands}/aws.md +143 -143
- package/{tech_hub_skills/skills → .claude/commands}/azure.md +149 -149
- package/{tech_hub_skills/skills → .claude/commands}/backend-developer.md +108 -108
- package/{tech_hub_skills/skills → .claude/commands}/code-review.md +399 -399
- package/{tech_hub_skills/skills → .claude/commands}/compliance-automation.md +747 -747
- package/{tech_hub_skills/skills → .claude/commands}/compliance-officer.md +108 -108
- package/{tech_hub_skills/skills → .claude/commands}/data-engineer.md +113 -113
- package/{tech_hub_skills/skills → .claude/commands}/data-governance.md +102 -102
- package/{tech_hub_skills/skills → .claude/commands}/data-scientist.md +123 -123
- package/{tech_hub_skills/skills → .claude/commands}/database-admin.md +109 -109
- package/{tech_hub_skills/skills → .claude/commands}/devops.md +160 -160
- package/{tech_hub_skills/skills → .claude/commands}/docker.md +160 -160
- package/{tech_hub_skills/skills → .claude/commands}/enterprise-dashboard.md +613 -613
- package/{tech_hub_skills/skills → .claude/commands}/finops.md +184 -184
- package/{tech_hub_skills/skills → .claude/commands}/frontend-developer.md +108 -108
- package/{tech_hub_skills/skills → .claude/commands}/gcp.md +143 -143
- package/{tech_hub_skills/skills → .claude/commands}/ml-engineer.md +115 -115
- package/{tech_hub_skills/skills → .claude/commands}/mlops.md +187 -187
- package/{tech_hub_skills/skills → .claude/commands}/network-engineer.md +109 -109
- package/{tech_hub_skills/skills → .claude/commands}/optimization-advisor.md +329 -329
- package/{tech_hub_skills/skills → .claude/commands}/orchestrator.md +623 -623
- package/{tech_hub_skills/skills → .claude/commands}/platform-engineer.md +102 -102
- package/{tech_hub_skills/skills → .claude/commands}/process-automation.md +226 -226
- package/{tech_hub_skills/skills → .claude/commands}/process-changelog.md +184 -184
- package/{tech_hub_skills/skills → .claude/commands}/process-documentation.md +484 -484
- package/{tech_hub_skills/skills → .claude/commands}/process-kanban.md +324 -324
- package/{tech_hub_skills/skills → .claude/commands}/process-versioning.md +214 -214
- package/{tech_hub_skills/skills → .claude/commands}/product-designer.md +104 -104
- package/{tech_hub_skills/skills → .claude/commands}/project-starter.md +443 -443
- package/{tech_hub_skills/skills → .claude/commands}/qa-engineer.md +109 -109
- package/{tech_hub_skills/skills → .claude/commands}/security-architect.md +135 -135
- package/{tech_hub_skills/skills → .claude/commands}/sre.md +109 -109
- package/{tech_hub_skills/skills → .claude/commands}/system-design.md +126 -126
- package/{tech_hub_skills/skills → .claude/commands}/technical-writer.md +101 -101
- package/.claude/package.json +46 -0
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -0
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -0
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -0
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
- package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -0
- package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -0
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
- package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
- package/{tech_hub_skills → .claude}/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/02-data-factory/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/03-synapse-analytics/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/04-databricks/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/05-functions/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/06-kubernetes-service/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/07-openai-service/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/08-machine-learning/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/09-storage-adls/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/10-networking/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/11-sql-cosmos/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/azure/skills/12-event-hubs/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/code-review/skills/01-automated-code-review/README.md +394 -394
- package/{tech_hub_skills → .claude}/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
- package/{tech_hub_skills → .claude}/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
- package/{tech_hub_skills → .claude}/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
- package/{tech_hub_skills → .claude}/roles/code-review/skills/05-review-analytics/README.md +540 -540
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
- package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -0
- package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -0
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/03-data-quality/README.md +579 -579
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
- package/{tech_hub_skills → .claude}/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/01-data-catalog/README.md +112 -112
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/02-data-lineage/README.md +129 -129
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/04-access-control/README.md +39 -39
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/05-master-data-management/README.md +40 -40
- package/{tech_hub_skills → .claude}/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
- package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -0
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/07-experimentation/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/02-container-orchestration/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/04-gitops/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/05-environment-management/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/06-automated-testing/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/07-release-management/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/devops/skills/09-devsecops/README.md +265 -265
- package/{tech_hub_skills → .claude}/roles/finops/skills/01-cost-visibility/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/02-resource-tagging/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/03-budget-management/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/04-reserved-instances/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/05-spot-optimization/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/06-storage-tiering/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/finops/skills/08-chargeback/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/03-model-training/README.md +704 -704
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
- package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
- package/{tech_hub_skills → .claude}/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/03-model-registry/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/04-feature-store/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/05-model-deployment/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/06-model-observability/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/07-data-versioning/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/08-ab-testing/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/mlops/skills/09-automated-retraining/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
- package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/02-user-research/README.md +382 -382
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/04-ux-design/README.md +496 -496
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
- package/{tech_hub_skills → .claude}/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/01-pii-detection/README.md +319 -319
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/04-iam/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/05-application-security/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/06-secrets-management/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
- package/{tech_hub_skills → .claude}/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/03-scalability/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/04-high-availability/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/06-api-design/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/07-observability-architecture/README.md +264 -264
- package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
- package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/README.md +521 -521
- package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -0
- package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -0
- package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -0
- package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -0
- package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -0
- package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -0
- package/.claude/skills/README.md +336 -0
- package/.claude/skills/ai-engineer.md +104 -0
- package/.claude/skills/aws.md +143 -0
- package/.claude/skills/azure.md +149 -0
- package/.claude/skills/backend-developer.md +108 -0
- package/.claude/skills/code-review.md +399 -0
- package/.claude/skills/compliance-automation.md +747 -0
- package/.claude/skills/compliance-officer.md +108 -0
- package/.claude/skills/data-engineer.md +113 -0
- package/.claude/skills/data-governance.md +102 -0
- package/.claude/skills/data-scientist.md +123 -0
- package/.claude/skills/database-admin.md +109 -0
- package/.claude/skills/devops.md +160 -0
- package/.claude/skills/docker.md +160 -0
- package/.claude/skills/enterprise-dashboard.md +613 -0
- package/.claude/skills/finops.md +184 -0
- package/.claude/skills/frontend-developer.md +108 -0
- package/.claude/skills/gcp.md +143 -0
- package/.claude/skills/ml-engineer.md +115 -0
- package/.claude/skills/mlops.md +187 -0
- package/.claude/skills/network-engineer.md +109 -0
- package/.claude/skills/optimization-advisor.md +329 -0
- package/.claude/skills/orchestrator.md +623 -0
- package/.claude/skills/platform-engineer.md +102 -0
- package/.claude/skills/process-automation.md +226 -0
- package/.claude/skills/process-changelog.md +184 -0
- package/.claude/skills/process-documentation.md +484 -0
- package/.claude/skills/process-kanban.md +324 -0
- package/.claude/skills/process-versioning.md +214 -0
- package/.claude/skills/product-designer.md +104 -0
- package/.claude/skills/project-starter.md +443 -0
- package/.claude/skills/qa-engineer.md +109 -0
- package/.claude/skills/security-architect.md +135 -0
- package/.claude/skills/sre.md +109 -0
- package/.claude/skills/system-design.md +126 -0
- package/.claude/skills/technical-writer.md +101 -0
- package/.gitattributes +2 -0
- package/GITHUB_COPILOT.md +106 -0
- package/README.md +192 -291
- package/package.json +16 -46
- package/bin/cli.js +0 -241
package/{tech_hub_skills → .claude}/roles/data-governance/skills/03-data-quality-framework/README.md
RENAMED
|
@@ -1,182 +1,182 @@
|
|
|
1
|
-
# dg-03: Data Quality Framework
|
|
2
|
-
|
|
3
|
-
## Overview
|
|
4
|
-
|
|
5
|
-
Implement automated data quality validation, scoring, monitoring, and issue remediation workflows.
|
|
6
|
-
|
|
7
|
-
## Key Capabilities
|
|
8
|
-
|
|
9
|
-
- **Quality Rules Definition**: Completeness, accuracy, consistency
|
|
10
|
-
- **Automated Validation**: Real-time quality checks
|
|
11
|
-
- **Quality Scoring**: Quantifiable quality metrics
|
|
12
|
-
- **Quality Monitoring**: Continuous quality tracking
|
|
13
|
-
- **Issue Remediation**: Workflows for quality issues
|
|
14
|
-
|
|
15
|
-
## Tools & Technologies
|
|
16
|
-
|
|
17
|
-
- **Great Expectations**: Python data validation
|
|
18
|
-
- **Soda**: Data quality as code
|
|
19
|
-
- **dbt tests**: Quality tests in dbt
|
|
20
|
-
- **Azure Data Quality**: Native Azure solution
|
|
21
|
-
|
|
22
|
-
## Implementation
|
|
23
|
-
|
|
24
|
-
### 1. Quality Rules with Great Expectations
|
|
25
|
-
|
|
26
|
-
```python
|
|
27
|
-
# Define quality expectations
|
|
28
|
-
import great_expectations as gx
|
|
29
|
-
|
|
30
|
-
def create_quality_suite(context, table_name):
|
|
31
|
-
"""Create data quality test suite"""
|
|
32
|
-
suite = context.add_expectation_suite(
|
|
33
|
-
expectation_suite_name=f"{table_name}_quality_suite"
|
|
34
|
-
)
|
|
35
|
-
|
|
36
|
-
validator = context.get_validator(
|
|
37
|
-
batch_request=batch_request,
|
|
38
|
-
expectation_suite_name=suite.expectation_suite_name
|
|
39
|
-
)
|
|
40
|
-
|
|
41
|
-
# Completeness checks
|
|
42
|
-
validator.expect_column_values_to_not_be_null(column="customer_id")
|
|
43
|
-
validator.expect_column_values_to_not_be_null(column="order_date")
|
|
44
|
-
|
|
45
|
-
# Accuracy checks
|
|
46
|
-
validator.expect_column_values_to_be_between(
|
|
47
|
-
column="age",
|
|
48
|
-
min_value=0,
|
|
49
|
-
max_value=120
|
|
50
|
-
)
|
|
51
|
-
|
|
52
|
-
# Consistency checks
|
|
53
|
-
validator.expect_column_values_to_match_regex(
|
|
54
|
-
column="email",
|
|
55
|
-
regex=r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
|
|
56
|
-
)
|
|
57
|
-
|
|
58
|
-
validator.save_expectation_suite()
|
|
59
|
-
return validator
|
|
60
|
-
```
|
|
61
|
-
|
|
62
|
-
### 2. Quality Scoring
|
|
63
|
-
|
|
64
|
-
```python
|
|
65
|
-
# Calculate quality score
|
|
66
|
-
def calculate_quality_score(validation_results):
|
|
67
|
-
"""Calculate overall quality score"""
|
|
68
|
-
total_checks = validation_results.statistics['evaluated_expectations']
|
|
69
|
-
successful_checks = validation_results.statistics['successful_expectations']
|
|
70
|
-
|
|
71
|
-
score = (successful_checks / total_checks) * 100
|
|
72
|
-
|
|
73
|
-
# Categorize quality
|
|
74
|
-
if score >= 95:
|
|
75
|
-
quality_level = "Excellent"
|
|
76
|
-
elif score >= 85:
|
|
77
|
-
quality_level = "Good"
|
|
78
|
-
elif score >= 70:
|
|
79
|
-
quality_level = "Acceptable"
|
|
80
|
-
else:
|
|
81
|
-
quality_level = "Poor"
|
|
82
|
-
|
|
83
|
-
return {
|
|
84
|
-
'score': score,
|
|
85
|
-
'level': quality_level,
|
|
86
|
-
'total_checks': total_checks,
|
|
87
|
-
'passed_checks': successful_checks
|
|
88
|
-
}
|
|
89
|
-
```
|
|
90
|
-
|
|
91
|
-
### 3. Automated Monitoring
|
|
92
|
-
|
|
93
|
-
```python
|
|
94
|
-
# Set up quality monitoring
|
|
95
|
-
def setup_quality_monitoring(checkpoint_name):
|
|
96
|
-
"""Configure automated quality monitoring"""
|
|
97
|
-
checkpoint_config = {
|
|
98
|
-
"name": checkpoint_name,
|
|
99
|
-
"config_version": 1.0,
|
|
100
|
-
"template_name": "default",
|
|
101
|
-
"run_name_template": "%Y%m%d-%H%M%S",
|
|
102
|
-
"validations": [
|
|
103
|
-
{
|
|
104
|
-
"batch_request": {
|
|
105
|
-
"datasource_name": "production_data",
|
|
106
|
-
"data_connector_name": "default_inferred_data_connector_name",
|
|
107
|
-
"data_asset_name": "customers"
|
|
108
|
-
},
|
|
109
|
-
"expectation_suite_name": "customers_quality_suite"
|
|
110
|
-
}
|
|
111
|
-
],
|
|
112
|
-
"action_list": [
|
|
113
|
-
{
|
|
114
|
-
"name": "store_validation_result",
|
|
115
|
-
"action": {"class_name": "StoreValidationResultAction"}
|
|
116
|
-
},
|
|
117
|
-
{
|
|
118
|
-
"name": "send_slack_notification",
|
|
119
|
-
"action": {
|
|
120
|
-
"class_name": "SlackNotificationAction",
|
|
121
|
-
"slack_webhook": "${SLACK_WEBHOOK}",
|
|
122
|
-
"notify_on": "failure"
|
|
123
|
-
}
|
|
124
|
-
}
|
|
125
|
-
]
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
context.add_checkpoint(**checkpoint_config)
|
|
129
|
-
```
|
|
130
|
-
|
|
131
|
-
### 4. Issue Remediation Workflow
|
|
132
|
-
|
|
133
|
-
```python
|
|
134
|
-
# Create remediation workflow
|
|
135
|
-
def create_remediation_workflow(quality_issues):
|
|
136
|
-
"""Create tickets for quality issues"""
|
|
137
|
-
from azure.devops import AzureDevOpsClient
|
|
138
|
-
|
|
139
|
-
client = AzureDevOpsClient()
|
|
140
|
-
|
|
141
|
-
for issue in quality_issues:
|
|
142
|
-
work_item = {
|
|
143
|
-
'title': f"Data Quality Issue: {issue['column']}",
|
|
144
|
-
'description': issue['description'],
|
|
145
|
-
'priority': issue['severity'],
|
|
146
|
-
'assigned_to': issue['data_owner'],
|
|
147
|
-
'tags': ['data-quality', issue['table']]
|
|
148
|
-
}
|
|
149
|
-
|
|
150
|
-
client.create_work_item(
|
|
151
|
-
project='DataGovernance',
|
|
152
|
-
work_item_type='Bug',
|
|
153
|
-
fields=work_item
|
|
154
|
-
)
|
|
155
|
-
```
|
|
156
|
-
|
|
157
|
-
## Best Practices
|
|
158
|
-
|
|
159
|
-
1. **Start Simple** - Begin with critical fields, expand coverage
|
|
160
|
-
2. **Automate Everything** - Manual checks don't scale
|
|
161
|
-
3. **Clear Ownership** - Assign quality issues to data owners
|
|
162
|
-
4. **Threshold Alerts** - Alert on quality score drops
|
|
163
|
-
5. **Historical Tracking** - Monitor quality trends over time
|
|
164
|
-
|
|
165
|
-
## Cost Optimization
|
|
166
|
-
|
|
167
|
-
- Run quality checks incrementally (only new/changed data)
|
|
168
|
-
- Use sampling for large datasets
|
|
169
|
-
- Cache validation results
|
|
170
|
-
- Right-size validation compute
|
|
171
|
-
|
|
172
|
-
## Integration
|
|
173
|
-
|
|
174
|
-
**Connects with:**
|
|
175
|
-
- de-01 (Lakehouse): Validate lakehouse data
|
|
176
|
-
- de-03 (Data Quality): Engineering quality checks
|
|
177
|
-
- dg-01 (Catalog): Link quality scores to assets
|
|
178
|
-
- dg-02 (Lineage): Trace quality issues to source
|
|
179
|
-
|
|
180
|
-
## Quick Win
|
|
181
|
-
|
|
182
|
-
Implement completeness checks on 5 critical fields in your most important table. Show before/after quality scores.
|
|
1
|
+
# dg-03: Data Quality Framework
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Implement automated data quality validation, scoring, monitoring, and issue remediation workflows.
|
|
6
|
+
|
|
7
|
+
## Key Capabilities
|
|
8
|
+
|
|
9
|
+
- **Quality Rules Definition**: Completeness, accuracy, consistency
|
|
10
|
+
- **Automated Validation**: Real-time quality checks
|
|
11
|
+
- **Quality Scoring**: Quantifiable quality metrics
|
|
12
|
+
- **Quality Monitoring**: Continuous quality tracking
|
|
13
|
+
- **Issue Remediation**: Workflows for quality issues
|
|
14
|
+
|
|
15
|
+
## Tools & Technologies
|
|
16
|
+
|
|
17
|
+
- **Great Expectations**: Python data validation
|
|
18
|
+
- **Soda**: Data quality as code
|
|
19
|
+
- **dbt tests**: Quality tests in dbt
|
|
20
|
+
- **Azure Data Quality**: Native Azure solution
|
|
21
|
+
|
|
22
|
+
## Implementation
|
|
23
|
+
|
|
24
|
+
### 1. Quality Rules with Great Expectations
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
# Define quality expectations
|
|
28
|
+
import great_expectations as gx
|
|
29
|
+
|
|
30
|
+
def create_quality_suite(context, table_name):
|
|
31
|
+
"""Create data quality test suite"""
|
|
32
|
+
suite = context.add_expectation_suite(
|
|
33
|
+
expectation_suite_name=f"{table_name}_quality_suite"
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
validator = context.get_validator(
|
|
37
|
+
batch_request=batch_request,
|
|
38
|
+
expectation_suite_name=suite.expectation_suite_name
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Completeness checks
|
|
42
|
+
validator.expect_column_values_to_not_be_null(column="customer_id")
|
|
43
|
+
validator.expect_column_values_to_not_be_null(column="order_date")
|
|
44
|
+
|
|
45
|
+
# Accuracy checks
|
|
46
|
+
validator.expect_column_values_to_be_between(
|
|
47
|
+
column="age",
|
|
48
|
+
min_value=0,
|
|
49
|
+
max_value=120
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Consistency checks
|
|
53
|
+
validator.expect_column_values_to_match_regex(
|
|
54
|
+
column="email",
|
|
55
|
+
regex=r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
validator.save_expectation_suite()
|
|
59
|
+
return validator
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### 2. Quality Scoring
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
# Calculate quality score
|
|
66
|
+
def calculate_quality_score(validation_results):
|
|
67
|
+
"""Calculate overall quality score"""
|
|
68
|
+
total_checks = validation_results.statistics['evaluated_expectations']
|
|
69
|
+
successful_checks = validation_results.statistics['successful_expectations']
|
|
70
|
+
|
|
71
|
+
score = (successful_checks / total_checks) * 100
|
|
72
|
+
|
|
73
|
+
# Categorize quality
|
|
74
|
+
if score >= 95:
|
|
75
|
+
quality_level = "Excellent"
|
|
76
|
+
elif score >= 85:
|
|
77
|
+
quality_level = "Good"
|
|
78
|
+
elif score >= 70:
|
|
79
|
+
quality_level = "Acceptable"
|
|
80
|
+
else:
|
|
81
|
+
quality_level = "Poor"
|
|
82
|
+
|
|
83
|
+
return {
|
|
84
|
+
'score': score,
|
|
85
|
+
'level': quality_level,
|
|
86
|
+
'total_checks': total_checks,
|
|
87
|
+
'passed_checks': successful_checks
|
|
88
|
+
}
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### 3. Automated Monitoring
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
# Set up quality monitoring
|
|
95
|
+
def setup_quality_monitoring(checkpoint_name):
|
|
96
|
+
"""Configure automated quality monitoring"""
|
|
97
|
+
checkpoint_config = {
|
|
98
|
+
"name": checkpoint_name,
|
|
99
|
+
"config_version": 1.0,
|
|
100
|
+
"template_name": "default",
|
|
101
|
+
"run_name_template": "%Y%m%d-%H%M%S",
|
|
102
|
+
"validations": [
|
|
103
|
+
{
|
|
104
|
+
"batch_request": {
|
|
105
|
+
"datasource_name": "production_data",
|
|
106
|
+
"data_connector_name": "default_inferred_data_connector_name",
|
|
107
|
+
"data_asset_name": "customers"
|
|
108
|
+
},
|
|
109
|
+
"expectation_suite_name": "customers_quality_suite"
|
|
110
|
+
}
|
|
111
|
+
],
|
|
112
|
+
"action_list": [
|
|
113
|
+
{
|
|
114
|
+
"name": "store_validation_result",
|
|
115
|
+
"action": {"class_name": "StoreValidationResultAction"}
|
|
116
|
+
},
|
|
117
|
+
{
|
|
118
|
+
"name": "send_slack_notification",
|
|
119
|
+
"action": {
|
|
120
|
+
"class_name": "SlackNotificationAction",
|
|
121
|
+
"slack_webhook": "${SLACK_WEBHOOK}",
|
|
122
|
+
"notify_on": "failure"
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
]
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
context.add_checkpoint(**checkpoint_config)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### 4. Issue Remediation Workflow
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
# Create remediation workflow
|
|
135
|
+
def create_remediation_workflow(quality_issues):
|
|
136
|
+
"""Create tickets for quality issues"""
|
|
137
|
+
from azure.devops import AzureDevOpsClient
|
|
138
|
+
|
|
139
|
+
client = AzureDevOpsClient()
|
|
140
|
+
|
|
141
|
+
for issue in quality_issues:
|
|
142
|
+
work_item = {
|
|
143
|
+
'title': f"Data Quality Issue: {issue['column']}",
|
|
144
|
+
'description': issue['description'],
|
|
145
|
+
'priority': issue['severity'],
|
|
146
|
+
'assigned_to': issue['data_owner'],
|
|
147
|
+
'tags': ['data-quality', issue['table']]
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
client.create_work_item(
|
|
151
|
+
project='DataGovernance',
|
|
152
|
+
work_item_type='Bug',
|
|
153
|
+
fields=work_item
|
|
154
|
+
)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## Best Practices
|
|
158
|
+
|
|
159
|
+
1. **Start Simple** - Begin with critical fields, expand coverage
|
|
160
|
+
2. **Automate Everything** - Manual checks don't scale
|
|
161
|
+
3. **Clear Ownership** - Assign quality issues to data owners
|
|
162
|
+
4. **Threshold Alerts** - Alert on quality score drops
|
|
163
|
+
5. **Historical Tracking** - Monitor quality trends over time
|
|
164
|
+
|
|
165
|
+
## Cost Optimization
|
|
166
|
+
|
|
167
|
+
- Run quality checks incrementally (only new/changed data)
|
|
168
|
+
- Use sampling for large datasets
|
|
169
|
+
- Cache validation results
|
|
170
|
+
- Right-size validation compute
|
|
171
|
+
|
|
172
|
+
## Integration
|
|
173
|
+
|
|
174
|
+
**Connects with:**
|
|
175
|
+
- de-01 (Lakehouse): Validate lakehouse data
|
|
176
|
+
- de-03 (Data Quality): Engineering quality checks
|
|
177
|
+
- dg-01 (Catalog): Link quality scores to assets
|
|
178
|
+
- dg-02 (Lineage): Trace quality issues to source
|
|
179
|
+
|
|
180
|
+
## Quick Win
|
|
181
|
+
|
|
182
|
+
Implement completeness checks on 5 critical fields in your most important table. Show before/after quality scores.
|
package/{tech_hub_skills → .claude}/roles/data-governance/skills/04-access-control/README.md
RENAMED
|
@@ -1,39 +1,39 @@
|
|
|
1
|
-
# dg-04: Access Control & Policies
|
|
2
|
-
|
|
3
|
-
## Overview
|
|
4
|
-
|
|
5
|
-
Implement role-based access control, column/row-level security, dynamic data masking, and access audit logging.
|
|
6
|
-
|
|
7
|
-
## Key Capabilities
|
|
8
|
-
|
|
9
|
-
- **RBAC**: Role-based access control
|
|
10
|
-
- **Column-Level Security**: Restrict sensitive columns
|
|
11
|
-
- **Row-Level Security**: Filter data by user context
|
|
12
|
-
- **Dynamic Data Masking**: Auto-mask sensitive data
|
|
13
|
-
- **Access Audit Logging**: Track all data access
|
|
14
|
-
|
|
15
|
-
## Implementation
|
|
16
|
-
|
|
17
|
-
```sql
|
|
18
|
-
-- Column-level security
|
|
19
|
-
CREATE VIEW customer_secure AS
|
|
20
|
-
SELECT
|
|
21
|
-
customer_id,
|
|
22
|
-
CASE
|
|
23
|
-
WHEN CURRENT_USER() IN (SELECT user FROM admin_users)
|
|
24
|
-
THEN email -- Show full email to admins
|
|
25
|
-
ELSE CONCAT(LEFT(email, 3), '***@', SPLIT_PART(email, '@', 2)) -- Mask for others
|
|
26
|
-
END as email,
|
|
27
|
-
first_name,
|
|
28
|
-
last_name
|
|
29
|
-
FROM customers;
|
|
30
|
-
|
|
31
|
-
-- Row-level security
|
|
32
|
-
CREATE POLICY customer_region_policy ON customers
|
|
33
|
-
FOR SELECT
|
|
34
|
-
USING (region = current_setting('app.user_region'));
|
|
35
|
-
```
|
|
36
|
-
|
|
37
|
-
## Integration
|
|
38
|
-
|
|
39
|
-
**Connects with:** sa-01 (PII Detection), sa-04 (IAM), dg-01 (Catalog)
|
|
1
|
+
# dg-04: Access Control & Policies
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Implement role-based access control, column/row-level security, dynamic data masking, and access audit logging.
|
|
6
|
+
|
|
7
|
+
## Key Capabilities
|
|
8
|
+
|
|
9
|
+
- **RBAC**: Role-based access control
|
|
10
|
+
- **Column-Level Security**: Restrict sensitive columns
|
|
11
|
+
- **Row-Level Security**: Filter data by user context
|
|
12
|
+
- **Dynamic Data Masking**: Auto-mask sensitive data
|
|
13
|
+
- **Access Audit Logging**: Track all data access
|
|
14
|
+
|
|
15
|
+
## Implementation
|
|
16
|
+
|
|
17
|
+
```sql
|
|
18
|
+
-- Column-level security
|
|
19
|
+
CREATE VIEW customer_secure AS
|
|
20
|
+
SELECT
|
|
21
|
+
customer_id,
|
|
22
|
+
CASE
|
|
23
|
+
WHEN CURRENT_USER() IN (SELECT user FROM admin_users)
|
|
24
|
+
THEN email -- Show full email to admins
|
|
25
|
+
ELSE CONCAT(LEFT(email, 3), '***@', SPLIT_PART(email, '@', 2)) -- Mask for others
|
|
26
|
+
END as email,
|
|
27
|
+
first_name,
|
|
28
|
+
last_name
|
|
29
|
+
FROM customers;
|
|
30
|
+
|
|
31
|
+
-- Row-level security
|
|
32
|
+
CREATE POLICY customer_region_policy ON customers
|
|
33
|
+
FOR SELECT
|
|
34
|
+
USING (region = current_setting('app.user_region'));
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Integration
|
|
38
|
+
|
|
39
|
+
**Connects with:** sa-01 (PII Detection), sa-04 (IAM), dg-01 (Catalog)
|
package/{tech_hub_skills → .claude}/roles/data-governance/skills/05-master-data-management/README.md
RENAMED
|
@@ -1,40 +1,40 @@
|
|
|
1
|
-
# dg-05: Master Data Management
|
|
2
|
-
|
|
3
|
-
## Overview
|
|
4
|
-
|
|
5
|
-
Entity resolution, golden record creation, data stewardship, and hierarchy management for critical business entities.
|
|
6
|
-
|
|
7
|
-
## Key Capabilities
|
|
8
|
-
|
|
9
|
-
- **Entity Resolution**: Match and merge duplicate entities
|
|
10
|
-
- **Golden Record**: Single source of truth
|
|
11
|
-
- **Data Stewardship**: Workflows for data quality
|
|
12
|
-
- **Cross-Reference**: Link entities across systems
|
|
13
|
-
- **Hierarchy Management**: Organizational structures
|
|
14
|
-
|
|
15
|
-
## Implementation
|
|
16
|
-
|
|
17
|
-
```python
|
|
18
|
-
# Entity resolution
|
|
19
|
-
from recordlinkage import Index, Compare
|
|
20
|
-
|
|
21
|
-
def match_customers(df1, df2):
|
|
22
|
-
"""Match customer records across systems"""
|
|
23
|
-
indexer = Index()
|
|
24
|
-
indexer.block('last_name')
|
|
25
|
-
candidate_pairs = indexer.index(df1, df2)
|
|
26
|
-
|
|
27
|
-
compare = Compare()
|
|
28
|
-
compare.exact('first_name', 'first_name')
|
|
29
|
-
compare.string('email', 'email', method='jarowinkler', threshold=0.85)
|
|
30
|
-
compare.numeric('age', 'age', method='linear', offset=2)
|
|
31
|
-
|
|
32
|
-
features = compare.compute(candidate_pairs, df1, df2)
|
|
33
|
-
matches = features[features.sum(axis=1) > 2.5]
|
|
34
|
-
|
|
35
|
-
return matches
|
|
36
|
-
```
|
|
37
|
-
|
|
38
|
-
## Integration
|
|
39
|
-
|
|
40
|
-
**Connects with:** dg-01 (Catalog), dg-03 (Quality), de-02 (ETL)
|
|
1
|
+
# dg-05: Master Data Management
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Entity resolution, golden record creation, data stewardship, and hierarchy management for critical business entities.
|
|
6
|
+
|
|
7
|
+
## Key Capabilities
|
|
8
|
+
|
|
9
|
+
- **Entity Resolution**: Match and merge duplicate entities
|
|
10
|
+
- **Golden Record**: Single source of truth
|
|
11
|
+
- **Data Stewardship**: Workflows for data quality
|
|
12
|
+
- **Cross-Reference**: Link entities across systems
|
|
13
|
+
- **Hierarchy Management**: Organizational structures
|
|
14
|
+
|
|
15
|
+
## Implementation
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
# Entity resolution
|
|
19
|
+
from recordlinkage import Index, Compare
|
|
20
|
+
|
|
21
|
+
def match_customers(df1, df2):
|
|
22
|
+
"""Match customer records across systems"""
|
|
23
|
+
indexer = Index()
|
|
24
|
+
indexer.block('last_name')
|
|
25
|
+
candidate_pairs = indexer.index(df1, df2)
|
|
26
|
+
|
|
27
|
+
compare = Compare()
|
|
28
|
+
compare.exact('first_name', 'first_name')
|
|
29
|
+
compare.string('email', 'email', method='jarowinkler', threshold=0.85)
|
|
30
|
+
compare.numeric('age', 'age', method='linear', offset=2)
|
|
31
|
+
|
|
32
|
+
features = compare.compute(candidate_pairs, df1, df2)
|
|
33
|
+
matches = features[features.sum(axis=1) > 2.5]
|
|
34
|
+
|
|
35
|
+
return matches
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Integration
|
|
39
|
+
|
|
40
|
+
**Connects with:** dg-01 (Catalog), dg-03 (Quality), de-02 (ETL)
|
package/{tech_hub_skills → .claude}/roles/data-governance/skills/06-compliance-privacy/README.md
RENAMED
|
@@ -1,46 +1,46 @@
|
|
|
1
|
-
# dg-06: Compliance & Privacy
|
|
2
|
-
|
|
3
|
-
## Overview
|
|
4
|
-
|
|
5
|
-
GDPR compliance automation, data retention policies, right to be forgotten, consent management, and privacy impact assessments.
|
|
6
|
-
|
|
7
|
-
## Key Capabilities
|
|
8
|
-
|
|
9
|
-
- **GDPR Automation**: Automated compliance checks
|
|
10
|
-
- **Data Retention**: Automated data lifecycle
|
|
11
|
-
- **Right to be Forgotten**: Delete personal data on request
|
|
12
|
-
- **Consent Management**: Track user consent
|
|
13
|
-
- **Privacy Impact Assessments**: Risk assessment
|
|
14
|
-
|
|
15
|
-
## Implementation
|
|
16
|
-
|
|
17
|
-
```python
|
|
18
|
-
# Right to be forgotten
|
|
19
|
-
def delete_user_data(user_id):
|
|
20
|
-
"""Delete all personal data for a user"""
|
|
21
|
-
tables = [
|
|
22
|
-
'customers', 'orders', 'payments',
|
|
23
|
-
'preferences', 'analytics_events'
|
|
24
|
-
]
|
|
25
|
-
|
|
26
|
-
for table in tables:
|
|
27
|
-
spark.sql(f"""
|
|
28
|
-
DELETE FROM {table}
|
|
29
|
-
WHERE user_id = '{user_id}'
|
|
30
|
-
""")
|
|
31
|
-
|
|
32
|
-
# Log deletion for audit
|
|
33
|
-
log_gdpr_deletion(user_id, tables)
|
|
34
|
-
|
|
35
|
-
# Data retention policy
|
|
36
|
-
def apply_retention_policy():
|
|
37
|
-
"""Delete data past retention period"""
|
|
38
|
-
spark.sql("""
|
|
39
|
-
DELETE FROM customer_events
|
|
40
|
-
WHERE event_date < DATE_SUB(CURRENT_DATE(), 730) -- 2 years
|
|
41
|
-
""")
|
|
42
|
-
```
|
|
43
|
-
|
|
44
|
-
## Integration
|
|
45
|
-
|
|
46
|
-
**Connects with:** sa-01 (PII Detection), dg-01 (Catalog), dg-04 (Access Control)
|
|
1
|
+
# dg-06: Compliance & Privacy
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
GDPR compliance automation, data retention policies, right to be forgotten, consent management, and privacy impact assessments.
|
|
6
|
+
|
|
7
|
+
## Key Capabilities
|
|
8
|
+
|
|
9
|
+
- **GDPR Automation**: Automated compliance checks
|
|
10
|
+
- **Data Retention**: Automated data lifecycle
|
|
11
|
+
- **Right to be Forgotten**: Delete personal data on request
|
|
12
|
+
- **Consent Management**: Track user consent
|
|
13
|
+
- **Privacy Impact Assessments**: Risk assessment
|
|
14
|
+
|
|
15
|
+
## Implementation
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
# Right to be forgotten
|
|
19
|
+
def delete_user_data(user_id):
|
|
20
|
+
"""Delete all personal data for a user"""
|
|
21
|
+
tables = [
|
|
22
|
+
'customers', 'orders', 'payments',
|
|
23
|
+
'preferences', 'analytics_events'
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
for table in tables:
|
|
27
|
+
spark.sql(f"""
|
|
28
|
+
DELETE FROM {table}
|
|
29
|
+
WHERE user_id = '{user_id}'
|
|
30
|
+
""")
|
|
31
|
+
|
|
32
|
+
# Log deletion for audit
|
|
33
|
+
log_gdpr_deletion(user_id, tables)
|
|
34
|
+
|
|
35
|
+
# Data retention policy
|
|
36
|
+
def apply_retention_policy():
|
|
37
|
+
"""Delete data past retention period"""
|
|
38
|
+
spark.sql("""
|
|
39
|
+
DELETE FROM customer_events
|
|
40
|
+
WHERE event_date < DATE_SUB(CURRENT_DATE(), 730) -- 2 years
|
|
41
|
+
""")
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Integration
|
|
45
|
+
|
|
46
|
+
**Connects with:** sa-01 (PII Detection), dg-01 (Catalog), dg-04 (Access Control)
|