tech-hub-skills 1.2.0 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. package/{LICENSE → .claude/LICENSE} +21 -21
  2. package/.claude/README.md +291 -0
  3. package/.claude/bin/cli.js +266 -0
  4. package/{bin → .claude/bin}/copilot.js +182 -182
  5. package/{bin → .claude/bin}/postinstall.js +42 -42
  6. package/{tech_hub_skills/skills → .claude/commands}/README.md +336 -336
  7. package/{tech_hub_skills/skills → .claude/commands}/ai-engineer.md +104 -104
  8. package/{tech_hub_skills/skills → .claude/commands}/aws.md +143 -143
  9. package/{tech_hub_skills/skills → .claude/commands}/azure.md +149 -149
  10. package/{tech_hub_skills/skills → .claude/commands}/backend-developer.md +108 -108
  11. package/{tech_hub_skills/skills → .claude/commands}/code-review.md +399 -399
  12. package/{tech_hub_skills/skills → .claude/commands}/compliance-automation.md +747 -747
  13. package/{tech_hub_skills/skills → .claude/commands}/compliance-officer.md +108 -108
  14. package/{tech_hub_skills/skills → .claude/commands}/data-engineer.md +113 -113
  15. package/{tech_hub_skills/skills → .claude/commands}/data-governance.md +102 -102
  16. package/{tech_hub_skills/skills → .claude/commands}/data-scientist.md +123 -123
  17. package/{tech_hub_skills/skills → .claude/commands}/database-admin.md +109 -109
  18. package/{tech_hub_skills/skills → .claude/commands}/devops.md +160 -160
  19. package/{tech_hub_skills/skills → .claude/commands}/docker.md +160 -160
  20. package/{tech_hub_skills/skills → .claude/commands}/enterprise-dashboard.md +613 -613
  21. package/{tech_hub_skills/skills → .claude/commands}/finops.md +184 -184
  22. package/{tech_hub_skills/skills → .claude/commands}/frontend-developer.md +108 -108
  23. package/{tech_hub_skills/skills → .claude/commands}/gcp.md +143 -143
  24. package/{tech_hub_skills/skills → .claude/commands}/ml-engineer.md +115 -115
  25. package/{tech_hub_skills/skills → .claude/commands}/mlops.md +187 -187
  26. package/{tech_hub_skills/skills → .claude/commands}/network-engineer.md +109 -109
  27. package/{tech_hub_skills/skills → .claude/commands}/optimization-advisor.md +329 -329
  28. package/{tech_hub_skills/skills → .claude/commands}/orchestrator.md +623 -623
  29. package/{tech_hub_skills/skills → .claude/commands}/platform-engineer.md +102 -102
  30. package/{tech_hub_skills/skills → .claude/commands}/process-automation.md +226 -226
  31. package/{tech_hub_skills/skills → .claude/commands}/process-changelog.md +184 -184
  32. package/{tech_hub_skills/skills → .claude/commands}/process-documentation.md +484 -484
  33. package/{tech_hub_skills/skills → .claude/commands}/process-kanban.md +324 -324
  34. package/{tech_hub_skills/skills → .claude/commands}/process-versioning.md +214 -214
  35. package/{tech_hub_skills/skills → .claude/commands}/product-designer.md +104 -104
  36. package/{tech_hub_skills/skills → .claude/commands}/project-starter.md +443 -443
  37. package/{tech_hub_skills/skills → .claude/commands}/qa-engineer.md +109 -109
  38. package/{tech_hub_skills/skills → .claude/commands}/security-architect.md +135 -135
  39. package/{tech_hub_skills/skills → .claude/commands}/sre.md +109 -109
  40. package/{tech_hub_skills/skills → .claude/commands}/system-design.md +126 -126
  41. package/{tech_hub_skills/skills → .claude/commands}/technical-writer.md +101 -101
  42. package/.claude/package.json +46 -0
  43. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
  44. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -0
  45. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -0
  46. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -0
  47. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
  48. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -0
  49. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -0
  50. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
  51. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
  52. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
  53. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
  54. package/{tech_hub_skills → .claude}/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
  55. package/{tech_hub_skills → .claude}/roles/azure/skills/02-data-factory/README.md +264 -264
  56. package/{tech_hub_skills → .claude}/roles/azure/skills/03-synapse-analytics/README.md +264 -264
  57. package/{tech_hub_skills → .claude}/roles/azure/skills/04-databricks/README.md +264 -264
  58. package/{tech_hub_skills → .claude}/roles/azure/skills/05-functions/README.md +264 -264
  59. package/{tech_hub_skills → .claude}/roles/azure/skills/06-kubernetes-service/README.md +264 -264
  60. package/{tech_hub_skills → .claude}/roles/azure/skills/07-openai-service/README.md +264 -264
  61. package/{tech_hub_skills → .claude}/roles/azure/skills/08-machine-learning/README.md +264 -264
  62. package/{tech_hub_skills → .claude}/roles/azure/skills/09-storage-adls/README.md +264 -264
  63. package/{tech_hub_skills → .claude}/roles/azure/skills/10-networking/README.md +264 -264
  64. package/{tech_hub_skills → .claude}/roles/azure/skills/11-sql-cosmos/README.md +264 -264
  65. package/{tech_hub_skills → .claude}/roles/azure/skills/12-event-hubs/README.md +264 -264
  66. package/{tech_hub_skills → .claude}/roles/code-review/skills/01-automated-code-review/README.md +394 -394
  67. package/{tech_hub_skills → .claude}/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
  68. package/{tech_hub_skills → .claude}/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
  69. package/{tech_hub_skills → .claude}/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
  70. package/{tech_hub_skills → .claude}/roles/code-review/skills/05-review-analytics/README.md +540 -540
  71. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
  72. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -0
  73. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -0
  74. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
  75. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/03-data-quality/README.md +579 -579
  76. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
  77. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
  78. package/{tech_hub_skills → .claude}/roles/data-governance/skills/01-data-catalog/README.md +112 -112
  79. package/{tech_hub_skills → .claude}/roles/data-governance/skills/02-data-lineage/README.md +129 -129
  80. package/{tech_hub_skills → .claude}/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
  81. package/{tech_hub_skills → .claude}/roles/data-governance/skills/04-access-control/README.md +39 -39
  82. package/{tech_hub_skills → .claude}/roles/data-governance/skills/05-master-data-management/README.md +40 -40
  83. package/{tech_hub_skills → .claude}/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
  84. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
  85. package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -0
  86. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
  87. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
  88. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
  89. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
  90. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
  91. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/07-experimentation/README.md +264 -264
  92. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
  93. package/{tech_hub_skills → .claude}/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
  94. package/{tech_hub_skills → .claude}/roles/devops/skills/02-container-orchestration/README.md +264 -264
  95. package/{tech_hub_skills → .claude}/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
  96. package/{tech_hub_skills → .claude}/roles/devops/skills/04-gitops/README.md +264 -264
  97. package/{tech_hub_skills → .claude}/roles/devops/skills/05-environment-management/README.md +264 -264
  98. package/{tech_hub_skills → .claude}/roles/devops/skills/06-automated-testing/README.md +264 -264
  99. package/{tech_hub_skills → .claude}/roles/devops/skills/07-release-management/README.md +264 -264
  100. package/{tech_hub_skills → .claude}/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
  101. package/{tech_hub_skills → .claude}/roles/devops/skills/09-devsecops/README.md +265 -265
  102. package/{tech_hub_skills → .claude}/roles/finops/skills/01-cost-visibility/README.md +264 -264
  103. package/{tech_hub_skills → .claude}/roles/finops/skills/02-resource-tagging/README.md +264 -264
  104. package/{tech_hub_skills → .claude}/roles/finops/skills/03-budget-management/README.md +264 -264
  105. package/{tech_hub_skills → .claude}/roles/finops/skills/04-reserved-instances/README.md +264 -264
  106. package/{tech_hub_skills → .claude}/roles/finops/skills/05-spot-optimization/README.md +264 -264
  107. package/{tech_hub_skills → .claude}/roles/finops/skills/06-storage-tiering/README.md +264 -264
  108. package/{tech_hub_skills → .claude}/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
  109. package/{tech_hub_skills → .claude}/roles/finops/skills/08-chargeback/README.md +264 -264
  110. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
  111. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
  112. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/03-model-training/README.md +704 -704
  113. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
  114. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
  115. package/{tech_hub_skills → .claude}/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
  116. package/{tech_hub_skills → .claude}/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
  117. package/{tech_hub_skills → .claude}/roles/mlops/skills/03-model-registry/README.md +264 -264
  118. package/{tech_hub_skills → .claude}/roles/mlops/skills/04-feature-store/README.md +264 -264
  119. package/{tech_hub_skills → .claude}/roles/mlops/skills/05-model-deployment/README.md +264 -264
  120. package/{tech_hub_skills → .claude}/roles/mlops/skills/06-model-observability/README.md +264 -264
  121. package/{tech_hub_skills → .claude}/roles/mlops/skills/07-data-versioning/README.md +264 -264
  122. package/{tech_hub_skills → .claude}/roles/mlops/skills/08-ab-testing/README.md +264 -264
  123. package/{tech_hub_skills → .claude}/roles/mlops/skills/09-automated-retraining/README.md +264 -264
  124. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
  125. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
  126. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
  127. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
  128. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
  129. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
  130. package/{tech_hub_skills → .claude}/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
  131. package/{tech_hub_skills → .claude}/roles/product-designer/skills/02-user-research/README.md +382 -382
  132. package/{tech_hub_skills → .claude}/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
  133. package/{tech_hub_skills → .claude}/roles/product-designer/skills/04-ux-design/README.md +496 -496
  134. package/{tech_hub_skills → .claude}/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
  135. package/{tech_hub_skills → .claude}/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
  136. package/{tech_hub_skills → .claude}/roles/security-architect/skills/01-pii-detection/README.md +319 -319
  137. package/{tech_hub_skills → .claude}/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
  138. package/{tech_hub_skills → .claude}/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
  139. package/{tech_hub_skills → .claude}/roles/security-architect/skills/04-iam/README.md +264 -264
  140. package/{tech_hub_skills → .claude}/roles/security-architect/skills/05-application-security/README.md +264 -264
  141. package/{tech_hub_skills → .claude}/roles/security-architect/skills/06-secrets-management/README.md +264 -264
  142. package/{tech_hub_skills → .claude}/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
  143. package/{tech_hub_skills → .claude}/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
  144. package/{tech_hub_skills → .claude}/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
  145. package/{tech_hub_skills → .claude}/roles/system-design/skills/03-scalability/README.md +264 -264
  146. package/{tech_hub_skills → .claude}/roles/system-design/skills/04-high-availability/README.md +264 -264
  147. package/{tech_hub_skills → .claude}/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
  148. package/{tech_hub_skills → .claude}/roles/system-design/skills/06-api-design/README.md +264 -264
  149. package/{tech_hub_skills → .claude}/roles/system-design/skills/07-observability-architecture/README.md +264 -264
  150. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
  151. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/README.md +521 -521
  152. package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -0
  153. package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -0
  154. package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -0
  155. package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -0
  156. package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -0
  157. package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -0
  158. package/.claude/skills/README.md +336 -0
  159. package/.claude/skills/ai-engineer.md +104 -0
  160. package/.claude/skills/aws.md +143 -0
  161. package/.claude/skills/azure.md +149 -0
  162. package/.claude/skills/backend-developer.md +108 -0
  163. package/.claude/skills/code-review.md +399 -0
  164. package/.claude/skills/compliance-automation.md +747 -0
  165. package/.claude/skills/compliance-officer.md +108 -0
  166. package/.claude/skills/data-engineer.md +113 -0
  167. package/.claude/skills/data-governance.md +102 -0
  168. package/.claude/skills/data-scientist.md +123 -0
  169. package/.claude/skills/database-admin.md +109 -0
  170. package/.claude/skills/devops.md +160 -0
  171. package/.claude/skills/docker.md +160 -0
  172. package/.claude/skills/enterprise-dashboard.md +613 -0
  173. package/.claude/skills/finops.md +184 -0
  174. package/.claude/skills/frontend-developer.md +108 -0
  175. package/.claude/skills/gcp.md +143 -0
  176. package/.claude/skills/ml-engineer.md +115 -0
  177. package/.claude/skills/mlops.md +187 -0
  178. package/.claude/skills/network-engineer.md +109 -0
  179. package/.claude/skills/optimization-advisor.md +329 -0
  180. package/.claude/skills/orchestrator.md +623 -0
  181. package/.claude/skills/platform-engineer.md +102 -0
  182. package/.claude/skills/process-automation.md +226 -0
  183. package/.claude/skills/process-changelog.md +184 -0
  184. package/.claude/skills/process-documentation.md +484 -0
  185. package/.claude/skills/process-kanban.md +324 -0
  186. package/.claude/skills/process-versioning.md +214 -0
  187. package/.claude/skills/product-designer.md +104 -0
  188. package/.claude/skills/project-starter.md +443 -0
  189. package/.claude/skills/qa-engineer.md +109 -0
  190. package/.claude/skills/security-architect.md +135 -0
  191. package/.claude/skills/sre.md +109 -0
  192. package/.claude/skills/system-design.md +126 -0
  193. package/.claude/skills/technical-writer.md +101 -0
  194. package/.gitattributes +2 -0
  195. package/GITHUB_COPILOT.md +106 -0
  196. package/README.md +192 -291
  197. package/package.json +16 -46
  198. package/bin/cli.js +0 -241
@@ -1,319 +1,319 @@
1
- # Skill 1: PII Detection & Data Privacy
2
-
3
- ## 🎯 Overview
4
- Automated PII detection, masking, and GDPR compliance tools.
5
-
6
- ## 🔗 Connections
7
- - **Data Engineer**: PII masking in data pipelines (de-01, de-02, de-03)
8
- - **AI Engineer**: PII filtering before RAG indexing (ai-02, ai-03)
9
- - **ML Engineer**: Remove PII before model training (ml-01, ml-02)
10
- - **Data Scientist**: PII detection in analysis datasets (ds-01)
11
- - **DevOps**: Automated PII scanning in CI/CD (do-01, do-02)
12
- - **FinOps**: Track compliance audit costs (fo-01)
13
- - **All Roles**: GDPR compliance and data protection
14
-
15
- ## 🛠️ Tools Included
16
-
17
- ### 1. `pii_detector.py`
18
- PII detection using Microsoft Presidio and custom patterns.
19
-
20
- ### 2. `data_anonymizer.py`
21
- Data anonymization with multiple strategies (masking, hashing, generalization).
22
-
23
- ### 3. `gdpr_compliance_checker.py`
24
- GDPR compliance validation and audit trails.
25
-
26
- ### 4. `consent_manager.py`
27
- User consent tracking and right-to-erasure automation.
28
-
29
- ### 5. `pii_audit_queries.sql`
30
- SQL queries for PII inventory and audit logs.
31
-
32
- ## 📊 PII Types Detected
33
- - Email addresses
34
- - Phone numbers
35
- - Credit cards
36
- - SSN / National IDs
37
- - IP addresses
38
- - Addresses
39
- - Names
40
- - Dates of birth
41
-
42
- ## 🚀 Quick Start
43
-
44
- ```python
45
- from pii_detector import PIIDetector
46
- from data_anonymizer import DataAnonymizer
47
-
48
- # Detect PII
49
- detector = PIIDetector()
50
- pii_findings = detector.analyze_text(
51
- "Contact John Smith at john.smith@email.com or 555-123-4567"
52
- )
53
-
54
- # Anonymize data
55
- anonymizer = DataAnonymizer()
56
- anonymized = anonymizer.mask_dataframe(
57
- df=customer_df,
58
- pii_columns=["email", "phone", "ssn"]
59
- )
60
- ```
61
-
62
- ## 📚 Best Practices
63
-
64
- ### Integration with Data Pipelines (Data Engineer)
65
-
66
- 1. **Bronze Layer PII Scanning**
67
- - Scan all raw data at ingestion
68
- - Tag datasets containing PII
69
- - Block high-risk PII from pipeline
70
- - Maintain PII inventory
71
- - Reference: Data Engineer de-01 (Lakehouse Architecture)
72
-
73
- 2. **Silver Layer PII Masking**
74
- - Apply masking transformations
75
- - Implement k-anonymity for aggregations
76
- - Track masked vs raw data lineage
77
- - Validate masking effectiveness
78
- - Reference: Data Engineer de-01, de-03
79
-
80
- 3. **Gold Layer Compliance**
81
- - Ensure no PII in analytics layers
82
- - Implement row-level security
83
- - Audit PII access logs
84
- - Enable right-to-erasure automation
85
- - Reference: Data Engineer de-01
86
-
87
- ### AI/ML Integration
88
-
89
- 4. **Pre-Training PII Removal**
90
- - Scan training data before ML experiments
91
- - Remove PII from feature engineering
92
- - Anonymize datasets for model development
93
- - Track data provenance for compliance
94
- - Reference: ML Engineer ml-01, ml-02
95
-
96
- 5. **RAG Knowledge Base Protection**
97
- - Scan documents before embedding
98
- - Prevent PII indexing in vector databases
99
- - Filter PII from LLM context
100
- - Audit knowledge base for compliance
101
- - Reference: AI Engineer ai-02 (RAG Pipeline)
102
-
103
- 6. **LLM Input/Output Filtering**
104
- - Detect PII in user prompts
105
- - Redact PII from LLM responses
106
- - Log PII exposure incidents
107
- - Implement real-time PII alerts
108
- - Reference: AI Engineer ai-01, ai-07
109
-
110
- ### Automation & CI/CD (DevOps Integration)
111
-
112
- 7. **Automated PII Scanning**
113
- - Integrate PII detection in CI/CD pipelines
114
- - Block commits containing PII
115
- - Scan code, configs, and test data
116
- - Automate compliance reports
117
- - Reference: DevOps do-01 (CI/CD), do-02 (Testing)
118
-
119
- 8. **Continuous Compliance Monitoring**
120
- - Schedule regular PII scans
121
- - Alert on new PII discoveries
122
- - Track remediation progress
123
- - Generate audit trails
124
- - Reference: DevOps do-08 (Monitoring)
125
-
126
- ### Cost Management (FinOps Integration)
127
-
128
- 9. **Optimize PII Scanning Costs**
129
- - Use sampling for large datasets
130
- - Cache PII detection results
131
- - Right-size scanning compute
132
- - Monitor compliance operation costs
133
- - Reference: FinOps fo-01, fo-06
134
-
135
- ### Enterprise Governance
136
-
137
- 10. **Data Governance Framework**
138
- - Classify data by sensitivity level
139
- - Implement data handling policies
140
- - Track PII across all systems
141
- - Enable compliance reporting
142
- - Reference: Security Architect sa-06 (Data Governance)
143
-
144
- 11. **GDPR Right-to-Erasure**
145
- - Automate data deletion requests
146
- - Track PII deletion across systems
147
- - Verify erasure completeness
148
- - Maintain deletion audit logs
149
- - Reference: Security Architect sa-06
150
-
151
- ## 💰 Cost Optimization Examples
152
-
153
- ### Efficient PII Scanning
154
- ```python
155
- from pii_detector import PIIDetector
156
- from finops_tracker import ComplianceCostTracker
157
-
158
- detector = PIIDetector()
159
- cost_tracker = ComplianceCostTracker()
160
-
161
- @cost_tracker.track_scan_cost
162
- def smart_pii_scan(df: pd.DataFrame, sample_size: int = 10000):
163
- # Sample for initial detection (cost savings)
164
- if len(df) > sample_size:
165
- sample_df = df.sample(n=sample_size, random_state=42)
166
- pii_columns = detector.find_pii_columns(sample_df)
167
-
168
- # Full scan only on suspected PII columns
169
- results = {}
170
- for col in pii_columns:
171
- results[col] = detector.analyze_column(df[col])
172
- else:
173
- results = detector.analyze_dataframe(df)
174
-
175
- return results
176
-
177
- # Cost report
178
- report = cost_tracker.monthly_report()
179
- print(f"PII scanning costs: ${report.total_cost:.2f}")
180
- print(f"Datasets scanned: {report.datasets_scanned}")
181
- ```
182
-
183
- ## 🚀 Automated PII Protection Pipeline
184
-
185
- ### CI/CD Integration
186
- ```yaml
187
- # .github/workflows/pii-protection.yml
188
- name: PII Protection
189
-
190
- on:
191
- push:
192
- paths:
193
- - 'data/**'
194
- - 'pipelines/**'
195
- pull_request:
196
-
197
- jobs:
198
- pii-scan:
199
- runs-on: ubuntu-latest
200
- steps:
201
- - uses: actions/checkout@v3
202
-
203
- - name: Scan code for PII patterns
204
- run: |
205
- python scripts/scan_code_for_pii.py \
206
- --fail-on-detection \
207
- --exclude-patterns .gitignore
208
-
209
- - name: Scan test data
210
- run: |
211
- python scripts/scan_test_data.py \
212
- --redact-if-found \
213
- --report-path reports/pii_scan.json
214
-
215
- - name: Validate data pipelines
216
- run: |
217
- python scripts/validate_pii_masking.py \
218
- --pipeline-config pipelines/config.yaml
219
-
220
- - name: Generate compliance report
221
- run: python scripts/generate_compliance_report.py
222
-
223
- - name: Upload scan results
224
- uses: actions/upload-artifact@v3
225
- with:
226
- name: pii-scan-results
227
- path: reports/
228
- ```
229
-
230
- ### Data Pipeline Integration
231
- ```python
232
- from bronze_ingestion import BronzeLoader
233
- from pii_detector import PIIDetector
234
- from data_anonymizer import DataAnonymizer
235
-
236
- detector = PIIDetector()
237
- anonymizer = DataAnonymizer()
238
-
239
- def secure_data_pipeline(source_data: str, output_table: str):
240
- # Bronze: Ingest with PII detection
241
- bronze = BronzeLoader()
242
- df = bronze.ingest(source_data)
243
-
244
- # Detect PII
245
- pii_findings = detector.analyze_dataframe(df)
246
-
247
- if pii_findings:
248
- # Log for compliance
249
- log_pii_detection(
250
- dataset=output_table,
251
- pii_types=[f.type for f in pii_findings],
252
- timestamp=datetime.now()
253
- )
254
-
255
- # Silver: Mask PII
256
- df_masked = anonymizer.mask_dataframe(
257
- df,
258
- pii_columns=[f.column for f in pii_findings],
259
- strategy="hash" # Deterministic for joins
260
- )
261
-
262
- # Store both raw (encrypted) and masked
263
- bronze.save(df, f"{output_table}_raw_encrypted")
264
- bronze.save(df_masked, f"{output_table}_masked")
265
-
266
- # Alert security team
267
- if any(f.severity == "high" for f in pii_findings):
268
- send_security_alert(pii_findings)
269
- else:
270
- bronze.save(df, output_table)
271
-
272
- return pii_findings
273
- ```
274
-
275
- ## 📊 Enhanced Metrics
276
-
277
- | Metric | Target | Tool |
278
- |--------|--------|------|
279
- | **PII Detection Coverage** | 100% of datasets | Automated scanning |
280
- | **False Positive Rate** | <5% | Model tuning |
281
- | **Detection Latency** | <1min per GB | Performance monitoring |
282
- | **Masking Accuracy** | >99.9% | Validation tests |
283
- | **Compliance Audit Pass Rate** | 100% | Audit logs |
284
- | **Mean Time to Remediate** | <24 hours | Incident tracking |
285
-
286
- ## 🔄 Integration Workflow
287
-
288
- ### End-to-End PII Protection
289
- ```
290
- 1. Data Ingestion (de-01)
291
-
292
- 2. PII Detection (sa-01) → Log Finding
293
-
294
- 3. Risk Assessment (High/Medium/Low)
295
-
296
- 4. Masking/Encryption (sa-01)
297
-
298
- 5. Quality Validation (de-03)
299
-
300
- 6. Compliance Audit Log (sa-06)
301
-
302
- 7. Downstream Processing (ML, Analytics)
303
- ├── Model Training (ml-01) - PII-free
304
- ├── RAG Indexing (ai-02) - PII-free
305
- └── EDA Reports (ds-01) - Masked
306
-
307
- 8. Continuous Monitoring (do-08)
308
-
309
- 9. Cost Tracking (fo-01)
310
- ```
311
-
312
- ## 🎯 Quick Wins
313
-
314
- 1. **Integrate PII scanning in CI/CD** - Prevent PII commits
315
- 2. **Automate Bronze layer scanning** - Detect PII at ingestion
316
- 3. **Implement PII masking in Silver** - Protect downstream systems
317
- 4. **Enable LLM input filtering** - Prevent PII in prompts
318
- 5. **Set up compliance dashboards** - Real-time PII tracking
319
- 6. **Automate right-to-erasure** - GDPR compliance automation
1
+ # Skill 1: PII Detection & Data Privacy
2
+
3
+ ## 🎯 Overview
4
+ Automated PII detection, masking, and GDPR compliance tools.
5
+
6
+ ## 🔗 Connections
7
+ - **Data Engineer**: PII masking in data pipelines (de-01, de-02, de-03)
8
+ - **AI Engineer**: PII filtering before RAG indexing (ai-02, ai-03)
9
+ - **ML Engineer**: Remove PII before model training (ml-01, ml-02)
10
+ - **Data Scientist**: PII detection in analysis datasets (ds-01)
11
+ - **DevOps**: Automated PII scanning in CI/CD (do-01, do-02)
12
+ - **FinOps**: Track compliance audit costs (fo-01)
13
+ - **All Roles**: GDPR compliance and data protection
14
+
15
+ ## 🛠️ Tools Included
16
+
17
+ ### 1. `pii_detector.py`
18
+ PII detection using Microsoft Presidio and custom patterns.
19
+
20
+ ### 2. `data_anonymizer.py`
21
+ Data anonymization with multiple strategies (masking, hashing, generalization).
22
+
23
+ ### 3. `gdpr_compliance_checker.py`
24
+ GDPR compliance validation and audit trails.
25
+
26
+ ### 4. `consent_manager.py`
27
+ User consent tracking and right-to-erasure automation.
28
+
29
+ ### 5. `pii_audit_queries.sql`
30
+ SQL queries for PII inventory and audit logs.
31
+
32
+ ## 📊 PII Types Detected
33
+ - Email addresses
34
+ - Phone numbers
35
+ - Credit cards
36
+ - SSN / National IDs
37
+ - IP addresses
38
+ - Addresses
39
+ - Names
40
+ - Dates of birth
41
+
42
+ ## 🚀 Quick Start
43
+
44
+ ```python
45
+ from pii_detector import PIIDetector
46
+ from data_anonymizer import DataAnonymizer
47
+
48
+ # Detect PII
49
+ detector = PIIDetector()
50
+ pii_findings = detector.analyze_text(
51
+ "Contact John Smith at john.smith@email.com or 555-123-4567"
52
+ )
53
+
54
+ # Anonymize data
55
+ anonymizer = DataAnonymizer()
56
+ anonymized = anonymizer.mask_dataframe(
57
+ df=customer_df,
58
+ pii_columns=["email", "phone", "ssn"]
59
+ )
60
+ ```
61
+
62
+ ## 📚 Best Practices
63
+
64
+ ### Integration with Data Pipelines (Data Engineer)
65
+
66
+ 1. **Bronze Layer PII Scanning**
67
+ - Scan all raw data at ingestion
68
+ - Tag datasets containing PII
69
+ - Block high-risk PII from pipeline
70
+ - Maintain PII inventory
71
+ - Reference: Data Engineer de-01 (Lakehouse Architecture)
72
+
73
+ 2. **Silver Layer PII Masking**
74
+ - Apply masking transformations
75
+ - Implement k-anonymity for aggregations
76
+ - Track masked vs raw data lineage
77
+ - Validate masking effectiveness
78
+ - Reference: Data Engineer de-01, de-03
79
+
80
+ 3. **Gold Layer Compliance**
81
+ - Ensure no PII in analytics layers
82
+ - Implement row-level security
83
+ - Audit PII access logs
84
+ - Enable right-to-erasure automation
85
+ - Reference: Data Engineer de-01
86
+
87
+ ### AI/ML Integration
88
+
89
+ 4. **Pre-Training PII Removal**
90
+ - Scan training data before ML experiments
91
+ - Remove PII from feature engineering
92
+ - Anonymize datasets for model development
93
+ - Track data provenance for compliance
94
+ - Reference: ML Engineer ml-01, ml-02
95
+
96
+ 5. **RAG Knowledge Base Protection**
97
+ - Scan documents before embedding
98
+ - Prevent PII indexing in vector databases
99
+ - Filter PII from LLM context
100
+ - Audit knowledge base for compliance
101
+ - Reference: AI Engineer ai-02 (RAG Pipeline)
102
+
103
+ 6. **LLM Input/Output Filtering**
104
+ - Detect PII in user prompts
105
+ - Redact PII from LLM responses
106
+ - Log PII exposure incidents
107
+ - Implement real-time PII alerts
108
+ - Reference: AI Engineer ai-01, ai-04 (LLM Guardrails)
109
+
110
+ ### Automation & CI/CD (DevOps Integration)
111
+
112
+ 7. **Automated PII Scanning**
113
+ - Integrate PII detection in CI/CD pipelines
114
+ - Block commits containing PII
115
+ - Scan code, configs, and test data
116
+ - Automate compliance reports
117
+ - Reference: DevOps do-01 (CI/CD), do-02 (Testing)
118
+
119
+ 8. **Continuous Compliance Monitoring**
120
+ - Schedule regular PII scans
121
+ - Alert on new PII discoveries
122
+ - Track remediation progress
123
+ - Generate audit trails
124
+ - Reference: DevOps do-08 (Monitoring)
125
+
126
+ ### Cost Management (FinOps Integration)
127
+
128
+ 9. **Optimize PII Scanning Costs**
129
+ - Use sampling for large datasets
130
+ - Cache PII detection results
131
+ - Right-size scanning compute
132
+ - Monitor compliance operation costs
133
+ - Reference: FinOps fo-01, fo-06
134
+
135
+ ### Enterprise Governance
136
+
137
+ 10. **Data Governance Framework**
138
+ - Classify data by sensitivity level
139
+ - Implement data handling policies
140
+ - Track PII across all systems
141
+ - Enable compliance reporting
142
+ - Reference: Security Architect sa-06 (Data Governance)
143
+
144
+ 11. **GDPR Right-to-Erasure**
145
+ - Automate data deletion requests
146
+ - Track PII deletion across systems
147
+ - Verify erasure completeness
148
+ - Maintain deletion audit logs
149
+ - Reference: Security Architect sa-06
150
+
151
+ ## 💰 Cost Optimization Examples
152
+
153
+ ### Efficient PII Scanning
154
+ ```python
155
import pandas as pd

from pii_detector import PIIDetector
from finops_tracker import ComplianceCostTracker

# Shared instances used by the example below.
detector = PIIDetector()
cost_tracker = ComplianceCostTracker()


@cost_tracker.track_scan_cost
def smart_pii_scan(df: pd.DataFrame, sample_size: int = 10000):
    """Scan ``df`` for PII, sampling first when the frame is large.

    Frames with more than ``sample_size`` rows are sampled to pick candidate
    PII columns, and only those columns are then analyzed over all rows.
    Frames at or below ``sample_size`` rows are analyzed in full directly.

    Args:
        df: DataFrame to scan.
        sample_size: Row count above which the sampled path is taken; also
            the number of rows drawn for the sample.

    Returns:
        Large frames: a dict keyed by the columns returned from
        ``detector.find_pii_columns``, each mapped to the result of
        ``detector.analyze_column``. Small frames: whatever
        ``detector.analyze_dataframe`` returns.
    """
    # NOTE(review): the two paths look like they return differently shaped
    # results (dict per column vs. the analyze_dataframe result) -- confirm
    # that callers handle both.
    if len(df) <= sample_size:
        return detector.analyze_dataframe(df)

    # Sample for initial detection (cost savings). The fixed seed keeps the
    # sample reproducible between runs.
    sample_df = df.sample(n=sample_size, random_state=42)
    pii_columns = detector.find_pii_columns(sample_df)

    # Full scan only on suspected PII columns. Columns that were not flagged
    # in the sample are never analyzed on this path.
    return {col: detector.analyze_column(df[col]) for col in pii_columns}


# Cost report
report = cost_tracker.monthly_report()
print(f"PII scanning costs: ${report.total_cost:.2f}")
print(f"Datasets scanned: {report.datasets_scanned}")
181
+ ```
182
+
183
+ ## 🚀 Automated PII Protection Pipeline
184
+
185
+ ### CI/CD Integration
186
+ ```yaml
187
# .github/workflows/pii-protection.yml
name: PII Protection

on:
  push:
    paths:
      - 'data/**'
      - 'pipelines/**'
  pull_request:

jobs:
  pii-scan:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Scan code for PII patterns
        run: |
          python scripts/scan_code_for_pii.py \
            --fail-on-detection \
            --exclude-patterns .gitignore

      - name: Scan test data
        run: |
          python scripts/scan_test_data.py \
            --redact-if-found \
            --report-path reports/pii_scan.json

      - name: Validate data pipelines
        run: |
          python scripts/validate_pii_masking.py \
            --pipeline-config pipelines/config.yaml

      - name: Generate compliance report
        run: python scripts/generate_compliance_report.py

      - name: Upload scan results
        # Run even when an earlier step fails, so whatever was written to
        # reports/ is still published for the failing run.
        if: always()
        # v3 of the artifact actions is deprecated and no longer runs on
        # github.com; v4 is the supported major version.
        uses: actions/upload-artifact@v4
        with:
          name: pii-scan-results
          path: reports/
228
+ ```
229
+
230
+ ### Data Pipeline Integration
231
+ ```python
232
from datetime import datetime, timezone

from bronze_ingestion import BronzeLoader
from pii_detector import PIIDetector
from data_anonymizer import DataAnonymizer

# Shared instances used by the example below.
detector = PIIDetector()
anonymizer = DataAnonymizer()


def secure_data_pipeline(source_data: str, output_table: str):
    """Ingest ``source_data``, detect PII, and save raw and masked copies.

    When no PII is found the frame is saved as ``output_table`` unchanged.
    When PII is found the detection is logged, the flagged columns are masked
    with the ``"hash"`` strategy, and two tables are saved:
    ``{output_table}_raw_encrypted`` and ``{output_table}_masked``. A security
    alert is sent if any finding has severity ``"high"``.

    Args:
        source_data: Source passed to ``BronzeLoader.ingest``.
        output_table: Base name for the saved table(s).

    Returns:
        The findings returned by ``detector.analyze_dataframe``.
    """
    # Bronze: Ingest with PII detection
    bronze = BronzeLoader()
    df = bronze.ingest(source_data)

    # Detect PII
    pii_findings = detector.analyze_dataframe(df)

    if not pii_findings:
        bronze.save(df, output_table)
        return pii_findings

    # Log for compliance. The timestamp is timezone-aware (UTC) so audit
    # entries are unambiguous across hosts and regions.
    # NOTE(review): log_pii_detection and send_security_alert are not defined
    # or imported in this example -- presumably project helpers; confirm.
    log_pii_detection(
        dataset=output_table,
        pii_types=[f.type for f in pii_findings],
        timestamp=datetime.now(timezone.utc),
    )

    # Silver: Mask PII
    df_masked = anonymizer.mask_dataframe(
        df,
        pii_columns=[f.column for f in pii_findings],
        strategy="hash",  # Deterministic for joins
    )

    # Store both raw (encrypted) and masked
    # NOTE(review): df is saved here unchanged; no encryption call is visible
    # in this example, so "_raw_encrypted" presumably relies on the storage
    # layer encrypting at rest -- confirm before relying on the name.
    bronze.save(df, f"{output_table}_raw_encrypted")
    bronze.save(df_masked, f"{output_table}_masked")

    # Alert security team
    if any(f.severity == "high" for f in pii_findings):
        send_security_alert(pii_findings)

    return pii_findings
273
+ ```
274
+
275
+ ## 📊 Enhanced Metrics
276
+
277
+ | Metric | Target | Tool |
278
+ |--------|--------|------|
279
+ | **PII Detection Coverage** | 100% of datasets | Automated scanning |
280
+ | **False Positive Rate** | <5% | Model tuning |
281
+ | **Detection Latency** | <1 min per GB | Performance monitoring |
282
+ | **Masking Accuracy** | >99.9% | Validation tests |
283
+ | **Compliance Audit Pass Rate** | 100% | Audit logs |
284
+ | **Mean Time to Remediate** | <24 hours | Incident tracking |
285
+
286
+ ## 🔄 Integration Workflow
287
+
288
+ ### End-to-End PII Protection
289
+ ```
290
+ 1. Data Ingestion (de-01)
291
+
292
+ 2. PII Detection (sa-01) → Log Finding
293
+
294
+ 3. Risk Assessment (High/Medium/Low)
295
+
296
+ 4. Masking/Encryption (sa-01)
297
+
298
+ 5. Quality Validation (de-03)
299
+
300
+ 6. Compliance Audit Log (sa-06)
301
+
302
+ 7. Downstream Processing (ML, Analytics)
303
+ ├── Model Training (ml-01) - PII-free
304
+ ├── RAG Indexing (ai-02) - PII-free
305
+ └── EDA Reports (ds-01) - Masked
306
+
307
+ 8. Continuous Monitoring (do-08)
308
+
309
+ 9. Cost Tracking (fo-01)
310
+ ```
311
+
312
+ ## 🎯 Quick Wins
313
+
314
+ 1. **Integrate PII scanning in CI/CD** - Prevent PII commits
315
+ 2. **Automate Bronze layer scanning** - Detect PII at ingestion
316
+ 3. **Implement PII masking in Silver** - Protect downstream systems
317
+ 4. **Enable LLM input filtering** - Prevent PII in prompts
318
+ 5. **Set up compliance dashboards** - Real-time PII tracking
319
+ 6. **Automate right-to-erasure** - GDPR compliance automation