mindforge-cc 10.0.2 → 10.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.mindforge/config.json +73 -2
- package/.mindforge/engine/autonomous/cross-iteration-bridge.md +96 -0
- package/.mindforge/engine/cost-tracking/budget-enforcer.md +68 -0
- package/.mindforge/engine/cost-tracking/router.md +58 -0
- package/.mindforge/engine/cost-tracking/token-ledger.md +77 -0
- package/.mindforge/engine/council/council-protocol.md +96 -0
- package/.mindforge/engine/council/council-templates.md +85 -0
- package/.mindforge/engine/council/synthesis-engine.md +71 -0
- package/.mindforge/engine/cross-model-eval.md +74 -0
- package/.mindforge/engine/instincts/capture-engine.md +63 -0
- package/.mindforge/engine/instincts/instinct-schema.md +76 -0
- package/.mindforge/engine/instincts/promotion-engine.md +77 -0
- package/.mindforge/engine/proactive/signal-detector.md +60 -0
- package/.mindforge/engine/proactive/suggestion-engine.md +100 -0
- package/.mindforge/engine/skills/composition.md +83 -0
- package/.mindforge/engine/skills/loader.md +16 -0
- package/.mindforge/personas/agent-architect.md +57 -0
- package/.mindforge/personas/agent-evaluator.md +162 -0
- package/.mindforge/personas/agent-memory-designer.md +157 -0
- package/.mindforge/personas/agent-ops-engineer.md +120 -0
- package/.mindforge/personas/agent-orchestrator.md +112 -0
- package/.mindforge/personas/ai-economist.md +57 -0
- package/.mindforge/personas/ai-safety-engineer.md +57 -0
- package/.mindforge/personas/analytics-engineer.md +57 -0
- package/.mindforge/personas/anti-pattern-hunter.md +61 -0
- package/.mindforge/personas/api-gateway-designer.md +132 -0
- package/.mindforge/personas/auth-engineer.md +112 -0
- package/.mindforge/personas/build-engineer.md +57 -0
- package/.mindforge/personas/business-analyst.md +56 -0
- package/.mindforge/personas/cache-architect.md +100 -0
- package/.mindforge/personas/causal-scientist.md +57 -0
- package/.mindforge/personas/cdn-architect.md +118 -0
- package/.mindforge/personas/change-agent.md +104 -0
- package/.mindforge/personas/code-narrator.md +52 -0
- package/.mindforge/personas/codegen-specialist.md +68 -0
- package/.mindforge/personas/communication-architect.md +102 -0
- package/.mindforge/personas/compliance-engineer.md +96 -0
- package/.mindforge/personas/consensus-engineer.md +116 -0
- package/.mindforge/personas/contract-tester.md +60 -192
- package/.mindforge/personas/cost-optimizer.md +71 -0
- package/.mindforge/personas/council-architect.md +66 -0
- package/.mindforge/personas/council-critic.md +67 -0
- package/.mindforge/personas/council-pragmatist.md +71 -0
- package/.mindforge/personas/council-skeptic.md +73 -0
- package/.mindforge/personas/data-architect.md +108 -0
- package/.mindforge/personas/data-mesh-architect.md +57 -0
- package/.mindforge/personas/data-pipeline-architect.md +120 -0
- package/.mindforge/personas/de-sloppifier.md +60 -0
- package/.mindforge/personas/debt-manager.md +66 -0
- package/.mindforge/personas/decision-architect.md +82 -51
- package/.mindforge/personas/deployment-captain.md +74 -0
- package/.mindforge/personas/design-system-lead.md +112 -0
- package/.mindforge/personas/dmux-orchestrator.md +75 -0
- package/.mindforge/personas/doc-auditor.md +84 -0
- package/.mindforge/personas/dx-engineer.md +96 -0
- package/.mindforge/personas/ecommerce-engineer.md +57 -0
- package/.mindforge/personas/edge-engineer.md +94 -0
- package/.mindforge/personas/edtech-architect.md +106 -0
- package/.mindforge/personas/embedding-architect.md +57 -0
- package/.mindforge/personas/environment-engineer.md +57 -0
- package/.mindforge/personas/eval-judge.md +55 -0
- package/.mindforge/personas/event-architect.md +102 -0
- package/.mindforge/personas/experiment-designer.md +138 -0
- package/.mindforge/personas/feature-store-engineer.md +57 -0
- package/.mindforge/personas/finops-analyst.md +66 -0
- package/.mindforge/personas/fintech-architect.md +57 -0
- package/.mindforge/personas/flutter-engineer.md +104 -0
- package/.mindforge/personas/gaming-engineer.md +57 -0
- package/.mindforge/personas/graphql-designer.md +73 -0
- package/.mindforge/personas/healthcare-engineer.md +57 -0
- package/.mindforge/personas/hiring-strategist.md +105 -0
- package/.mindforge/personas/hitl-architect.md +165 -0
- package/.mindforge/personas/i18n-architect.md +69 -0
- package/.mindforge/personas/instinct-curator.md +83 -0
- package/.mindforge/personas/iot-architect.md +105 -0
- package/.mindforge/personas/knowledge-curator.md +139 -0
- package/.mindforge/personas/knowledge-engineer.md +57 -0
- package/.mindforge/personas/lakehouse-architect.md +57 -0
- package/.mindforge/personas/llm-orchestrator.md +57 -0
- package/.mindforge/personas/logistics-architect.md +106 -0
- package/.mindforge/personas/market-analyst.md +53 -0
- package/.mindforge/personas/marketplace-engineer.md +105 -0
- package/.mindforge/personas/mcp-designer.md +54 -0
- package/.mindforge/personas/meeting-designer.md +104 -0
- package/.mindforge/personas/mentorship-lead.md +106 -0
- package/.mindforge/personas/migration-architect.md +57 -0
- package/.mindforge/personas/ml-ops-engineer.md +101 -0
- package/.mindforge/personas/mobile-architect.md +105 -0
- package/.mindforge/personas/mobile-security-engineer.md +106 -0
- package/.mindforge/personas/multi-model-bridge.md +86 -0
- package/.mindforge/personas/multi-tenancy-architect.md +71 -0
- package/.mindforge/personas/multimodal-engineer.md +57 -0
- package/.mindforge/personas/offline-specialist.md +105 -0
- package/.mindforge/personas/onboarding-navigator.md +63 -0
- package/.mindforge/personas/payments-engineer.md +135 -0
- package/.mindforge/personas/pipeline-engineer.md +115 -0
- package/.mindforge/personas/platform-engineer.md +97 -0
- package/.mindforge/personas/platform-lead.md +57 -0
- package/.mindforge/personas/privacy-engineer.md +57 -0
- package/.mindforge/personas/product-owner.md +56 -0
- package/.mindforge/personas/productivity-analyst.md +57 -0
- package/.mindforge/personas/prompt-architect.md +101 -0
- package/.mindforge/personas/proofreader.md +53 -0
- package/.mindforge/personas/pwa-architect.md +105 -0
- package/.mindforge/personas/quality-scorer.md +63 -0
- package/.mindforge/personas/react-native-engineer.md +106 -0
- package/.mindforge/personas/resilience-engineer.md +69 -0
- package/.mindforge/personas/rfc-architect.md +64 -0
- package/.mindforge/personas/saga-orchestrator.md +80 -0
- package/.mindforge/personas/secrets-engineer.md +57 -0
- package/.mindforge/personas/skill-smith.md +79 -0
- package/.mindforge/personas/sre-lead.md +107 -0
- package/.mindforge/personas/stream-engineer.md +57 -0
- package/.mindforge/personas/streaming-engineer.md +64 -0
- package/.mindforge/personas/swarm-templates.json +695 -38
- package/.mindforge/personas/system-designer.md +57 -0
- package/.mindforge/personas/team-coach.md +120 -0
- package/.mindforge/personas/tech-lead-coach.md +103 -0
- package/.mindforge/personas/technical-writer-lead.md +111 -0
- package/.mindforge/personas/threat-modeler.md +82 -0
- package/.mindforge/personas/vibe-checker.md +75 -0
- package/.mindforge/personas/worktree-manager.md +56 -0
- package/.mindforge/personas/zero-trust-engineer.md +113 -0
- package/.mindforge/skills/a11y-testing/SKILL.md +143 -0
- package/.mindforge/skills/agent-evaluation-framework/SKILL.md +227 -0
- package/.mindforge/skills/agent-introspection-debugging/SKILL.md +88 -0
- package/.mindforge/skills/agent-loops/SKILL.md +84 -0
- package/.mindforge/skills/agent-memory-design/SKILL.md +199 -0
- package/.mindforge/skills/agent-orchestration-patterns/SKILL.md +129 -0
- package/.mindforge/skills/agent-tool-selection/SKILL.md +204 -0
- package/.mindforge/skills/ai-agent-deployment/SKILL.md +176 -0
- package/.mindforge/skills/ai-cost-management/SKILL.md +57 -0
- package/.mindforge/skills/ai-safety-alignment/SKILL.md +53 -0
- package/.mindforge/skills/analytics-instrumentation/SKILL.md +172 -0
- package/.mindforge/skills/api-gateway-patterns/SKILL.md +177 -0
- package/.mindforge/skills/api-marketplace/SKILL.md +56 -0
- package/.mindforge/skills/api-versioning/SKILL.md +100 -0
- package/.mindforge/skills/app-store-deployment/SKILL.md +44 -0
- package/.mindforge/skills/architecture-tradeoff-analysis/SKILL.md +97 -0
- package/.mindforge/skills/audit-logging/SKILL.md +140 -0
- package/.mindforge/skills/auth-patterns/SKILL.md +148 -0
- package/.mindforge/skills/autonomous-agent-harness/SKILL.md +218 -0
- package/.mindforge/skills/autonomous-agents/SKILL.md +59 -0
- package/.mindforge/skills/autonomous-loops/SKILL.md +105 -0
- package/.mindforge/skills/build-system-optimization/SKILL.md +54 -0
- package/.mindforge/skills/build-vs-buy/SKILL.md +80 -0
- package/.mindforge/skills/bundle-optimization/SKILL.md +174 -0
- package/.mindforge/skills/business-analyst/SKILL.md +82 -0
- package/.mindforge/skills/caching-strategies/SKILL.md +132 -0
- package/.mindforge/skills/capacity-planning/SKILL.md +96 -0
- package/.mindforge/skills/causal-inference/SKILL.md +42 -0
- package/.mindforge/skills/cdn-optimization/SKILL.md +212 -0
- package/.mindforge/skills/change-management/SKILL.md +106 -0
- package/.mindforge/skills/chaos-engineering/SKILL.md +99 -0
- package/.mindforge/skills/ci-cd-pipeline/SKILL.md +118 -0
- package/.mindforge/skills/cli-design/SKILL.md +118 -0
- package/.mindforge/skills/code-generation-patterns/SKILL.md +92 -0
- package/.mindforge/skills/code-review-methodology/SKILL.md +180 -0
- package/.mindforge/skills/code-tour/SKILL.md +145 -0
- package/.mindforge/skills/codebase-onboarding/SKILL.md +95 -0
- package/.mindforge/skills/compliance-as-code/SKILL.md +195 -0
- package/.mindforge/skills/conflict-resolution/SKILL.md +87 -0
- package/.mindforge/skills/connection-pooling/SKILL.md +151 -0
- package/.mindforge/skills/container-security/SKILL.md +151 -0
- package/.mindforge/skills/context-engineering/SKILL.md +114 -0
- package/.mindforge/skills/continuous-learning/SKILL.md +84 -0
- package/.mindforge/skills/contract-testing/SKILL.md +85 -0
- package/.mindforge/skills/cost-aware-routing/SKILL.md +83 -0
- package/.mindforge/skills/cost-estimation/SKILL.md +82 -0
- package/.mindforge/skills/council/SKILL.md +68 -0
- package/.mindforge/skills/cqrs-event-sourcing/SKILL.md +95 -0
- package/.mindforge/skills/cross-platform-testing/SKILL.md +43 -0
- package/.mindforge/skills/data-governance/SKILL.md +42 -0
- package/.mindforge/skills/data-lakehouse/SKILL.md +42 -0
- package/.mindforge/skills/data-mesh/SKILL.md +42 -0
- package/.mindforge/skills/data-modeling/SKILL.md +107 -0
- package/.mindforge/skills/data-pipeline-design/SKILL.md +171 -0
- package/.mindforge/skills/data-privacy-engineering/SKILL.md +42 -0
- package/.mindforge/skills/database-performance/SKILL.md +174 -0
- package/.mindforge/skills/database-sharding-advanced/SKILL.md +206 -0
- package/.mindforge/skills/de-sloppify/SKILL.md +120 -0
- package/.mindforge/skills/defense-in-depth/SKILL.md +84 -0
- package/.mindforge/skills/delegation-patterns/SKILL.md +123 -0
- package/.mindforge/skills/dependency-management/SKILL.md +94 -0
- package/.mindforge/skills/deployment-workflow/SKILL.md +135 -0
- package/.mindforge/skills/design-system/SKILL.md +113 -0
- package/.mindforge/skills/developer-onboarding/SKILL.md +99 -0
- package/.mindforge/skills/developer-productivity-metrics/SKILL.md +59 -0
- package/.mindforge/skills/distributed-consensus/SKILL.md +141 -0
- package/.mindforge/skills/dmux-workflows/SKILL.md +141 -0
- package/.mindforge/skills/dns-architecture/SKILL.md +167 -0
- package/.mindforge/skills/doc-health-audit/SKILL.md +102 -0
- package/.mindforge/skills/ecommerce-architecture/SKILL.md +41 -0
- package/.mindforge/skills/edge-computing/SKILL.md +91 -0
- package/.mindforge/skills/edtech-platform/SKILL.md +41 -0
- package/.mindforge/skills/email-deliverability/SKILL.md +177 -0
- package/.mindforge/skills/embedding-systems/SKILL.md +55 -0
- package/.mindforge/skills/environment-management/SKILL.md +54 -0
- package/.mindforge/skills/error-handling-architecture/SKILL.md +118 -0
- package/.mindforge/skills/estimation-techniques/SKILL.md +113 -0
- package/.mindforge/skills/eval-harness/SKILL.md +180 -0
- package/.mindforge/skills/event-driven-architecture/SKILL.md +162 -0
- package/.mindforge/skills/experiment-design/SKILL.md +139 -0
- package/.mindforge/skills/experiment-platform/SKILL.md +43 -0
- package/.mindforge/skills/feature-engineering/SKILL.md +42 -0
- package/.mindforge/skills/feature-flag-management/SKILL.md +183 -0
- package/.mindforge/skills/fine-tuning-workflow/SKILL.md +189 -0
- package/.mindforge/skills/fintech-patterns/SKILL.md +41 -0
- package/.mindforge/skills/flutter-architecture/SKILL.md +42 -0
- package/.mindforge/skills/gaming-backend/SKILL.md +41 -0
- package/.mindforge/skills/git-workflow-design/SKILL.md +129 -0
- package/.mindforge/skills/graceful-degradation/SKILL.md +95 -0
- package/.mindforge/skills/graphql-patterns/SKILL.md +243 -0
- package/.mindforge/skills/guardrails-and-safety/SKILL.md +137 -0
- package/.mindforge/skills/healthcare-systems/SKILL.md +40 -0
- package/.mindforge/skills/hiring-engineering/SKILL.md +119 -0
- package/.mindforge/skills/human-in-the-loop-design/SKILL.md +234 -0
- package/.mindforge/skills/i18n-architecture/SKILL.md +147 -0
- package/.mindforge/skills/idempotency-patterns/SKILL.md +84 -0
- package/.mindforge/skills/incident-communication/SKILL.md +96 -0
- package/.mindforge/skills/incident-management/SKILL.md +97 -0
- package/.mindforge/skills/infrastructure-as-code/SKILL.md +98 -0
- package/.mindforge/skills/instinct-clustering/SKILL.md +190 -0
- package/.mindforge/skills/internal-developer-platform/SKILL.md +51 -0
- package/.mindforge/skills/iot-platform/SKILL.md +41 -0
- package/.mindforge/skills/k8s-deployment/SKILL.md +358 -0
- package/.mindforge/skills/knowledge-graphs/SKILL.md +56 -0
- package/.mindforge/skills/knowledge-sharing-systems/SKILL.md +112 -0
- package/.mindforge/skills/llm-cost-optimization/SKILL.md +198 -0
- package/.mindforge/skills/llm-orchestration/SKILL.md +56 -0
- package/.mindforge/skills/load-testing/SKILL.md +84 -0
- package/.mindforge/skills/logistics-optimization/SKILL.md +40 -0
- package/.mindforge/skills/market-researcher/SKILL.md +99 -0
- package/.mindforge/skills/marketplace-trust/SKILL.md +40 -0
- package/.mindforge/skills/mcp-server-patterns/SKILL.md +264 -0
- package/.mindforge/skills/media-streaming/SKILL.md +41 -0
- package/.mindforge/skills/meeting-architecture/SKILL.md +146 -0
- package/.mindforge/skills/mentoring-patterns/SKILL.md +77 -0
- package/.mindforge/skills/microservices-patterns/SKILL.md +83 -0
- package/.mindforge/skills/migration-platform/SKILL.md +61 -0
- package/.mindforge/skills/migration-strategies/SKILL.md +129 -0
- package/.mindforge/skills/ml-feature-store/SKILL.md +56 -0
- package/.mindforge/skills/ml-monitoring/SKILL.md +42 -0
- package/.mindforge/skills/mobile-performance/SKILL.md +44 -0
- package/.mindforge/skills/mobile-security/SKILL.md +45 -0
- package/.mindforge/skills/model-evaluation/SKILL.md +53 -0
- package/.mindforge/skills/monorepo-management/SKILL.md +100 -0
- package/.mindforge/skills/multi-llm-consult/SKILL.md +75 -0
- package/.mindforge/skills/multi-tenancy-patterns/SKILL.md +145 -0
- package/.mindforge/skills/multi-turn-conversation-design/SKILL.md +206 -0
- package/.mindforge/skills/multimodal-ai/SKILL.md +51 -0
- package/.mindforge/skills/mutation-testing/SKILL.md +97 -0
- package/.mindforge/skills/notification-system-design/SKILL.md +168 -0
- package/.mindforge/skills/observability-stack/SKILL.md +136 -0
- package/.mindforge/skills/offline-first-design/SKILL.md +43 -0
- package/.mindforge/skills/on-call-design/SKILL.md +111 -0
- package/.mindforge/skills/pagination-patterns/SKILL.md +230 -0
- package/.mindforge/skills/payment-integration/SKILL.md +176 -0
- package/.mindforge/skills/performance-reviews/SKILL.md +140 -0
- package/.mindforge/skills/platform-observability/SKILL.md +58 -0
- package/.mindforge/skills/platform-reliability/SKILL.md +52 -0
- package/.mindforge/skills/post-incident-learning/SKILL.md +96 -0
- package/.mindforge/skills/product-manager/SKILL.md +104 -0
- package/.mindforge/skills/progressive-web-app/SKILL.md +44 -0
- package/.mindforge/skills/prompt-engineering/SKILL.md +94 -0
- package/.mindforge/skills/proofreader/SKILL.md +158 -0
- package/.mindforge/skills/push-notification-architecture/SKILL.md +45 -0
- package/.mindforge/skills/python-performance/SKILL.md +183 -0
- package/.mindforge/skills/quality-audit/SKILL.md +171 -0
- package/.mindforge/skills/queue-design/SKILL.md +85 -0
- package/.mindforge/skills/rag-architecture/SKILL.md +176 -0
- package/.mindforge/skills/rate-limiting-design/SKILL.md +94 -0
- package/.mindforge/skills/react-native-patterns/SKILL.md +42 -0
- package/.mindforge/skills/react-performance/SKILL.md +229 -0
- package/.mindforge/skills/real-time-analytics/SKILL.md +42 -0
- package/.mindforge/skills/real-time-sync/SKILL.md +83 -0
- package/.mindforge/skills/responsive-native/SKILL.md +44 -0
- package/.mindforge/skills/responsive-patterns/SKILL.md +141 -0
- package/.mindforge/skills/rfc-pipeline/SKILL.md +114 -0
- package/.mindforge/skills/saas-multi-tenant/SKILL.md +41 -0
- package/.mindforge/skills/santa-method/SKILL.md +134 -0
- package/.mindforge/skills/search-implementation/SKILL.md +98 -0
- package/.mindforge/skills/secrets-platform/SKILL.md +56 -0
- package/.mindforge/skills/secrets-rotation/SKILL.md +173 -0
- package/.mindforge/skills/self-serve-infrastructure/SKILL.md +51 -0
- package/.mindforge/skills/serverless-patterns/SKILL.md +119 -0
- package/.mindforge/skills/skill-creator-meta/SKILL.md +146 -0
- package/.mindforge/skills/sprint-retrospective-facilitation/SKILL.md +112 -0
- package/.mindforge/skills/stakeholder-communication/SKILL.md +85 -0
- package/.mindforge/skills/state-management/SKILL.md +104 -0
- package/.mindforge/skills/stream-processing/SKILL.md +43 -0
- package/.mindforge/skills/streaming-architecture/SKILL.md +81 -0
- package/.mindforge/skills/supply-chain-security/SKILL.md +145 -0
- package/.mindforge/skills/synthetic-data-generation/SKILL.md +52 -0
- package/.mindforge/skills/system-design/SKILL.md +88 -0
- package/.mindforge/skills/team-topology-design/SKILL.md +107 -0
- package/.mindforge/skills/technical-debt-management/SKILL.md +86 -0
- package/.mindforge/skills/technical-interview-design/SKILL.md +98 -0
- package/.mindforge/skills/technical-leadership/SKILL.md +75 -0
- package/.mindforge/skills/technical-writing/SKILL.md +237 -0
- package/.mindforge/skills/technology-radar/SKILL.md +88 -0
- package/.mindforge/skills/testing-anti-patterns/SKILL.md +288 -0
- package/.mindforge/skills/threat-modeling/SKILL.md +109 -0
- package/.mindforge/skills/tool-design/SKILL.md +138 -0
- package/.mindforge/skills/typescript-advanced/SKILL.md +198 -0
- package/.mindforge/skills/using-git-worktrees/SKILL.md +139 -0
- package/.mindforge/skills/verification-loop/SKILL.md +97 -0
- package/.mindforge/skills/vibe-security/SKILL.md +165 -0
- package/.mindforge/skills/visual-regression-testing/SKILL.md +97 -0
- package/.mindforge/skills/websocket-patterns/SKILL.md +203 -0
- package/.mindforge/skills/writing-plans/SKILL.md +170 -0
- package/.mindforge/skills/writing-skills/SKILL.md +216 -0
- package/.mindforge/skills/zero-trust-architecture/SKILL.md +166 -0
- package/CHANGELOG.md +195 -0
- package/MINDFORGE.md +4 -4
- package/README.md +2 -2
- package/RELEASENOTES.md +66 -0
- package/bin/installer-core.js +1 -1
- package/bin/wizard/theme.js +2 -2
- package/docs/commands-reference.md +18 -1
- package/package.json +2 -2
- package/.mindforge/personas/data-privacy-engineer.md +0 -187
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: data-governance
|
|
3
|
+
version: 1.0.0
|
|
4
|
+
min_mindforge_version: 10.6.0
|
|
5
|
+
status: stable
|
|
6
|
+
triggers: data governance framework, data catalog implementation, data lineage tracking, data access control, data quality framework, data stewardship, metadata management, data classification, data retention policy, data discovery platform, data ownership, data compliance framework
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Skill — Data Governance
|
|
10
|
+
|
|
11
|
+
## When this skill activates
|
|
12
|
+
This skill activates when implementing data catalog systems, establishing data ownership models, building lineage tracking, or designing access control frameworks. Use when organizations need to scale data democratization while maintaining compliance and quality.
|
|
13
|
+
|
|
14
|
+
## Mandatory actions when this skill is active
|
|
15
|
+
|
|
16
|
+
### Before writing any code
|
|
17
|
+
1. Define data classification taxonomy with clear criteria: public, internal, confidential, and restricted, with handling requirements for each tier
|
|
18
|
+
2. Establish data ownership model: domain owners, data stewards, technical custodians with RACI matrix for responsibilities
|
|
19
|
+
3. Document compliance requirements: GDPR, CCPA, HIPAA, SOC2 with specific technical controls needed for each regulation
|
|
20
|
+
4. Design metadata schema capturing: business definitions, technical specifications, quality metrics, lineage, and access policies
|
|
21
|
+
|
|
22
|
+
### During implementation
|
|
23
|
+
- Build automated data catalog discovery scanning databases, data lakes, APIs, and file systems to populate metadata repository
|
|
24
|
+
- Implement column-level lineage tracking from source systems through transformations to final consumption with impact analysis capabilities
|
|
25
|
+
- Create role-based access control (RBAC) with attribute-based policies (ABAC) for dynamic access based on data classification and user context
|
|
26
|
+
- Establish data quality framework with profiling rules, validation checks, and quality scores at dataset and column level
|
|
27
|
+
- Implement data retention policies with automated archival and deletion workflows based on regulatory requirements and business rules
|
|
28
|
+
- Build data stewardship workflows for metadata enrichment: business glossary terms, data ownership assignment, quality issue resolution
|
|
29
|
+
- Create audit logging for all data access, modifications, and policy changes with immutable trail for compliance reporting
|
|
30
|
+
|
|
31
|
+
### After implementation
|
|
32
|
+
- Deploy self-serve data discovery portal with search, business glossary, quality indicators, and access request workflows
|
|
33
|
+
- Generate automated data quality reports with trend analysis, anomaly detection, and stakeholder-specific dashboards
|
|
34
|
+
- Create compliance audit packages with evidence of controls: access logs, retention proof, encryption verification, lineage documentation
|
|
35
|
+
- Build data governance metrics dashboard: catalog coverage, metadata completeness, quality score trends, access request SLA
|
|
36
|
+
|
|
37
|
+
## Self-check before task completion
|
|
38
|
+
- [ ] Data catalog covers 90%+ of production data assets with accurate business metadata
|
|
39
|
+
- [ ] Lineage tracking provides end-to-end visibility from source to consumption with transformation logic
|
|
40
|
+
- [ ] Access control policies enforced at query time with separation of duties for sensitive data
|
|
41
|
+
- [ ] Data quality framework monitors critical datasets with automated alerting on quality degradation
|
|
42
|
+
- [ ] Compliance documentation generated automatically with evidence trails for audit requirements
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: data-lakehouse
|
|
3
|
+
version: 1.0.0
|
|
4
|
+
min_mindforge_version: 10.6.0
|
|
5
|
+
status: stable
|
|
6
|
+
triggers: data lakehouse architecture, medallion architecture, schema evolution lakehouse, time travel data, partition optimization, Delta Lake implementation, Iceberg table design, lakehouse query performance, data lakehouse governance, lakehouse ingestion, lakehouse serving layer, lakehouse cost optimization
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Skill — Data Lakehouse
|
|
10
|
+
|
|
11
|
+
## When this skill activates
|
|
12
|
+
This skill activates when implementing lakehouse architectures combining data lake flexibility with data warehouse performance. Use when building Delta Lake, Iceberg, or Hudi tables with ACID guarantees, schema evolution, and time travel capabilities.
|
|
13
|
+
|
|
14
|
+
## Mandatory actions when this skill is active
|
|
15
|
+
|
|
16
|
+
### Before writing any code
|
|
17
|
+
1. Design medallion architecture layers: bronze (raw ingestion), silver (cleansed/conformed), gold (business-level aggregates) with clear promotion criteria
|
|
18
|
+
2. Select table format (Delta, Iceberg, Hudi) based on requirements: write patterns, query patterns, ecosystem compatibility, and feature needs
|
|
19
|
+
3. Plan partitioning strategy based on query patterns: typically date/time for time-series data, geography for location-based data, or composite keys, while avoiding over-partitioning (<1000 partitions)
|
|
20
|
+
4. Define schema evolution policy: additive changes (safe), nullable to required (breaking), type changes (migration required) with versioning strategy
|
|
21
|
+
|
|
22
|
+
### During implementation
|
|
23
|
+
- Implement ACID transactions for atomic writes: use table format's transaction log (Delta Log, Iceberg metadata) to ensure consistency
|
|
24
|
+
- Configure file sizing for optimal query performance: target 128MB-1GB per file, run regular OPTIMIZE/COMPACT operations to prevent small files
|
|
25
|
+
- Enable time travel with retention policy: maintain snapshots for point-in-time queries and audit, configure vacuum/expire based on compliance needs
|
|
26
|
+
- Design incremental processing patterns: merge/upsert operations for CDC, append for event streams, overwrite partitions for batch updates
|
|
27
|
+
- Implement Z-ordering or clustering on frequently filtered columns (non-partition keys) to improve query performance via data skipping
|
|
28
|
+
- Build schema evolution handlers: automatic schema merging for new columns, validation for breaking changes, schema registry integration
|
|
29
|
+
- Create data quality checkpoints between medallion layers: row counts, null checks, referential integrity, business rule validation with quarantine tables
|
|
30
|
+
|
|
31
|
+
### After implementation
|
|
32
|
+
- Monitor table health metrics: file count, average file size, partition count, metadata size, and compaction needs
|
|
33
|
+
- Build cost optimization reports: storage by layer, compute for jobs, query costs, and opportunities for partition pruning or materialization
|
|
34
|
+
- Create governance controls: table-level access policies, column masking, row filtering, and audit logging for sensitive data access
|
|
35
|
+
- Generate performance analysis: query patterns, partition pruning effectiveness, file skipping statistics, and optimization recommendations
|
|
36
|
+
|
|
37
|
+
## Self-check before task completion
|
|
38
|
+
- [ ] Medallion layers clearly defined with data quality gates between bronze → silver → gold promotions
|
|
39
|
+
- [ ] Partitioning strategy optimized for query patterns with validation that queries prune partitions effectively
|
|
40
|
+
- [ ] ACID transactions tested with concurrent writes and failure scenarios to ensure consistency
|
|
41
|
+
- [ ] Schema evolution tested with backward and forward compatibility for common evolution scenarios
|
|
42
|
+
- [ ] File management strategy (OPTIMIZE/VACUUM) scheduled with monitoring for small file accumulation
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: data-mesh
|
|
3
|
+
version: 1.0.0
|
|
4
|
+
min_mindforge_version: 10.6.0
|
|
5
|
+
status: stable
|
|
6
|
+
triggers: data mesh architecture, domain data ownership, data product design, federated data governance, self-serve data platform, data mesh implementation, data contract mesh, data domain boundary, mesh interoperability, decentralized data ownership, data product specification, domain-driven data
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Skill — Data Mesh
|
|
10
|
+
|
|
11
|
+
## When this skill activates
|
|
12
|
+
This skill activates when implementing data mesh architectures with domain-oriented ownership, federated governance, and data-as-a-product principles. Use when centralized data management becomes a bottleneck and domains need autonomy with interoperability.
|
|
13
|
+
|
|
14
|
+
## Mandatory actions when this skill is active
|
|
15
|
+
|
|
16
|
+
### Before writing any code
|
|
17
|
+
1. Define domain boundaries using domain-driven design: bounded contexts, ubiquitous language, core domains vs supporting domains with ownership mapping
|
|
18
|
+
2. Establish data product specification template: SLAs, schemas, semantics, access controls, versioning, and quality guarantees each domain must provide
|
|
19
|
+
3. Design federated computational governance: global standards (discovery, security, interoperability) enforced through platform, local decisions (tech stack, modeling) owned by domains
|
|
20
|
+
4. Create self-serve data platform capabilities: provisioning automation, observability tools, discovery services, and development environments domains can use independently
|
|
21
|
+
|
|
22
|
+
### During implementation
|
|
23
|
+
- Build data product registry with standardized metadata: domain owner, SLAs, schemas, sample data, access request process, and consumer feedback
|
|
24
|
+
- Implement data contracts between domains: schema definitions, backward compatibility guarantees, deprecation policies, and breaking change notifications
|
|
25
|
+
- Create domain-agnostic platform services: infrastructure provisioning (IaC templates), CI/CD pipelines, monitoring dashboards, and cost allocation
|
|
26
|
+
- Design consistent data product APIs: REST for batch, streaming for real-time, query engines for analytical workloads, with versioning and deprecation paths
|
|
27
|
+
- Establish quality frameworks domains must implement: data validation, profiling, lineage tracking, incident response with federated monitoring
|
|
28
|
+
- Build interoperability layer: common data types, standard formats (Parquet, Avro), semantic layer, and cross-domain joins through data products, not direct access
|
|
29
|
+
- Implement federated identity and access: domain-owned authorization, centralized authentication, audit logging, and privacy controls enforced at platform level
|
|
30
|
+
|
|
31
|
+
### After implementation
|
|
32
|
+
- Create data product marketplace: searchable catalog, quality scores, usage analytics, consumer reviews, and onboarding documentation
|
|
33
|
+
- Build platform health metrics: provisioning time, incident resolution SLA, platform uptime, and developer satisfaction scores
|
|
34
|
+
- Generate federated governance reports: compliance by domain, quality trends, cross-domain dependencies, and policy violations
|
|
35
|
+
- Document domain interaction patterns: producer-consumer relationships, data sharing agreements, and conflict resolution processes
|
|
36
|
+
|
|
37
|
+
## Self-check before task completion
|
|
38
|
+
- [ ] Domain boundaries clearly defined with ownership assignments and RACI matrix for responsibilities
|
|
39
|
+
- [ ] Data products meet platform standards for discoverability, access control, quality, and SLAs
|
|
40
|
+
- [ ] Self-serve platform enables domains to provision, deploy, and monitor data products independently
|
|
41
|
+
- [ ] Federated governance enforces global standards while allowing domain autonomy in implementation
|
|
42
|
+
- [ ] Interoperability tested across domains through data product contracts and APIs, not direct database access
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: data-modeling
|
|
3
|
+
version: 1.0.0
|
|
4
|
+
min_mindforge_version: 10.0.7
|
|
5
|
+
status: stable
|
|
6
|
+
triggers: data modeling, dimensional model design, star schema design, snowflake schema design, normalization decision, schema evolution strategy, data contract definition, slowly changing dimension, entity relationship design, data warehouse modeling, schema lifecycle, data lineage mapping
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Data Modeling
|
|
10
|
+
|
|
11
|
+
## When this skill activates
|
|
12
|
+
|
|
13
|
+
This skill activates when the user is designing, implementing, or evolving data models.
|
|
14
|
+
This includes entity-relationship design, dimensional modeling (star/snowflake schemas),
|
|
15
|
+
normalization decisions, slowly changing dimension strategies, schema evolution planning,
|
|
16
|
+
data contract definitions between producers and consumers, and data lineage mapping
|
|
17
|
+
across transformation pipelines.
|
|
18
|
+
|
|
19
|
+
## Mandatory actions
|
|
20
|
+
|
|
21
|
+
### Before
|
|
22
|
+
|
|
23
|
+
1. Identify the workload type: OLTP (transactional) vs OLAP (analytical) vs hybrid.
|
|
24
|
+
2. Determine the primary consumers of the data (applications, analysts, ML pipelines).
|
|
25
|
+
3. Assess data volume, velocity, and variety characteristics.
|
|
26
|
+
4. Review existing schemas and their evolution history.
|
|
27
|
+
5. Identify upstream data sources and downstream consumers (lineage context).
|
|
28
|
+
|
|
29
|
+
### During
|
|
30
|
+
|
|
31
|
+
**Modeling Phases (Conceptual to Logical to Physical):**
|
|
32
|
+
- **Conceptual:** Business entities and relationships, no implementation details. Stakeholder-readable.
|
|
33
|
+
- **Logical:** Attributes, data types, keys, constraints. Technology-agnostic.
|
|
34
|
+
- **Physical:** Indexes, partitions, storage engines, materialized views. Technology-specific.
|
|
35
|
+
- Always start conceptual, refine to logical, then optimize physical. Never skip phases.
|
|
36
|
+
|
|
37
|
+
**Normalization (OLTP):**
|
|
38
|
+
- **1NF:** Eliminate repeating groups; atomic values in every column.
|
|
39
|
+
- **2NF:** Remove partial dependencies (all non-key columns depend on the full primary key).
|
|
40
|
+
- **3NF:** Remove transitive dependencies (non-key columns depend only on the key).
|
|
41
|
+
- **BCNF:** Every determinant is a candidate key.
|
|
42
|
+
- Normalize for OLTP (reduces anomalies, ensures consistency).
|
|
43
|
+
- Denormalize for OLAP (reduces joins, improves query performance).
|
|
44
|
+
- Document every denormalization decision with rationale.
|
|
45
|
+
|
|
46
|
+
**Star Schema (Dimensional Modeling):**
|
|
47
|
+
- **Fact tables:** Measurable events (transactions, clicks, shipments). Contain foreign keys + metrics.
|
|
48
|
+
- **Dimension tables:** Descriptive context (who, what, where, when, how).
|
|
49
|
+
- **Grain definition:** What exactly one row of the fact table represents (e.g., one row per order line item). Prefer the most atomic grain available. Define grain FIRST.
|
|
50
|
+
- Prefer conformed dimensions (shared across fact tables) for consistency.
|
|
51
|
+
- Junk dimensions: combine low-cardinality flags into a single dimension.
|
|
52
|
+
|
|
53
|
+
**Snowflake Schema:**
|
|
54
|
+
- Use when dimensions have natural sub-hierarchies (geography: country → state → city).
|
|
55
|
+
- Normalizes dimension tables to reduce redundancy.
|
|
56
|
+
- Trade-off: more joins but less storage and clearer hierarchy.
|
|
57
|
+
- Prefer star schema unless dimension table size or update frequency justifies snowflaking.
|
|
58
|
+
|
|
59
|
+
**Slowly Changing Dimensions (SCD):**
|
|
60
|
+
- **Type 0:** Fixed, never changes (date of birth).
|
|
61
|
+
- **Type 1:** Overwrite old value. No history preserved. Use for corrections.
|
|
62
|
+
- **Type 2:** Add new row with version tracking (start_date, end_date, is_current). Full history.
|
|
63
|
+
- **Type 3:** Add "previous" column alongside current. Limited history (one prior value).
|
|
64
|
+
- **Type 6 (Hybrid):** Combines Types 1, 2, and 3 for maximum flexibility.
|
|
65
|
+
- Default to Type 2 unless storage or query complexity is a concern.
|
|
66
|
+
|
|
67
|
+
**Data Contracts:**
|
|
68
|
+
- Agreement between data producer and consumer on schema + semantics + SLA.
|
|
69
|
+
- Schema: field names, types, nullability, constraints.
|
|
70
|
+
- Semantics: business meaning of each field (not just technical definition).
|
|
71
|
+
- SLA: freshness guarantee, completeness threshold, availability window.
|
|
72
|
+
- Enforce contracts via schema validation in pipelines (Great Expectations, dbt tests).
|
|
73
|
+
- Breaking contract changes require notification and migration period.
|
|
74
|
+
|
|
75
|
+
**Schema Evolution:**
|
|
76
|
+
- **Additive (safe):** New optional columns, new tables, new indexes.
|
|
77
|
+
- **Breaking (dangerous):** Column removal, type changes, renaming, adding NOT NULL without default.
|
|
78
|
+
- Use migration scripts (Flyway, Alembic, Liquibase) for all schema changes.
|
|
79
|
+
- Version schemas and maintain a changelog.
|
|
80
|
+
- Test migrations against production-like data volumes before deploying.
|
|
81
|
+
|
|
82
|
+
**Data Lineage:**
|
|
83
|
+
- Track data from source → transformation → consumption.
|
|
84
|
+
- Document at column-level granularity for critical fields.
|
|
85
|
+
- Use tools (dbt lineage graph, Apache Atlas, DataHub) for automated discovery.
|
|
86
|
+
- Lineage enables impact analysis (what breaks if this source changes?).
|
|
87
|
+
- Required for regulatory compliance (GDPR: where does PII flow?).
|
|
88
|
+
|
|
89
|
+
### After
|
|
90
|
+
|
|
91
|
+
1. Verify grain is explicitly defined for every fact table.
|
|
92
|
+
2. Confirm normalization level matches workload type (OLTP normalized, OLAP denormalized).
|
|
93
|
+
3. Validate SCD strategy is documented for every dimension with mutable attributes.
|
|
94
|
+
4. Ensure data contracts exist between all critical producer-consumer pairs.
|
|
95
|
+
5. Check that schema evolution follows additive-first principles.
|
|
96
|
+
6. Verify lineage is documented for compliance-sensitive data flows.
|
|
97
|
+
|
|
98
|
+
## Self-check before task completion
|
|
99
|
+
|
|
100
|
+
- [ ] Modeling followed the conceptual → logical → physical progression.
|
|
101
|
+
- [ ] Normalization level is appropriate for the workload (OLTP vs OLAP).
|
|
102
|
+
- [ ] Fact table grain is explicitly defined and documented.
|
|
103
|
+
- [ ] SCD types are chosen and justified for mutable dimensions.
|
|
104
|
+
- [ ] Data contracts define schema, semantics, and SLA for critical interfaces.
|
|
105
|
+
- [ ] Schema evolution strategy avoids breaking changes without migration.
|
|
106
|
+
- [ ] Data lineage is mapped for compliance-sensitive and business-critical paths.
|
|
107
|
+
- [ ] Physical optimizations (indexes, partitions) are justified by query patterns.
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: data-pipeline-design
|
|
3
|
+
version: 1.0.0
|
|
4
|
+
min_mindforge_version: 10.1.1
|
|
5
|
+
status: stable
|
|
6
|
+
triggers: data pipeline design, ETL pipeline, ELT pattern, batch vs streaming pipeline, exactly-once processing pipeline, schema registry pipeline, data quality gate, pipeline orchestration, data ingestion, pipeline backfill, pipeline monitoring, data freshness
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Skill — Data Pipeline Design
|
|
10
|
+
|
|
11
|
+
## When this skill activates
|
|
12
|
+
Any task involving designing data ingestion, transformation, or delivery pipelines.
|
|
13
|
+
Includes ETL/ELT architecture, batch vs streaming decisions, schema management,
|
|
14
|
+
data quality enforcement, and pipeline orchestration.
|
|
15
|
+
|
|
16
|
+
## Mandatory actions when this skill is active
|
|
17
|
+
|
|
18
|
+
### Before writing any code
|
|
19
|
+
1. Define data contract (schema, freshness SLA, volume, sources, consumers).
|
|
20
|
+
2. Decide batch vs streaming (latency requirement is the primary driver).
|
|
21
|
+
3. Identify exactly-once requirements (financial data = must, analytics = can relax).
|
|
22
|
+
4. Plan schema evolution strategy (backward-compatible changes only).
|
|
23
|
+
|
|
24
|
+
### During implementation
|
|
25
|
+
- Implement data quality gates before consumers see data.
|
|
26
|
+
- Use schema registry for all structured data exchange.
|
|
27
|
+
- Make all transformations idempotent (safe to re-run).
|
|
28
|
+
- Include dead-letter queues for malformed/failed records.
|
|
29
|
+
- Add lineage tracking (where did this data come from?).
|
|
30
|
+
- Monitor freshness SLA with alerting.
|
|
31
|
+
|
|
32
|
+
### After implementation
|
|
33
|
+
- Verify backfill capability (can we reprocess historical data?).
|
|
34
|
+
- Test schema evolution (add column, change type) without breaking consumers.
|
|
35
|
+
- Confirm quality gates catch known bad data patterns.
|
|
36
|
+
- Validate freshness SLA is met under normal load.
|
|
37
|
+
- Document data lineage for every output table.
|
|
38
|
+
|
|
39
|
+
## ETL vs ELT Decision
|
|
40
|
+
|
|
41
|
+
### ETL (Extract → Transform → Load)
|
|
42
|
+
- Transform before loading into destination.
|
|
43
|
+
- Best for: structured sources, known transformations, data quality at boundary.
|
|
44
|
+
- Tools: Airflow + Python, Spark, custom processors.
|
|
45
|
+
- Advantage: Clean data in warehouse, lower warehouse compute costs.
|
|
46
|
+
|
|
47
|
+
### ELT (Extract → Load → Transform)
|
|
48
|
+
- Load raw data, transform in the warehouse/lakehouse.
|
|
49
|
+
- Best for: diverse sources, evolving transformations, exploratory analysis.
|
|
50
|
+
- Tools: Fivetran/Airbyte (extract+load) + dbt (transform).
|
|
51
|
+
- Advantage: Raw data preserved, transformations versioned and testable.
|
|
52
|
+
|
|
53
|
+
### Decision Matrix
|
|
54
|
+
| Factor | ETL | ELT |
|
|
55
|
+
|--------|-----|-----|
|
|
56
|
+
| Source diversity | Low (known schema) | High (many sources) |
|
|
57
|
+
| Transformation stability | Stable, well-defined | Evolving, experimental |
|
|
58
|
+
| Data volume | Moderate | Very high |
|
|
59
|
+
| Warehouse compute cost | Sensitive | Acceptable |
|
|
60
|
+
| Need raw data access | No | Yes |
|
|
61
|
+
|
|
62
|
+
## Batch vs Streaming
|
|
63
|
+
|
|
64
|
+
### Batch Processing
|
|
65
|
+
- Process data in scheduled intervals (hourly, daily).
|
|
66
|
+
- Simpler implementation, easier debugging.
|
|
67
|
+
- Cheaper for high-volume, latency-tolerant workloads.
|
|
68
|
+
- Tools: Airflow, Spark Batch, dbt.
|
|
69
|
+
|
|
70
|
+
### Stream Processing
|
|
71
|
+
- Process events as they arrive (real-time or near-real-time).
|
|
72
|
+
- Complex: windowing, ordering, late-arriving data.
|
|
73
|
+
- Required when business needs data in <5 minutes.
|
|
74
|
+
- Tools: Kafka Streams, Flink, Spark Structured Streaming.
|
|
75
|
+
|
|
76
|
+
### Decision: Use streaming only when
|
|
77
|
+
- Business requires <5 minute data freshness.
|
|
78
|
+
- Events must trigger immediate actions (fraud, alerts).
|
|
79
|
+
- Source naturally produces events (clickstream, IoT).
|
|
80
|
+
|
|
81
|
+
Otherwise, batch is simpler and cheaper.
|
|
82
|
+
|
|
83
|
+
## Exactly-Once Processing
|
|
84
|
+
|
|
85
|
+
### Why It's Hard
|
|
86
|
+
Network failures + retries = potential duplicates.
|
|
87
|
+
|
|
88
|
+
### Strategies
|
|
89
|
+
1. **Idempotent sinks**: Write operations produce same result regardless of repetition (UPSERT, conditional write).
|
|
90
|
+
2. **Deduplication keys**: Assign unique ID to each record, deduplicate at sink.
|
|
91
|
+
3. **Checkpointing**: Record progress markers, resume from checkpoint on failure.
|
|
92
|
+
4. **Transactional outbox**: Atomic write to source + outbox table, separate relay.
|
|
93
|
+
|
|
94
|
+
### Practical Guarantees
|
|
95
|
+
| Guarantee | Cost | Use When |
|
|
96
|
+
|-----------|------|----------|
|
|
97
|
+
| At-most-once | Lowest | Metrics where loss is acceptable |
|
|
98
|
+
| At-least-once + idempotent sink | Medium | Most pipelines |
|
|
99
|
+
| Exactly-once (Kafka transactions) | Highest | Financial, billing |
|
|
100
|
+
|
|
101
|
+
## Schema Registry
|
|
102
|
+
|
|
103
|
+
### Purpose
|
|
104
|
+
- Central source of truth for data schemas.
|
|
105
|
+
- Enforce compatibility between producers and consumers.
|
|
106
|
+
- Enable schema evolution without breaking downstream.
|
|
107
|
+
|
|
108
|
+
### Compatibility Modes
|
|
109
|
+
- **Backward compatible**: New schema can read old data (add optional fields).
|
|
110
|
+
- **Forward compatible**: Old schema can read new data (remove optional fields).
|
|
111
|
+
- **Full compatible**: Both backward and forward (safest, most restrictive).
|
|
112
|
+
|
|
113
|
+
### Rules
|
|
114
|
+
- All structured data exchange goes through schema registry.
|
|
115
|
+
- Use Avro or Protobuf (schema-defined, compact, evolvable).
|
|
116
|
+
- Test schema changes against compatibility rules in CI.
|
|
117
|
+
- Never break backward compatibility without coordinated migration.
|
|
118
|
+
|
|
119
|
+
## Data Quality Gates
|
|
120
|
+
|
|
121
|
+
### Checks to Implement
|
|
122
|
+
| Check | Example | Severity |
|
|
123
|
+
|-------|---------|----------|
|
|
124
|
+
| Not null | Primary keys must exist | CRITICAL |
|
|
125
|
+
| Uniqueness | No duplicate records | CRITICAL |
|
|
126
|
+
| Range | Age between 0-150 | HIGH |
|
|
127
|
+
| Freshness | Data < 1 hour old | HIGH |
|
|
128
|
+
| Volume | Row count ±10% of expected | MEDIUM |
|
|
129
|
+
| Referential | Foreign keys resolve | MEDIUM |
|
|
130
|
+
| Format | Email matches pattern | LOW |
|
|
131
|
+
|
|
132
|
+
### Implementation
|
|
133
|
+
- Run quality checks BEFORE exposing data to consumers.
|
|
134
|
+
- Quarantine failing records in dead-letter table.
|
|
135
|
+
- Alert on quality degradation trends.
|
|
136
|
+
- Track quality metrics over time (quality score per table).
|
|
137
|
+
|
|
138
|
+
## Pipeline Orchestration
|
|
139
|
+
|
|
140
|
+
### Airflow DAG Best Practices
|
|
141
|
+
- One DAG per logical pipeline.
|
|
142
|
+
- Idempotent tasks (re-runnable without side effects).
|
|
143
|
+
- Explicit dependencies (no implicit ordering).
|
|
144
|
+
- SLA alerts for late-running pipelines.
|
|
145
|
+
- Backfill support (catchup=True with idempotent tasks).
|
|
146
|
+
- Retry with exponential backoff for transient failures.
|
|
147
|
+
|
|
148
|
+
### Monitoring
|
|
149
|
+
- Freshness SLA: alert when data is older than threshold.
|
|
150
|
+
- Pipeline duration: alert on >2x normal runtime.
|
|
151
|
+
- Record count: alert on ±20% deviation from expected.
|
|
152
|
+
- Error rate: alert on >1% record failures.
|
|
153
|
+
|
|
154
|
+
## Backfill Strategy
|
|
155
|
+
|
|
156
|
+
### Requirements
|
|
157
|
+
- Every pipeline must support historical reprocessing.
|
|
158
|
+
- Backfill must be idempotent (running twice = same result).
|
|
159
|
+
- Partition by date for efficient backfill of specific ranges.
|
|
160
|
+
- Backfill should not interfere with production pipeline runs.
|
|
161
|
+
|
|
162
|
+
## Self-check
|
|
163
|
+
- [ ] Data contract defined (schema, freshness, volume).
|
|
164
|
+
- [ ] Batch vs streaming decision justified by latency requirement.
|
|
165
|
+
- [ ] Quality gates implemented before consumer access.
|
|
166
|
+
- [ ] Schema registered and compatibility mode set.
|
|
167
|
+
- [ ] All transformations are idempotent.
|
|
168
|
+
- [ ] Dead-letter queue configured for failures.
|
|
169
|
+
- [ ] Backfill capability tested.
|
|
170
|
+
- [ ] Freshness SLA monitored with alerting.
|
|
171
|
+
- [ ] Data lineage documented.
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: data-privacy-engineering
|
|
3
|
+
version: 1.0.0
|
|
4
|
+
min_mindforge_version: 10.6.0
|
|
5
|
+
status: stable
|
|
6
|
+
triggers: data privacy engineering, differential privacy implementation, anonymization technique, consent management system, privacy-preserving computation, GDPR data engineering, data masking, privacy by design, homomorphic encryption, federated learning privacy, secure multi-party computation, PII detection pipeline
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Skill — Data Privacy Engineering
|
|
10
|
+
|
|
11
|
+
## When this skill activates
|
|
12
|
+
This skill activates when implementing privacy-preserving data systems, building consent management infrastructure, or applying anonymization techniques. Use when handling sensitive data requires technical controls beyond access restrictions.
|
|
13
|
+
|
|
14
|
+
## Mandatory actions when this skill is active
|
|
15
|
+
|
|
16
|
+
### Before writing any code
|
|
17
|
+
1. Conduct privacy impact assessment: identify PII/sensitive data, data flows, retention requirements, third-party sharing, and regulatory obligations (GDPR, CCPA, HIPAA)
|
|
18
|
+
2. Define privacy requirements: anonymization level (k-anonymity, l-diversity, differential privacy), consent granularity, right-to-erasure scope, and data minimization principles
|
|
19
|
+
3. Select appropriate privacy techniques: tokenization (reversible), hashing (one-way), encryption (protected), differential privacy (statistical), synthetic data (replacement)
|
|
20
|
+
4. Establish privacy testing framework: re-identification risk assessment, privacy budget tracking, consent enforcement verification, and breach simulation
|
|
21
|
+
|
|
22
|
+
### During implementation
|
|
23
|
+
- Implement automated PII detection pipeline: regex patterns, ML models, and NER for unstructured text, scanning code, logs, databases, and data lakes
|
|
24
|
+
- Build tokenization service with: format-preserving encryption for display, secure token vault, key rotation, and performance caching for high-throughput
|
|
25
|
+
- Create differential privacy mechanisms: Laplace/Gaussian noise addition calibrated to epsilon budget, query result perturbation, and privacy budget accounting across queries
|
|
26
|
+
- Design consent management system: granular opt-in/opt-out, purpose-specific consent, consent version tracking, and propagation to downstream systems
|
|
27
|
+
- Implement data minimization controls: retention policies with automated deletion, purpose limitation enforcement, and necessity justification for data collection
|
|
28
|
+
- Build privacy-preserving analytics: federated learning for ML without centralized data, secure aggregation for metrics, and homomorphic encryption for computation on encrypted data
|
|
29
|
+
- Create data subject rights workflows: search across systems, export in portable format, deletion with verification, and rectification propagation
|
|
30
|
+
|
|
31
|
+
### After implementation
|
|
32
|
+
- Generate privacy compliance reports: PII inventory, consent coverage, retention policy enforcement, third-party data sharing audit, and rights request fulfillment SLA
|
|
33
|
+
- Build privacy monitoring dashboards: PII exposure incidents, consent withdrawal rates, privacy budget consumption, and anonymization quality metrics
|
|
34
|
+
- Create breach response procedures: detection, containment, notification timelines, affected user identification, and remediation workflows
|
|
35
|
+
- Document privacy controls: anonymization methods, re-identification risk levels, consent mechanisms, and data retention justifications for audit purposes
|
|
36
|
+
|
|
37
|
+
## Self-check before task completion
|
|
38
|
+
- [ ] PII detection covers all data stores with automated scanning and alerting on new sensitive data discoveries
|
|
39
|
+
- [ ] Anonymization techniques applied with documented re-identification risk assessment (k-anonymity ≥10 or equivalent)
|
|
40
|
+
- [ ] Consent management enforces purpose limitation with propagation to all downstream processing systems
|
|
41
|
+
- [ ] Differential privacy implementation maintains epsilon budget <1.0 for sensitive aggregations with privacy accounting
|
|
42
|
+
- [ ] Data subject rights workflows tested for completeness across all systems within regulatory SLA (30 days GDPR)
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: database-performance
|
|
3
|
+
version: 1.0.0
|
|
4
|
+
min_mindforge_version: 0.3.0
|
|
5
|
+
status: stable
|
|
6
|
+
triggers: database performance, query plan analysis, EXPLAIN ANALYZE, index selection, partition pruning, materialized view, query optimization, slow query, index strategy, table scan elimination, join optimization, query profiling
|
|
7
|
+
compose: database-patterns
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Skill — Database Performance
|
|
11
|
+
|
|
12
|
+
## When this skill activates
|
|
13
|
+
Any task involving slow queries, query optimization, index strategy, EXPLAIN plan
|
|
14
|
+
analysis, partitioning, materialized views, or database profiling.
|
|
15
|
+
|
|
16
|
+
## Mandatory actions when this skill is active
|
|
17
|
+
|
|
18
|
+
### Before optimizing
|
|
19
|
+
1. Get the current query execution plan (EXPLAIN ANALYZE, not just EXPLAIN).
|
|
20
|
+
2. Identify the actual bottleneck (do not guess).
|
|
21
|
+
3. Measure baseline performance (p50, p95, p99 latency).
|
|
22
|
+
4. Understand the data distribution (cardinality, skew).
|
|
23
|
+
|
|
24
|
+
### Reading EXPLAIN ANALYZE output
|
|
25
|
+
|
|
26
|
+
**Key things to look for:**
|
|
27
|
+
|
|
28
|
+
| Signal | Meaning | Action |
|
|
29
|
+
|--------|---------|--------|
|
|
30
|
+
| Seq Scan on large table | Full table scan, no index used | Add appropriate index |
|
|
31
|
+
| Nested Loop with high rows | O(n*m) join strategy | Consider Hash Join, add index on join column |
|
|
32
|
+
| Actual rows >> Estimated rows | Stale statistics | Run ANALYZE on the table |
|
|
33
|
+
| Sort with external merge | Not enough work_mem | Increase work_mem or add index for ORDER BY |
|
|
34
|
+
| Filter removing most rows | Index not selective enough | Add more specific index or partial index |
|
|
35
|
+
|
|
36
|
+
**Node types (best to worst for large tables):**
|
|
37
|
+
1. Index Only Scan — best (reads from index, no table access).
|
|
38
|
+
2. Index Scan — good (uses index, fetches rows from table).
|
|
39
|
+
3. Bitmap Index Scan — okay (for medium selectivity).
|
|
40
|
+
4. Seq Scan — bad on large tables (reads every row).
|
|
41
|
+
|
|
42
|
+
### Index strategy
|
|
43
|
+
|
|
44
|
+
**B-tree (default, most common):**
|
|
45
|
+
- Equality: `WHERE status = 'active'`
|
|
46
|
+
- Range: `WHERE created_at > '2025-01-01'`
|
|
47
|
+
- Prefix matching: `WHERE name LIKE 'foo%'`
|
|
48
|
+
- Sorting: `ORDER BY created_at DESC`
|
|
49
|
+
- Composite: `(tenant_id, created_at)` — order matters, left-to-right.
|
|
50
|
+
|
|
51
|
+
**GIN (Generalized Inverted Index):**
|
|
52
|
+
- JSONB containment: `WHERE data @> '{"key": "value"}'`
|
|
53
|
+
- Array contains: `WHERE tags @> ARRAY['tag1']`
|
|
54
|
+
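- Full-text search: `WHERE to_tsvector('english', body) @@ to_tsquery('english', 'search')` — the index expression and the query must both name the same text search configuration.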
|
|
55
|
+
|
|
56
|
+
**Partial index (conditional):**
|
|
57
|
+
- Index only rows that match a condition.
|
|
58
|
+
- `CREATE INDEX idx_active_orders ON orders(created_at) WHERE status = 'active'`
|
|
59
|
+
- Smaller, faster, less write overhead.
|
|
60
|
+
|
|
61
|
+
**Expression index:**
|
|
62
|
+
- Index a computed value.
|
|
63
|
+
- `CREATE INDEX idx_lower_email ON users(LOWER(email))`
|
|
64
|
+
- Query must use the same expression to hit the index.
|
|
65
|
+
|
|
66
|
+
### Common query anti-patterns
|
|
67
|
+
|
|
68
|
+
**Functions on indexed columns:**
|
|
69
|
+
```sql
|
|
70
|
+
-- BAD: index on created_at is useless
|
|
71
|
+
WHERE EXTRACT(YEAR FROM created_at) = 2025
|
|
72
|
+
|
|
73
|
+
-- GOOD: rewrite as range
|
|
74
|
+
WHERE created_at >= '2025-01-01' AND created_at < '2026-01-01'
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
**OR conditions preventing index use:**
|
|
78
|
+
```sql
|
|
79
|
+
-- BAD: may cause Seq Scan
|
|
80
|
+
WHERE status = 'active' OR status = 'pending'
|
|
81
|
+
|
|
82
|
+
-- GOOD: use IN
|
|
83
|
+
WHERE status IN ('active', 'pending')
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
**SELECT * when you need few columns:**
|
|
87
|
+
```sql
|
|
88
|
+
-- BAD: fetches all columns, prevents index-only scan
|
|
89
|
+
SELECT * FROM orders WHERE tenant_id = 'abc'
|
|
90
|
+
|
|
91
|
+
-- GOOD: select only needed columns
|
|
92
|
+
SELECT id, status, total FROM orders WHERE tenant_id = 'abc'
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
**Missing LIMIT on unbounded queries:**
|
|
96
|
+
```sql
|
|
97
|
+
-- BAD: may return millions of rows
|
|
98
|
+
SELECT * FROM events WHERE type = 'click'
|
|
99
|
+
|
|
100
|
+
-- GOOD: always paginate
|
|
101
|
+
SELECT * FROM events WHERE type = 'click' ORDER BY id LIMIT 50
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Materialized views
|
|
105
|
+
|
|
106
|
+
**When to use:**
|
|
107
|
+
- Expensive aggregations needed frequently (dashboards, reports).
|
|
108
|
+
- Data changes infrequently relative to read frequency.
|
|
109
|
+
- Acceptable staleness (refresh interval is tolerable).
|
|
110
|
+
|
|
111
|
+
**Implementation:**
|
|
112
|
+
```sql
|
|
113
|
+
CREATE MATERIALIZED VIEW monthly_revenue AS
|
|
114
|
+
SELECT tenant_id, date_trunc('month', created_at) AS month, SUM(amount) AS total
|
|
115
|
+
FROM orders
|
|
116
|
+
WHERE status = 'completed'
|
|
117
|
+
GROUP BY tenant_id, month;
|
|
118
|
+
|
|
119
|
+
-- Refresh on schedule
|
|
120
|
+
REFRESH MATERIALIZED VIEW CONCURRENTLY monthly_revenue;
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
**Rules:**
|
|
124
|
+
- Use CONCURRENTLY for every refresh after the initial population (does not lock reads during refresh; it cannot be used while the view is still unpopulated).
|
|
125
|
+
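- Add a unique index for CONCURRENTLY to work: it must use plain column names only (no expressions) and have no WHERE clause, so that it covers all rows.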
|
|
126
|
+
- Monitor refresh duration — alert if it exceeds threshold.
|
|
127
|
+
- Consider triggers for real-time materialized views (small tables only).
|
|
128
|
+
|
|
129
|
+
### Partitioning
|
|
130
|
+
|
|
131
|
+
**Range partitioning (time-series data):**
|
|
132
|
+
```sql
|
|
133
|
+
CREATE TABLE events (
|
|
134
|
+
id BIGINT, tenant_id UUID, created_at TIMESTAMPTZ, data JSONB
|
|
135
|
+
) PARTITION BY RANGE (created_at);
|
|
136
|
+
|
|
137
|
+
CREATE TABLE events_2025_01 PARTITION OF events
|
|
138
|
+
FOR VALUES FROM ('2025-01-01') TO ('2025-02-01');
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
**Benefits:**
|
|
142
|
+
- Partition pruning: queries on created_at only scan relevant partitions.
|
|
143
|
+
- Easy data lifecycle: DROP old partitions instead of DELETE (instant, no vacuum).
|
|
144
|
+
- Parallel scan across partitions.
|
|
145
|
+
|
|
146
|
+
**Hash partitioning (even distribution):**
|
|
147
|
+
- For tables with no natural range key.
|
|
148
|
+
- Distributes rows evenly across N partitions.
|
|
149
|
+
- Good for very large tables that need parallel access.
|
|
150
|
+
|
|
151
|
+
**Rules:**
|
|
152
|
+
- Include the partition key in each query's WHERE clause; without it, no pruning happens and every partition is scanned.
|
|
153
|
+
- Too many partitions (>1000) can slow planning.
|
|
154
|
+
- Automate partition creation (don't rely on manual monthly creation).
|
|
155
|
+
|
|
156
|
+
### Join optimization
|
|
157
|
+
|
|
158
|
+
- Ensure join columns have indexes on both sides.
|
|
159
|
+
- Small table JOIN large table: ensure small table is the "driving" table.
|
|
160
|
+
- Consider denormalization if a join is on the critical path and never changes.
|
|
161
|
+
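- Use CTEs carefully — in PostgreSQL < 12, CTEs are always optimization fences; in 12+, a side-effect-free, non-recursive CTE referenced once is inlined unless marked `MATERIALIZED`.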
|
|
162
|
+
|
|
163
|
+
### Monitoring
|
|
164
|
+
|
|
165
|
+
- Enable `pg_stat_statements` for query-level statistics.
|
|
166
|
+
- Alert on queries exceeding p95 threshold.
|
|
167
|
+
- Track index usage: `pg_stat_user_indexes` — unused indexes waste write performance.
|
|
168
|
+
- Regular VACUUM and ANALYZE (autovacuum tuning for high-write tables).
|
|
169
|
+
|
|
170
|
+
## Self-check before task completion
|
|
171
|
+
- [ ] Did I follow the mandatory actions for this skill?
|
|
172
|
+
- [ ] Did I apply the patterns appropriate to the context?
|
|
173
|
+
- [ ] Did I verify the implementation meets the criteria above?
|
|
174
|
+
- [ ] Did I document decisions and trade-offs made?
|