agentic-team-templates 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
# CLI Tools Development Guide
|
|
2
|
+
|
|
3
|
+
Staff-level guidelines for building professional command-line interfaces that are intuitive, robust, and maintainable.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Overview
|
|
8
|
+
|
|
9
|
+
This guide applies to:
|
|
10
|
+
|
|
11
|
+
- Command-line applications and utilities
|
|
12
|
+
- Developer tools and build systems
|
|
13
|
+
- System administration scripts
|
|
14
|
+
- Automation and DevOps tooling
|
|
15
|
+
- Interactive terminal applications
|
|
16
|
+
|
|
17
|
+
### Key Principles
|
|
18
|
+
|
|
19
|
+
1. **Human-First Design** - Intuitive for humans, scriptable for machines
|
|
20
|
+
2. **Composability** - Do one thing well, compose with others
|
|
21
|
+
3. **Predictability** - Consistent behavior, no surprises
|
|
22
|
+
4. **Progressive Disclosure** - Simple things simple, complex things possible
|
|
23
|
+
|
|
24
|
+
### Technology Stack
|
|
25
|
+
|
|
26
|
+
| Language | Framework | Config | When to Use |
|
|
27
|
+
|----------|-----------|--------|-------------|
|
|
28
|
+
| Go | Cobra + Viper | Viper | Cross-platform binaries, performance |
|
|
29
|
+
| Node.js | Commander.js | cosmiconfig | npm ecosystem, rapid development |
|
|
30
|
+
| Python | Typer/Click | pydantic | Data processing, scripting |
|
|
31
|
+
| Rust | Clap | config-rs | Performance-critical, systems tools |
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Command Structure
|
|
36
|
+
|
|
37
|
+
### Root Command Pattern
|
|
38
|
+
|
|
39
|
+
```go
|
|
40
|
+
// Go with Cobra
|
|
41
|
+
var rootCmd = &cobra.Command{
|
|
42
|
+
Use: "mytool",
|
|
43
|
+
Short: "A brief description",
|
|
44
|
+
PersistentPreRunE: func(cmd *cobra.Command, args []string) error {
|
|
45
|
+
return initConfig()
|
|
46
|
+
},
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
func init() {
|
|
50
|
+
rootCmd.PersistentFlags().BoolP("verbose", "v", false, "verbose output")
|
|
51
|
+
rootCmd.PersistentFlags().String("config", "", "config file path")
|
|
52
|
+
}
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Subcommands
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
mytool init # Initialize project
|
|
59
|
+
mytool build # Build project
|
|
60
|
+
mytool deploy # Deploy to environment
|
|
61
|
+
mytool config get # Nested subcommand
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Configuration Precedence
|
|
65
|
+
|
|
66
|
+
1. Command-line flags (highest)
|
|
67
|
+
2. Environment variables
|
|
68
|
+
3. Local config file
|
|
69
|
+
4. Global config file
|
|
70
|
+
5. Default values (lowest)
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## Arguments and Flags
|
|
75
|
+
|
|
76
|
+
### Standard Flags (Always Support)
|
|
77
|
+
|
|
78
|
+
| Flag | Purpose |
|
|
79
|
+
|------|---------|
|
|
80
|
+
| `-h, --help` | Show help |
|
|
81
|
+
| `-v, --verbose` | Verbose output |
|
|
82
|
+
| `-q, --quiet` | Suppress output |
|
|
83
|
+
| `--version` | Show version |
|
|
84
|
+
| `--config` | Config file path |
|
|
85
|
+
| `--no-color` | Disable colors |
|
|
86
|
+
| `--dry-run` | Preview changes |
|
|
87
|
+
| `-f, --force` | Skip confirmations |
|
|
88
|
+
|
|
89
|
+
### Argument Validation
|
|
90
|
+
|
|
91
|
+
```go
|
|
92
|
+
// Go: Custom validation
|
|
93
|
+
var runCmd = &cobra.Command{
|
|
94
|
+
Use: "run <environment>",
|
|
95
|
+
Args: func(cmd *cobra.Command, args []string) error {
|
|
96
|
+
if len(args) != 1 {
|
|
97
|
+
return fmt.Errorf("requires exactly one argument")
|
|
98
|
+
}
|
|
99
|
+
validEnvs := []string{"dev", "staging", "prod"}
|
|
100
|
+
for _, env := range validEnvs {
|
|
101
|
+
if args[0] == env {
|
|
102
|
+
return nil
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
return fmt.Errorf("invalid environment %q", args[0])
|
|
106
|
+
},
|
|
107
|
+
}
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
## User Experience
|
|
113
|
+
|
|
114
|
+
### Output Streams
|
|
115
|
+
|
|
116
|
+
- **stdout**: Data output (for piping)
|
|
117
|
+
- **stderr**: Progress, status, errors (for humans)
|
|
118
|
+
|
|
119
|
+
```go
|
|
120
|
+
// Correct stream usage
|
|
121
|
+
fmt.Fprintln(os.Stderr, "Building...") // Progress
|
|
122
|
+
fmt.Println(result.Path) // Data output
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Colors
|
|
126
|
+
|
|
127
|
+
| Color | Use For |
|
|
128
|
+
|-------|---------|
|
|
129
|
+
| Green | Success |
|
|
130
|
+
| Red | Errors |
|
|
131
|
+
| Yellow | Warnings |
|
|
132
|
+
| Blue | Information |
|
|
133
|
+
| Dim | Secondary info |
|
|
134
|
+
|
|
135
|
+
Always respect the `NO_COLOR` environment variable and the `--no-color` flag.
|
|
136
|
+
|
|
137
|
+
### Progress Indicators
|
|
138
|
+
|
|
139
|
+
```go
|
|
140
|
+
// Spinner for unknown duration
|
|
141
|
+
spinner.Start()
|
|
142
|
+
defer spinner.Stop()
|
|
143
|
+
|
|
144
|
+
// Progress bar for known progress
|
|
145
|
+
bar := progressbar.New(total)
|
|
146
|
+
for _, item := range items {
|
|
147
|
+
process(item)
|
|
148
|
+
bar.Add(1)
|
|
149
|
+
}
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
### Interactive Prompts
|
|
153
|
+
|
|
154
|
+
```go
|
|
155
|
+
// Confirmation prompt: skipped (treated as "yes") when force is set or stdin is not a terminal
|
|
156
|
+
func confirm(message string, force bool) bool {
|
|
157
|
+
if force || !term.IsTerminal(int(os.Stdin.Fd())) {
|
|
158
|
+
return true
|
|
159
|
+
}
|
|
160
|
+
// Show interactive prompt
|
|
161
|
+
}
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
## Error Handling
|
|
167
|
+
|
|
168
|
+
### Exit Codes
|
|
169
|
+
|
|
170
|
+
| Code | Meaning |
|
|
171
|
+
|------|---------|
|
|
172
|
+
| 0 | Success |
|
|
173
|
+
| 1 | General error |
|
|
174
|
+
| 2 | Invalid arguments |
|
|
175
|
+
| 130 | Interrupted (Ctrl+C) |
|
|
176
|
+
|
|
177
|
+
### Error Message Structure
|
|
178
|
+
|
|
179
|
+
```
|
|
180
|
+
Error: cannot read config file
|
|
181
|
+
|
|
182
|
+
File: /home/user/.config/mytool/config.yaml
|
|
183
|
+
Cause: permission denied
|
|
184
|
+
|
|
185
|
+
Try:
|
|
186
|
+
• Check file permissions: ls -la ~/.config/mytool/
|
|
187
|
+
• Fix permissions: chmod 600 ~/.config/mytool/config.yaml
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### Error Message Guidelines
|
|
191
|
+
|
|
192
|
+
1. **What happened** - Clear description
|
|
193
|
+
2. **Context** - Relevant details (paths, values)
|
|
194
|
+
3. **Why** - Root cause if known
|
|
195
|
+
4. **How to fix** - Actionable suggestions
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## Testing
|
|
200
|
+
|
|
201
|
+
### Test Levels
|
|
202
|
+
|
|
203
|
+
| Level | What to Test | How |
|
|
204
|
+
|-------|--------------|-----|
|
|
205
|
+
| Unit | Business logic | Mock dependencies |
|
|
206
|
+
| Integration | Full commands | Capture stdout/stderr |
|
|
207
|
+
| E2E | User workflows | Run actual binary |
|
|
208
|
+
|
|
209
|
+
### Mocking I/O
|
|
210
|
+
|
|
211
|
+
```go
|
|
212
|
+
// Capture output
|
|
213
|
+
stdout := new(bytes.Buffer)
|
|
214
|
+
stderr := new(bytes.Buffer)
|
|
215
|
+
cmd.SetOut(stdout)
|
|
216
|
+
cmd.SetErr(stderr)
|
|
217
|
+
|
|
218
|
+
// Mock stdin
|
|
219
|
+
r, w, _ := os.Pipe()
|
|
220
|
+
go func() {
|
|
221
|
+
w.WriteString("y\n")
|
|
222
|
+
w.Close()
|
|
223
|
+
}()
|
|
224
|
+
os.Stdin = r
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### Test Exit Codes
|
|
228
|
+
|
|
229
|
+
```go
|
|
230
|
+
func TestExitCodes(t *testing.T) {
|
|
231
|
+
cmd := exec.Command("./mytool", "invalid")
|
|
232
|
+
err := cmd.Run()
|
|
233
|
+
exitErr := err.(*exec.ExitError)
|
|
234
|
+
assert.Equal(t, 2, exitErr.ExitCode())
|
|
235
|
+
}
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
---
|
|
239
|
+
|
|
240
|
+
## Distribution
|
|
241
|
+
|
|
242
|
+
### Version Information
|
|
243
|
+
|
|
244
|
+
```bash
|
|
245
|
+
$ mytool version
|
|
246
|
+
mytool 1.0.0
|
|
247
|
+
commit: abc123
|
|
248
|
+
built: 2025-01-27
|
|
249
|
+
go: go1.22
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
### Installation Methods
|
|
253
|
+
|
|
254
|
+
```bash
|
|
255
|
+
# Homebrew
|
|
256
|
+
brew install myorg/tap/mytool
|
|
257
|
+
|
|
258
|
+
# npm
|
|
259
|
+
npm install -g mytool
|
|
260
|
+
|
|
261
|
+
# Go
|
|
262
|
+
go install github.com/myorg/mytool@latest
|
|
263
|
+
|
|
264
|
+
# Direct download
|
|
265
|
+
curl -fsSL https://example.com/install.sh | bash
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
### Shell Completions
|
|
269
|
+
|
|
270
|
+
```bash
|
|
271
|
+
# Generate completions
|
|
272
|
+
mytool completion bash > /etc/bash_completion.d/mytool
|
|
273
|
+
mytool completion zsh > "${fpath[1]}/_mytool"
|
|
274
|
+
mytool completion fish > ~/.config/fish/completions/mytool.fish
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
---
|
|
278
|
+
|
|
279
|
+
## Definition of Done
|
|
280
|
+
|
|
281
|
+
A CLI tool is ready for release when:
|
|
282
|
+
|
|
283
|
+
- [ ] All commands have help text and examples
|
|
284
|
+
- [ ] Error messages suggest corrective actions
|
|
285
|
+
- [ ] Exit codes follow conventions
|
|
286
|
+
- [ ] Works in non-interactive environments (CI/CD)
|
|
287
|
+
- [ ] Supports stdin/stdout piping
|
|
288
|
+
- [ ] Configuration precedence documented
|
|
289
|
+
- [ ] Shell completions available
|
|
290
|
+
- [ ] Version flag shows useful info
|
|
291
|
+
- [ ] Tests cover happy path and errors
|
|
292
|
+
- [ ] Installation instructions documented
|
|
293
|
+
|
|
294
|
+
---
|
|
295
|
+
|
|
296
|
+
## Common Pitfalls
|
|
297
|
+
|
|
298
|
+
### 1. Ignoring Non-TTY Environments
|
|
299
|
+
|
|
300
|
+
```go
|
|
301
|
+
// Bad: Always show spinner
|
|
302
|
+
spinner.Start()
|
|
303
|
+
|
|
304
|
+
// Good: Detect TTY
|
|
305
|
+
if term.IsTerminal(int(os.Stderr.Fd())) {
|
|
306
|
+
spinner.Start()
|
|
307
|
+
} else {
|
|
308
|
+
fmt.Fprintln(os.Stderr, "Processing...")
|
|
309
|
+
}
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
### 2. Hardcoded Paths
|
|
313
|
+
|
|
314
|
+
```go
|
|
315
|
+
// Bad
|
|
316
|
+
configPath := "/etc/mytool/config.yaml"
|
|
317
|
+
|
|
318
|
+
// Good
|
|
319
|
+
configPath := filepath.Join(xdg.ConfigHome, "mytool", "config.yaml")
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
### 3. Silent Failures
|
|
323
|
+
|
|
324
|
+
```go
|
|
325
|
+
// Bad
|
|
326
|
+
result, _ := riskyOperation()
|
|
327
|
+
|
|
328
|
+
// Good
|
|
329
|
+
result, err := riskyOperation()
|
|
330
|
+
if err != nil {
|
|
331
|
+
return fmt.Errorf("operation failed: %w", err)
|
|
332
|
+
}
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
### 4. Missing Non-Interactive Support
|
|
336
|
+
|
|
337
|
+
```go
|
|
338
|
+
// Bad: Always prompts
|
|
339
|
+
answer := prompt("Continue?")
|
|
340
|
+
|
|
341
|
+
// Good: Respect --force flag
|
|
342
|
+
if !force {
|
|
343
|
+
answer := prompt("Continue?")
|
|
344
|
+
}
|
|
345
|
+
```
|
|
346
|
+
|
|
347
|
+
---
|
|
348
|
+
|
|
349
|
+
## Resources
|
|
350
|
+
|
|
351
|
+
- [Command Line Interface Guidelines](https://clig.dev/)
|
|
352
|
+
- [Better CLI](https://better-cli.org/)
|
|
353
|
+
- [Cobra Documentation](https://cobra.dev/)
|
|
354
|
+
- [Commander.js](https://github.com/tj/commander.js)
|
|
355
|
+
- [Click Documentation](https://click.palletsprojects.com/)
|
|
356
|
+
- [Clap Documentation](https://docs.rs/clap/)
|
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
# Data Modeling
|
|
2
|
+
|
|
3
|
+
Patterns for designing scalable, maintainable data models.
|
|
4
|
+
|
|
5
|
+
## Layered Architecture
|
|
6
|
+
|
|
7
|
+
### Medallion Architecture
|
|
8
|
+
|
|
9
|
+
| Layer | Purpose | Characteristics |
|
|
10
|
+
|-------|---------|-----------------|
|
|
11
|
+
| **Bronze/Raw** | Ingest and preserve | Exact source copy, append-only, no transformations |
|
|
12
|
+
| **Silver/Curated** | Clean and conform | Validated, typed, deduplicated, business keys |
|
|
13
|
+
| **Gold/Marts** | Aggregate and serve | Business-ready, optimized for consumption |
|
|
14
|
+
|
|
15
|
+
```sql
|
|
16
|
+
-- Bronze: Raw ingestion
|
|
17
|
+
CREATE TABLE bronze.orders_raw (
|
|
18
|
+
_raw_data STRING, -- Original JSON/CSV
|
|
19
|
+
_source_file STRING,
|
|
20
|
+
_ingested_at TIMESTAMP
|
|
21
|
+
);
|
|
22
|
+
|
|
23
|
+
-- Silver: Cleaned and typed
|
|
24
|
+
CREATE TABLE silver.orders (
|
|
25
|
+
order_id STRING NOT NULL,
|
|
26
|
+
customer_id STRING NOT NULL,
|
|
27
|
+
order_date DATE NOT NULL,
|
|
28
|
+
total_amount DECIMAL(12,2),
|
|
29
|
+
_loaded_at TIMESTAMP
|
|
30
|
+
);
|
|
31
|
+
|
|
32
|
+
-- Gold: Business aggregates
|
|
33
|
+
CREATE TABLE gold.daily_order_summary (
|
|
34
|
+
order_date DATE,
|
|
35
|
+
total_orders INT,
|
|
36
|
+
total_revenue DECIMAL(15,2),
|
|
37
|
+
avg_order_value DECIMAL(12,2)
|
|
38
|
+
);
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Dimensional Modeling
|
|
42
|
+
|
|
43
|
+
### Star Schema
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
+----------------+
|
|
47
|
+
| dim_customer |
|
|
48
|
+
+----------------+
|
|
49
|
+
|
|
|
50
|
+
+------------+ +------+-------+ +-------------+
|
|
51
|
+
| dim_product|------| fact_orders |------| dim_date |
|
|
52
|
+
+------------+ +--------------+ +-------------+
|
|
53
|
+
|
|
|
54
|
+
+------+-------+
|
|
55
|
+
| dim_location |
|
|
56
|
+
+----------------+
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Fact Tables
|
|
60
|
+
|
|
61
|
+
Store measurable events with foreign keys to dimensions.
|
|
62
|
+
|
|
63
|
+
```sql
|
|
64
|
+
CREATE TABLE facts.orders (
|
|
65
|
+
-- Degenerate dimension (no separate table needed)
|
|
66
|
+
order_id STRING NOT NULL,
|
|
67
|
+
|
|
68
|
+
-- Foreign keys to dimensions
|
|
69
|
+
customer_key BIGINT NOT NULL,
|
|
70
|
+
product_key BIGINT NOT NULL,
|
|
71
|
+
date_key INT NOT NULL,
|
|
72
|
+
|
|
73
|
+
-- Measures
|
|
74
|
+
quantity INT NOT NULL,
|
|
75
|
+
unit_price DECIMAL(10,2) NOT NULL,
|
|
76
|
+
discount_amount DECIMAL(10,2) DEFAULT 0,
|
|
77
|
+
total_amount DECIMAL(12,2) NOT NULL,
|
|
78
|
+
|
|
79
|
+
-- Metadata
|
|
80
|
+
_loaded_at TIMESTAMP NOT NULL
|
|
81
|
+
)
|
|
82
|
+
PARTITIONED BY (date_key);
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Dimension Tables
|
|
86
|
+
|
|
87
|
+
Store descriptive attributes for analysis.
|
|
88
|
+
|
|
89
|
+
```sql
|
|
90
|
+
CREATE TABLE dims.customers (
|
|
91
|
+
-- Surrogate key (for SCD)
|
|
92
|
+
customer_key BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
|
|
93
|
+
|
|
94
|
+
-- Natural key (from source)
|
|
95
|
+
customer_id STRING NOT NULL,
|
|
96
|
+
|
|
97
|
+
-- Attributes
|
|
98
|
+
name STRING NOT NULL,
|
|
99
|
+
email STRING,
|
|
100
|
+
segment STRING,
|
|
101
|
+
region STRING,
|
|
102
|
+
|
|
103
|
+
-- SCD Type 2 tracking
|
|
104
|
+
effective_from DATE NOT NULL,
|
|
105
|
+
effective_to DATE,
|
|
106
|
+
is_current BOOLEAN NOT NULL,
|
|
107
|
+
|
|
108
|
+
-- Metadata
|
|
109
|
+
_loaded_at TIMESTAMP NOT NULL
|
|
110
|
+
);
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Slowly Changing Dimensions
|
|
114
|
+
|
|
115
|
+
### Type 1: Overwrite
|
|
116
|
+
|
|
117
|
+
Simply update the record. No history preserved.
|
|
118
|
+
|
|
119
|
+
```sql
|
|
120
|
+
-- Type 1: Overwrite existing value
|
|
121
|
+
UPDATE dims.customers
|
|
122
|
+
SET email = 'new@email.com', _loaded_at = CURRENT_TIMESTAMP
|
|
123
|
+
WHERE customer_id = 'C123';
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Type 2: Add New Row
|
|
127
|
+
|
|
128
|
+
Preserve history with effective dates.
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
def apply_scd_type_2(source_df: DataFrame, target_table: str) -> None:
|
|
132
|
+
target = DeltaTable.forName(spark, target_table)
|
|
133
|
+
|
|
134
|
+
# Close existing current records for changed rows
|
|
135
|
+
(target.alias("t")
|
|
136
|
+
.merge(
|
|
137
|
+
source_df.alias("s"),
|
|
138
|
+
"t.customer_id = s.customer_id AND t.is_current = true"
|
|
139
|
+
)
|
|
140
|
+
.whenMatchedUpdate(
|
|
141
|
+
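condition="t.name != s.name OR t.email != s.email", # Tracked columns; NOTE: != is never true when either side is NULL, so changes to/from NULL (e.g. a nullable email) are missed - use null-safe <=> for nullable columns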
|
|
142
|
+
set={
|
|
143
|
+
"effective_to": "current_date()",
|
|
144
|
+
"is_current": "false"
|
|
145
|
+
}
|
|
146
|
+
)
|
|
147
|
+
.execute())
|
|
148
|
+
|
|
149
|
+
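# Insert new versions for changed and new customers (source_df must already be filtered to those rows; every row in it is appended, so unchanged rows would be duplicated)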
|
|
150
|
+
new_records = (
|
|
151
|
+
source_df
|
|
152
|
+
.withColumn("effective_from", F.current_date())
|
|
153
|
+
.withColumn("effective_to", F.lit(None))
|
|
154
|
+
.withColumn("is_current", F.lit(True))
|
|
155
|
+
)
|
|
156
|
+
new_records.write.mode("append").saveAsTable(target_table)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Type 3: Add New Column
|
|
160
|
+
|
|
161
|
+
Track limited history by keeping the previous value in an additional column.
|
|
162
|
+
|
|
163
|
+
```sql
|
|
164
|
+
ALTER TABLE dims.customers ADD COLUMN previous_segment STRING;
|
|
165
|
+
|
|
166
|
+
UPDATE dims.customers
|
|
167
|
+
SET previous_segment = segment, segment = 'Enterprise'
|
|
168
|
+
WHERE customer_id = 'C123';
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Date Dimension
|
|
172
|
+
|
|
173
|
+
Pre-populated calendar table for time-based analysis.
|
|
174
|
+
|
|
175
|
+
```sql
|
|
176
|
+
CREATE TABLE dims.date (
|
|
177
|
+
date_key INT PRIMARY KEY, -- YYYYMMDD format
|
|
178
|
+
full_date DATE NOT NULL,
|
|
179
|
+
|
|
180
|
+
-- Calendar attributes
|
|
181
|
+
day_of_week INT,
|
|
182
|
+
day_name STRING,
|
|
183
|
+
day_of_month INT,
|
|
184
|
+
day_of_year INT,
|
|
185
|
+
week_of_year INT,
|
|
186
|
+
month_number INT,
|
|
187
|
+
month_name STRING,
|
|
188
|
+
quarter INT,
|
|
189
|
+
year INT,
|
|
190
|
+
|
|
191
|
+
-- Business attributes
|
|
192
|
+
is_weekend BOOLEAN,
|
|
193
|
+
is_holiday BOOLEAN,
|
|
194
|
+
holiday_name STRING,
|
|
195
|
+
fiscal_year INT,
|
|
196
|
+
fiscal_quarter INT,
|
|
197
|
+
|
|
198
|
+
-- Relative flags (update daily)
|
|
199
|
+
is_current_day BOOLEAN,
|
|
200
|
+
is_current_week BOOLEAN,
|
|
201
|
+
is_current_month BOOLEAN
|
|
202
|
+
);
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
## Schema Design Best Practices
|
|
206
|
+
|
|
207
|
+
### Use Appropriate Data Types
|
|
208
|
+
|
|
209
|
+
```sql
|
|
210
|
+
-- Good: Appropriate types
|
|
211
|
+
order_id STRING, -- IDs as strings (UUIDs, alphanumeric)
|
|
212
|
+
quantity INT, -- Whole numbers
|
|
213
|
+
unit_price DECIMAL(10,2), -- Money with fixed precision
|
|
214
|
+
order_date DATE, -- Date without time
|
|
215
|
+
created_at TIMESTAMP, -- Date with time
|
|
216
|
+
is_active BOOLEAN, -- True/false flags
|
|
217
|
+
|
|
218
|
+
-- Bad: Wrong types
|
|
219
|
+
order_id INT, -- May overflow, can't handle UUIDs
|
|
220
|
+
unit_price FLOAT, -- Floating point precision issues
|
|
221
|
+
order_date STRING, -- Can't do date math, inconsistent formats
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
### Naming Conventions
|
|
225
|
+
|
|
226
|
+
```sql
|
|
227
|
+
-- Tables: plural, snake_case
|
|
228
|
+
orders, order_items, customer_addresses
|
|
229
|
+
|
|
230
|
+
-- Columns: singular, snake_case
|
|
231
|
+
order_id, customer_name, created_at
|
|
232
|
+
|
|
233
|
+
-- Foreign keys: referenced_table_key
|
|
234
|
+
customer_key, product_key
|
|
235
|
+
|
|
236
|
+
-- Boolean columns: is_* or has_*
|
|
237
|
+
is_active, is_deleted, has_discount
|
|
238
|
+
|
|
239
|
+
-- Timestamps: *_at
|
|
240
|
+
created_at, updated_at, deleted_at
|
|
241
|
+
|
|
242
|
+
-- Dates: *_date
|
|
243
|
+
order_date, ship_date, due_date
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
### Avoid Wide Tables
|
|
247
|
+
|
|
248
|
+
```sql
|
|
249
|
+
-- Bad: Wide table with sparse columns
|
|
250
|
+
CREATE TABLE orders (
|
|
251
|
+
order_id STRING,
|
|
252
|
+
customer_name STRING,
|
|
253
|
+
customer_email STRING,
|
|
254
|
+
customer_phone STRING,
|
|
255
|
+
shipping_address_line1 STRING,
|
|
256
|
+
shipping_address_line2 STRING,
|
|
257
|
+
shipping_city STRING,
|
|
258
|
+
-- ... 50 more columns
|
|
259
|
+
);
|
|
260
|
+
|
|
261
|
+
-- Good: Normalized with joins
|
|
262
|
+
CREATE TABLE orders (order_id STRING, customer_key BIGINT, ...);
|
|
263
|
+
CREATE TABLE customers (customer_key BIGINT, name STRING, email STRING, ...);
|
|
264
|
+
CREATE TABLE addresses (address_key BIGINT, customer_key BIGINT, ...);
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
## Partitioning Strategies
|
|
268
|
+
|
|
269
|
+
### Time-Based Partitioning
|
|
270
|
+
|
|
271
|
+
Most common. Partition by date/time for time-series data.
|
|
272
|
+
|
|
273
|
+
```sql
|
|
274
|
+
CREATE TABLE curated.orders (...)
|
|
275
|
+
PARTITIONED BY (order_date);
|
|
276
|
+
|
|
277
|
+
-- Query with partition filter
|
|
278
|
+
SELECT * FROM curated.orders
|
|
279
|
+
WHERE order_date BETWEEN '2024-01-01' AND '2024-01-31';
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
### Categorical Partitioning
|
|
283
|
+
|
|
284
|
+
Partition by a low-cardinality categorical column (a small, bounded set of values); high-cardinality columns create too many small partitions.
|
|
285
|
+
|
|
286
|
+
```sql
|
|
287
|
+
CREATE TABLE curated.events (...)
|
|
288
|
+
PARTITIONED BY (event_type);
|
|
289
|
+
|
|
290
|
+
-- Query specific event types
|
|
291
|
+
SELECT * FROM curated.events
|
|
292
|
+
WHERE event_type = 'purchase';
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
### Avoid Over-Partitioning
|
|
296
|
+
|
|
297
|
+
```sql
|
|
298
|
+
-- Bad: Too many partitions, small files
|
|
299
|
+
PARTITIONED BY (order_date, customer_id, product_id)
|
|
300
|
+
|
|
301
|
+
-- Good: Partition by date, cluster/sort by other columns
|
|
302
|
+
PARTITIONED BY (order_date)
|
|
303
|
+
CLUSTERED BY (customer_id) INTO 100 BUCKETS
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
## Data Vault (Brief Overview)
|
|
307
|
+
|
|
308
|
+
Alternative modeling approach for enterprise data warehouses.
|
|
309
|
+
|
|
310
|
+
| Component | Purpose |
|
|
311
|
+
|-----------|---------|
|
|
312
|
+
| **Hub** | Business keys (customer_id, product_id) |
|
|
313
|
+
| **Link** | Relationships between hubs |
|
|
314
|
+
| **Satellite** | Descriptive attributes with history |
|
|
315
|
+
|
|
316
|
+
```sql
|
|
317
|
+
-- Hub: Business key
|
|
318
|
+
CREATE TABLE hub_customer (
|
|
319
|
+
customer_hk STRING, -- Hash of business key
|
|
320
|
+
customer_id STRING, -- Business key
|
|
321
|
+
load_date TIMESTAMP,
|
|
322
|
+
record_source STRING
|
|
323
|
+
);
|
|
324
|
+
|
|
325
|
+
-- Satellite: Attributes
|
|
326
|
+
CREATE TABLE sat_customer (
|
|
327
|
+
customer_hk STRING,
|
|
328
|
+
name STRING,
|
|
329
|
+
email STRING,
|
|
330
|
+
load_date TIMESTAMP,
|
|
331
|
+
load_end_date TIMESTAMP,
|
|
332
|
+
record_source STRING
|
|
333
|
+
);
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
## One Big Table (OBT)
|
|
337
|
+
|
|
338
|
+
Denormalized approach for analytics/BI tools.
|
|
339
|
+
|
|
340
|
+
```sql
|
|
341
|
+
-- Pre-joined, denormalized for fast queries
|
|
342
|
+
CREATE TABLE marts.orders_obt AS
|
|
343
|
+
SELECT
|
|
344
|
+
o.order_id,
|
|
345
|
+
o.order_date,
|
|
346
|
+
o.total_amount,
|
|
347
|
+
c.customer_name,
|
|
348
|
+
c.customer_segment,
|
|
349
|
+
c.customer_region,
|
|
350
|
+
p.product_name,
|
|
351
|
+
p.product_category,
|
|
352
|
+
d.month_name,
|
|
353
|
+
d.quarter,
|
|
354
|
+
d.fiscal_year
|
|
355
|
+
FROM facts.orders o
|
|
356
|
+
JOIN dims.customers c ON o.customer_key = c.customer_key
|
|
357
|
+
JOIN dims.products p ON o.product_key = p.product_key
|
|
358
|
+
JOIN dims.date d ON o.date_key = d.date_key
|
|
359
|
+
WHERE c.is_current = true;
|
|
360
|
+
```
|
|
361
|
+
|
|
362
|
+
**Trade-offs:**
|
|
363
|
+
- ✅ Fast queries (no joins)
|
|
364
|
+
- ✅ Simple for BI tools
|
|
365
|
+
- ❌ Data duplication
|
|
366
|
+
- ❌ Harder to update
|
|
367
|
+
- ❌ Wider rows
|