agentic-team-templates 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/README.md +280 -0
  2. package/bin/cli.js +5 -0
  3. package/package.json +47 -0
  4. package/src/index.js +521 -0
  5. package/templates/_shared/code-quality.md +162 -0
  6. package/templates/_shared/communication.md +114 -0
  7. package/templates/_shared/core-principles.md +62 -0
  8. package/templates/_shared/git-workflow.md +165 -0
  9. package/templates/_shared/security-fundamentals.md +173 -0
  10. package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
  11. package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
  12. package/templates/blockchain/.cursorrules/overview.md +130 -0
  13. package/templates/blockchain/.cursorrules/security.md +318 -0
  14. package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
  15. package/templates/blockchain/.cursorrules/testing.md +415 -0
  16. package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
  17. package/templates/blockchain/CLAUDE.md +389 -0
  18. package/templates/cli-tools/.cursorrules/architecture.md +412 -0
  19. package/templates/cli-tools/.cursorrules/arguments.md +406 -0
  20. package/templates/cli-tools/.cursorrules/distribution.md +546 -0
  21. package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
  22. package/templates/cli-tools/.cursorrules/overview.md +136 -0
  23. package/templates/cli-tools/.cursorrules/testing.md +537 -0
  24. package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
  25. package/templates/cli-tools/CLAUDE.md +356 -0
  26. package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
  27. package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
  28. package/templates/data-engineering/.cursorrules/overview.md +85 -0
  29. package/templates/data-engineering/.cursorrules/performance.md +339 -0
  30. package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
  31. package/templates/data-engineering/.cursorrules/security.md +460 -0
  32. package/templates/data-engineering/.cursorrules/testing.md +452 -0
  33. package/templates/data-engineering/CLAUDE.md +974 -0
  34. package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
  35. package/templates/devops-sre/.cursorrules/change-management.md +584 -0
  36. package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
  37. package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
  38. package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
  39. package/templates/devops-sre/.cursorrules/observability.md +714 -0
  40. package/templates/devops-sre/.cursorrules/overview.md +230 -0
  41. package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
  42. package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
  43. package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
  44. package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
  45. package/templates/devops-sre/CLAUDE.md +1007 -0
  46. package/templates/documentation/.cursorrules/adr.md +277 -0
  47. package/templates/documentation/.cursorrules/api-documentation.md +411 -0
  48. package/templates/documentation/.cursorrules/code-comments.md +253 -0
  49. package/templates/documentation/.cursorrules/maintenance.md +260 -0
  50. package/templates/documentation/.cursorrules/overview.md +82 -0
  51. package/templates/documentation/.cursorrules/readme-standards.md +306 -0
  52. package/templates/documentation/CLAUDE.md +120 -0
  53. package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
  54. package/templates/fullstack/.cursorrules/architecture.md +298 -0
  55. package/templates/fullstack/.cursorrules/overview.md +109 -0
  56. package/templates/fullstack/.cursorrules/shared-types.md +348 -0
  57. package/templates/fullstack/.cursorrules/testing.md +386 -0
  58. package/templates/fullstack/CLAUDE.md +349 -0
  59. package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
  60. package/templates/ml-ai/.cursorrules/deployment.md +601 -0
  61. package/templates/ml-ai/.cursorrules/model-development.md +538 -0
  62. package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
  63. package/templates/ml-ai/.cursorrules/overview.md +131 -0
  64. package/templates/ml-ai/.cursorrules/security.md +637 -0
  65. package/templates/ml-ai/.cursorrules/testing.md +678 -0
  66. package/templates/ml-ai/CLAUDE.md +1136 -0
  67. package/templates/mobile/.cursorrules/navigation.md +246 -0
  68. package/templates/mobile/.cursorrules/offline-first.md +302 -0
  69. package/templates/mobile/.cursorrules/overview.md +71 -0
  70. package/templates/mobile/.cursorrules/performance.md +345 -0
  71. package/templates/mobile/.cursorrules/testing.md +339 -0
  72. package/templates/mobile/CLAUDE.md +233 -0
  73. package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
  74. package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
  75. package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
  76. package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
  77. package/templates/platform-engineering/.cursorrules/observability.md +747 -0
  78. package/templates/platform-engineering/.cursorrules/overview.md +215 -0
  79. package/templates/platform-engineering/.cursorrules/security.md +855 -0
  80. package/templates/platform-engineering/.cursorrules/testing.md +878 -0
  81. package/templates/platform-engineering/CLAUDE.md +850 -0
  82. package/templates/utility-agent/.cursorrules/action-control.md +284 -0
  83. package/templates/utility-agent/.cursorrules/context-management.md +186 -0
  84. package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
  85. package/templates/utility-agent/.cursorrules/overview.md +78 -0
  86. package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
  87. package/templates/utility-agent/CLAUDE.md +513 -0
  88. package/templates/web-backend/.cursorrules/api-design.md +255 -0
  89. package/templates/web-backend/.cursorrules/authentication.md +309 -0
  90. package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
  91. package/templates/web-backend/.cursorrules/error-handling.md +366 -0
  92. package/templates/web-backend/.cursorrules/overview.md +69 -0
  93. package/templates/web-backend/.cursorrules/security.md +358 -0
  94. package/templates/web-backend/.cursorrules/testing.md +395 -0
  95. package/templates/web-backend/CLAUDE.md +366 -0
  96. package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
  97. package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
  98. package/templates/web-frontend/.cursorrules/overview.md +72 -0
  99. package/templates/web-frontend/.cursorrules/performance.md +325 -0
  100. package/templates/web-frontend/.cursorrules/state-management.md +227 -0
  101. package/templates/web-frontend/.cursorrules/styling.md +271 -0
  102. package/templates/web-frontend/.cursorrules/testing.md +311 -0
  103. package/templates/web-frontend/CLAUDE.md +399 -0
@@ -0,0 +1,878 @@
1
+ # Infrastructure Testing
2
+
3
+ Guidelines for testing infrastructure, platform components, and reliability.
4
+
5
+ ## Core Principles
6
+
7
+ 1. **Test Early, Test Often** - Validate before apply, not after
8
+ 2. **Test Like Production** - Use realistic data, scale, and scenarios
9
+ 3. **Automate Everything** - Manual testing doesn't scale
10
+ 4. **Test the Recovery** - Verify backups, failover, and disaster recovery
11
+
12
+ ## Testing Pyramid for Infrastructure
13
+
14
+ ```
15
+ ┌─────────────────┐
16
+ │ Chaos Tests │ ← Production resilience
17
+ ╱└─────────────────┘╲
18
+ ╱ ┌─────────────────┐ ╲
19
+ ╱ │ E2E Tests │ ╲ ← Full stack validation
20
+ ╱ ╱└─────────────────┘╲ ╲
21
+ ╱ ╱ ┌─────────────────┐ ╲ ╲
22
+ ╱ ╱ │ Integration │ ╲ ╲ ← Component interaction
23
+ ╱ ╱ ╱└─────────────────┘╲ ╲ ╲
24
+ ╱ ╱ ╱ ┌─────────────────┐ ╲ ╲ ╲
25
+ ╱ ╱ ╱ │ Unit Tests │ ╲ ╲ ╲ ← Module validation
26
+ ╱ ╱ ╱ ╱└─────────────────┘╲ ╲ ╲ ╲
27
+ ╱ ╱ ╱ ╱ ┌─────────────────┐ ╲ ╲ ╲ ╲
28
+ ╱ ╱ ╱ ╱ │ Static Analysis │ ╲ ╲ ╲ ╲ ← Linting, policy
29
+ └───┴───┴───┴───┴─────────────────────┴───┴───┴───┴───┘
30
+ ```
31
+
32
+ ## Static Analysis
33
+
34
+ ### Terraform Validation
35
+
36
+ ```yaml
37
+ # .github/workflows/terraform-validate.yml
38
+ name: Terraform Validation
39
+
40
+ on:
41
+ pull_request:
42
+ paths:
43
+ - 'terraform/**'
44
+
45
+ jobs:
46
+ validate:
47
+ runs-on: ubuntu-latest
48
+ steps:
49
+ - uses: actions/checkout@v4
50
+
51
+ - name: Setup Terraform
52
+ uses: hashicorp/setup-terraform@v3
53
+
54
+ # Format check
55
+ - name: Terraform Format
56
+ run: terraform fmt -check -recursive
57
+ working-directory: terraform
58
+
59
+ # Syntax validation
60
+ - name: Terraform Validate
61
+ run: |
62
+ terraform init -backend=false
63
+ terraform validate
64
+ working-directory: terraform
65
+
66
+ # Linting
67
+ - name: Setup TFLint
68
+ uses: terraform-linters/setup-tflint@v4
69
+
70
+ - name: TFLint
71
+ run: |
72
+ tflint --init
73
+ tflint --recursive
74
+ working-directory: terraform
75
+
76
+ # Security scanning
77
+ - name: tfsec
78
+ uses: aquasecurity/tfsec-action@v1.0.0
79
+ with:
80
+ working_directory: terraform
81
+
82
+ # Policy compliance
83
+ - name: Checkov
84
+ uses: bridgecrewio/checkov-action@v12
85
+ with:
86
+ directory: terraform
87
+ framework: terraform
88
+ quiet: true
89
+ soft_fail: false
90
+ ```
91
+
92
+ ### Kubernetes Manifest Validation
93
+
94
+ ```yaml
95
+ name: Kubernetes Validation
96
+
97
+ on:
98
+ pull_request:
99
+ paths:
100
+ - 'kubernetes/**'
101
+ - 'charts/**'
102
+
103
+ jobs:
104
+ validate:
105
+ runs-on: ubuntu-latest
106
+ steps:
107
+ - uses: actions/checkout@v4
108
+
109
+ # YAML linting
110
+ - name: YAML Lint
111
+ uses: ibiqlik/action-yamllint@v3
112
+ with:
113
+ file_or_dir: kubernetes/
114
+ config_file: .yamllint.yml
115
+
116
+ # Helm linting
117
+ - name: Helm Lint
118
+ run: |
119
+ for chart in charts/*/; do
120
+ helm lint "$chart"
121
+ done
122
+
123
+ # Kubernetes schema validation
124
+ - name: Kubeconform
125
+ uses: docker://ghcr.io/yannh/kubeconform:latest
126
+ with:
127
+ args: >-
128
+ -summary
129
+ -strict
130
+ -kubernetes-version 1.28.0
131
+ kubernetes/
132
+
133
+ # Policy validation
134
+ - name: Kyverno CLI
135
+ run: |
136
+ kyverno apply policies/ --resource kubernetes/
137
+
138
+ # Security scanning
139
+ - name: Kubesec
140
+ run: |
141
+ for file in kubernetes/*.yaml; do
142
+ kubesec scan "$file"
143
+ done
144
+ ```
145
+
146
+ ### Pre-commit Configuration
147
+
148
+ ```yaml
149
+ # .pre-commit-config.yaml
150
+ repos:
151
+ # Terraform
152
+ - repo: https://github.com/antonbabenko/pre-commit-terraform
153
+ rev: v1.83.5
154
+ hooks:
155
+ - id: terraform_fmt
156
+ - id: terraform_validate
157
+ - id: terraform_docs
158
+ args:
159
+ - --hook-config=--path-to-file=README.md
160
+ - --hook-config=--add-to-existing-file=true
161
+ - id: terraform_tflint
162
+ args:
163
+ - --args=--config=__GIT_WORKING_DIR__/.tflint.hcl
164
+ - id: terraform_tfsec
165
+
166
+ # Shell and YAML linting (covers Kubernetes manifests)
167
+ - repo: https://github.com/jumanjihouse/pre-commit-hooks
168
+ rev: 3.0.0
169
+ hooks:
170
+ - id: shellcheck
171
+
172
+ - repo: https://github.com/adrienverge/yamllint
173
+ rev: v1.32.0
174
+ hooks:
175
+ - id: yamllint
176
+ args: [-c=.yamllint.yml]
177
+
178
+ # General
179
+ - repo: https://github.com/pre-commit/pre-commit-hooks
180
+ rev: v4.5.0
181
+ hooks:
182
+ - id: trailing-whitespace
183
+ - id: end-of-file-fixer
184
+ - id: check-yaml
185
+ args: [--allow-multiple-documents]
186
+ - id: check-json
187
+ - id: detect-private-key
188
+ - id: check-merge-conflict
189
+ ```
190
+
191
+ ## Unit Tests (Terratest)
192
+
193
+ ### Module Testing
194
+
195
+ ```go
196
+ // test/vpc_test.go
197
+ package test
198
+
199
+ import (
200
+ "testing"
201
+
202
+ "github.com/gruntwork-io/terratest/modules/terraform"
203
+ "github.com/stretchr/testify/assert"
204
+ "github.com/stretchr/testify/require"
205
+ )
206
+
207
+ func TestVpcModule(t *testing.T) {
208
+ t.Parallel()
209
+
210
+ terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
211
+ TerraformDir: "../modules/networking/vpc",
212
+ Vars: map[string]interface{}{
213
+ "environment": "test",
214
+ "vpc_cidr": "10.0.0.0/16",
215
+ "azs": []string{"us-east-1a", "us-east-1b"},
216
+ "private_subnets": []string{"10.0.1.0/24", "10.0.2.0/24"},
217
+ "public_subnets": []string{"10.0.101.0/24", "10.0.102.0/24"},
218
+ },
219
+ NoColor: true,
220
+ })
221
+
222
+ // Clean up after test
223
+ defer terraform.Destroy(t, terraformOptions)
224
+
225
+ // Deploy the module
226
+ terraform.InitAndApply(t, terraformOptions)
227
+
228
+ // Verify outputs
229
+ vpcId := terraform.Output(t, terraformOptions, "vpc_id")
230
+ assert.NotEmpty(t, vpcId)
231
+
232
+ privateSubnetIds := terraform.OutputList(t, terraformOptions, "private_subnet_ids")
233
+ assert.Len(t, privateSubnetIds, 2)
234
+
235
+ publicSubnetIds := terraform.OutputList(t, terraformOptions, "public_subnet_ids")
236
+ assert.Len(t, publicSubnetIds, 2)
237
+
238
+ // Verify VPC configuration
239
+ vpcCidr := terraform.Output(t, terraformOptions, "vpc_cidr")
240
+ assert.Equal(t, "10.0.0.0/16", vpcCidr)
241
+ }
242
+
243
+ func TestVpcModuleWithNatGateway(t *testing.T) {
244
+ t.Parallel()
245
+
246
+ terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
247
+ TerraformDir: "../modules/networking/vpc",
248
+ Vars: map[string]interface{}{
249
+ "environment": "test",
250
+ "vpc_cidr": "10.1.0.0/16",
251
+ "enable_nat": true,
252
+ "single_nat": false, // HA NAT
253
+ "azs": []string{"us-east-1a", "us-east-1b"},
254
+ "private_subnets": []string{"10.1.1.0/24", "10.1.2.0/24"},
255
+ "public_subnets": []string{"10.1.101.0/24", "10.1.102.0/24"},
256
+ },
257
+ })
258
+
259
+ defer terraform.Destroy(t, terraformOptions)
260
+ terraform.InitAndApply(t, terraformOptions)
261
+
262
+ // Verify NAT gateways
263
+ natGatewayIds := terraform.OutputList(t, terraformOptions, "nat_gateway_ids")
264
+ assert.Len(t, natGatewayIds, 2, "Should have 2 NAT gateways for HA")
265
+ }
266
+ ```
267
+
268
+ ### EKS Cluster Testing
269
+
270
+ ```go
271
+ // test/eks_test.go
272
+ package test
273
+
274
+ import (
275
+ "testing"
276
+ "time"
277
+
278
+ "github.com/gruntwork-io/terratest/modules/k8s"
279
+ "github.com/gruntwork-io/terratest/modules/terraform"
280
+ "github.com/stretchr/testify/assert"
281
+ "github.com/stretchr/testify/require"
282
+ )
283
+
284
+ func TestEksCluster(t *testing.T) {
285
+ t.Parallel()
286
+
287
+ terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
288
+ TerraformDir: "../modules/compute/eks",
289
+ Vars: map[string]interface{}{
290
+ "cluster_name": "test-cluster",
291
+ "kubernetes_version": "1.28",
292
+ "vpc_id": "vpc-12345", // From VPC module output
293
+ "subnet_ids": []string{"subnet-1", "subnet-2"},
294
+ },
295
+ })
296
+
297
+ defer terraform.Destroy(t, terraformOptions)
298
+ terraform.InitAndApply(t, terraformOptions)
299
+
300
+ // Get kubeconfig
301
+ kubeconfig := terraform.Output(t, terraformOptions, "kubeconfig")
302
+ require.NotEmpty(t, kubeconfig)
303
+
304
+ // Configure kubectl
305
+ kubectlOptions := k8s.NewKubectlOptions("", kubeconfig, "default")
306
+
307
+ // Verify cluster is accessible
308
+ k8s.WaitUntilAllNodesReady(t, kubectlOptions, 10, 30*time.Second)
309
+
310
+ // Verify node count
311
+ nodes := k8s.GetNodes(t, kubectlOptions)
312
+ assert.GreaterOrEqual(t, len(nodes), 2)
313
+
314
+ // Verify kube-system pods are running
315
+ k8s.WaitUntilNumPodsCreated(t, kubectlOptions,
316
+ k8s.NewPodFilterOptions("kube-system", "k8s-app=kube-dns"),
317
+ 2, 10, 30*time.Second)
318
+ }
319
+ ```
320
+
321
+ ## Integration Tests
322
+
323
+ ### Helm Chart Testing
324
+
325
+ ```yaml
326
+ # charts/api-server/templates/tests/test-connection.yaml
327
+ apiVersion: v1
328
+ kind: Pod
329
+ metadata:
330
+ name: "{{ include "api-server.fullname" . }}-test-connection"
331
+ labels:
332
+ {{- include "api-server.labels" . | nindent 4 }}
333
+ annotations:
334
+ "helm.sh/hook": test
335
+ "helm.sh/hook-delete-policy": hook-succeeded
336
+ spec:
337
+ containers:
338
+ - name: wget
339
+ image: busybox
340
+ command: ['wget']
341
+ args:
342
+ - '--timeout=5'
343
+ - '--tries=3'
344
+ - '-qO-'
345
+ - 'http://{{ include "api-server.fullname" . }}:{{ .Values.service.port }}/healthz'
346
+ restartPolicy: Never
347
+
348
+ ---
349
+ apiVersion: v1
350
+ kind: Pod
351
+ metadata:
352
+ name: "{{ include "api-server.fullname" . }}-test-metrics"
353
+ annotations:
354
+ "helm.sh/hook": test
355
+ "helm.sh/hook-delete-policy": hook-succeeded
356
+ spec:
357
+ containers:
358
+ - name: curl
359
+ image: curlimages/curl:latest
360
+ command: ['curl']
361
+ args:
362
+ - '--fail'
363
+ - '--silent'
364
+ - 'http://{{ include "api-server.fullname" . }}:{{ .Values.metrics.port }}/metrics'
365
+ restartPolicy: Never
366
+ ```
367
+
368
+ ### Integration Test Suite
369
+
370
+ ```go
371
+ // test/integration/platform_test.go
372
+ package integration
373
+
374
+ import (
375
+ "context"
376
+ "testing"
377
+ "time"
378
+
379
+ "github.com/gruntwork-io/terratest/modules/helm"
380
+ "github.com/gruntwork-io/terratest/modules/k8s"
381
+ "github.com/stretchr/testify/suite"
382
+ )
383
+
384
+ type PlatformIntegrationSuite struct {
385
+ suite.Suite
386
+ kubeconfig string
387
+ kubectlOptions *k8s.KubectlOptions
388
+ namespace string
389
+ }
390
+
391
+ func (s *PlatformIntegrationSuite) SetupSuite() {
392
+ s.namespace = "integration-test-" + time.Now().Format("20060102150405")
393
+ s.kubectlOptions = k8s.NewKubectlOptions("", s.kubeconfig, s.namespace)
394
+
395
+ // Create namespace
396
+ k8s.CreateNamespace(s.T(), s.kubectlOptions, s.namespace)
397
+ }
398
+
399
+ func (s *PlatformIntegrationSuite) TearDownSuite() {
400
+ // Delete namespace
401
+ k8s.DeleteNamespace(s.T(), s.kubectlOptions, s.namespace)
402
+ }
403
+
404
+ func (s *PlatformIntegrationSuite) TestDatabaseDeployment() {
405
+ // Deploy PostgreSQL
406
+ helmOptions := &helm.Options{
407
+ KubectlOptions: s.kubectlOptions,
408
+ SetValues: map[string]string{
409
+ "auth.postgresPassword": "testpassword",
410
+ "primary.persistence.enabled": "false",
411
+ },
412
+ }
413
+
414
+ helm.Install(s.T(), helmOptions, "bitnami/postgresql", "test-db")
415
+ defer helm.Delete(s.T(), helmOptions, "test-db", true)
416
+
417
+ // Wait for pod to be ready
418
+ k8s.WaitUntilPodAvailable(s.T(), s.kubectlOptions, "test-db-postgresql-0", 10, 30*time.Second)
419
+
420
+ // Verify connection
421
+ output, err := k8s.RunKubectlAndGetOutputE(s.T(), s.kubectlOptions,
422
+ "exec", "test-db-postgresql-0", "--",
423
+ "psql", "-U", "postgres", "-c", "SELECT 1")
424
+ s.NoError(err)
425
+ s.Contains(output, "1 row")
426
+ }
427
+
428
+ func (s *PlatformIntegrationSuite) TestServiceMesh() {
429
+ // Deploy test application
430
+ k8s.KubectlApply(s.T(), s.kubectlOptions, "../fixtures/test-app.yaml")
431
+ defer k8s.KubectlDelete(s.T(), s.kubectlOptions, "../fixtures/test-app.yaml")
432
+
433
+ // Wait for sidecar injection
434
+ k8s.WaitUntilPodAvailable(s.T(), s.kubectlOptions, "test-app", 10, 60*time.Second)
435
+
436
+ // Verify sidecar is present
437
+ pod := k8s.GetPod(s.T(), s.kubectlOptions, "test-app")
438
+ s.Len(pod.Spec.Containers, 2, "Should have app + sidecar")
439
+ }
440
+
441
+ func TestPlatformIntegration(t *testing.T) {
442
+ suite.Run(t, new(PlatformIntegrationSuite))
443
+ }
444
+ ```
445
+
446
+ ## Chaos Engineering
447
+
448
+ ### Chaos Mesh
449
+
450
+ ```yaml
451
+ # Pod failure experiment
452
+ apiVersion: chaos-mesh.org/v1alpha1
453
+ kind: PodChaos
454
+ metadata:
455
+ name: api-server-pod-failure
456
+ namespace: chaos-testing
457
+ spec:
458
+ action: pod-failure
459
+ mode: one
460
+ duration: "60s"
461
+ selector:
462
+ namespaces:
463
+ - staging
464
+ labelSelectors:
465
+ app.kubernetes.io/name: api-server
466
+
467
+ ---
468
+ # Network delay experiment
469
+ apiVersion: chaos-mesh.org/v1alpha1
470
+ kind: NetworkChaos
471
+ metadata:
472
+ name: api-server-network-delay
473
+ namespace: chaos-testing
474
+ spec:
475
+ action: delay
476
+ mode: all
477
+ selector:
478
+ namespaces:
479
+ - staging
480
+ labelSelectors:
481
+ app.kubernetes.io/name: api-server
482
+ delay:
483
+ latency: "100ms"
484
+ jitter: "20ms"
485
+ correlation: "50"
486
+ duration: "5m"
487
+
488
+ ---
489
+ # CPU stress experiment
490
+ apiVersion: chaos-mesh.org/v1alpha1
491
+ kind: StressChaos
492
+ metadata:
493
+ name: api-server-cpu-stress
494
+ namespace: chaos-testing
495
+ spec:
496
+ mode: one
497
+ selector:
498
+ namespaces:
499
+ - staging
500
+ labelSelectors:
501
+ app.kubernetes.io/name: api-server
502
+ stressors:
503
+ cpu:
504
+ workers: 2
505
+ load: 80
506
+ duration: "5m"
507
+ ```
508
+
509
+ ### Litmus Chaos
510
+
511
+ ```yaml
512
+ # Chaos experiment for pod delete
513
+ apiVersion: litmuschaos.io/v1alpha1
514
+ kind: ChaosEngine
515
+ metadata:
516
+ name: api-server-chaos
517
+ namespace: staging
518
+ spec:
519
+ appinfo:
520
+ appns: staging
521
+ applabel: 'app.kubernetes.io/name=api-server'
522
+ appkind: deployment
523
+ chaosServiceAccount: litmus-admin
524
+ experiments:
525
+ - name: pod-delete
526
+ spec:
527
+ components:
528
+ env:
529
+ - name: TOTAL_CHAOS_DURATION
530
+ value: '60'
531
+ - name: CHAOS_INTERVAL
532
+ value: '10'
533
+ - name: FORCE
534
+ value: 'false'
535
+ ```
536
+
537
+ ### Chaos Test Workflow
538
+
539
+ ```yaml
540
+ name: Chaos Testing
541
+
542
+ on:
543
+ schedule:
544
+ - cron: '0 2 * * *' # Daily at 02:00 UTC
545
+ workflow_dispatch:
546
+
547
+ jobs:
548
+ chaos-tests:
549
+ runs-on: ubuntu-latest
550
+ environment: staging
551
+
552
+ steps:
553
+ - uses: actions/checkout@v4
554
+
555
+ - name: Configure kubectl
556
+ uses: azure/setup-kubectl@v3
557
+
558
+ - name: Configure kubeconfig
559
+ run: |
560
+ echo "${{ secrets.STAGING_KUBECONFIG }}" > kubeconfig
561
+ echo "KUBECONFIG=$PWD/kubeconfig" >> $GITHUB_ENV
562
+
563
+ - name: Apply chaos experiment
564
+ run: kubectl apply -f chaos/pod-failure.yaml
565
+
566
+ - name: Wait for experiment
567
+ run: sleep 120
568
+
569
+ - name: Check SLO compliance
570
+ run: |
571
+ # Query Prometheus for error rate during chaos
572
+ ERROR_RATE=$(curl -s "http://prometheus.staging:9090/api/v1/query" \
573
+ --data-urlencode 'query=sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))' \
574
+ | jq -r '.data.result[0].value[1]')
575
+
576
+ if (( $(echo "$ERROR_RATE > 0.01" | bc -l) )); then
577
+ echo "SLO violated: error rate $ERROR_RATE > 1%"
578
+ exit 1
579
+ fi
580
+
581
+ - name: Delete chaos experiment
582
+ if: always()
583
+ run: kubectl delete -f chaos/pod-failure.yaml
584
+ ```
585
+
586
+ ## Load Testing
587
+
588
+ ### k6 Load Test
589
+
590
+ ```javascript
591
+ // load-tests/api-server.js
592
+ import http from 'k6/http';
593
+ import { check, sleep } from 'k6';
594
+ import { Rate, Trend } from 'k6/metrics';
595
+
596
+ const errorRate = new Rate('errors');
597
+ const latency = new Trend('latency');
598
+
599
+ export const options = {
600
+ stages: [
601
+ { duration: '2m', target: 100 }, // Ramp up
602
+ { duration: '5m', target: 100 }, // Stay at 100
603
+ { duration: '2m', target: 200 }, // Ramp up more
604
+ { duration: '5m', target: 200 }, // Stay at 200
605
+ { duration: '2m', target: 0 }, // Ramp down
606
+ ],
607
+ thresholds: {
608
+ 'http_req_duration': ['p(99)<500'], // 99% of requests under 500ms
609
+ 'errors': ['rate<0.01'], // Error rate under 1%
610
+ },
611
+ };
612
+
613
+ export default function () {
614
+ const BASE_URL = __ENV.BASE_URL || 'http://api.staging.example.com';
615
+
616
+ // GET request
617
+ const getRes = http.get(`${BASE_URL}/api/v1/health`);
618
+ check(getRes, {
619
+ 'GET status is 200': (r) => r.status === 200,
620
+ });
621
+ errorRate.add(getRes.status !== 200);
622
+ latency.add(getRes.timings.duration);
623
+
624
+ sleep(1);
625
+
626
+ // POST request
627
+ const payload = JSON.stringify({
628
+ name: 'Test User',
629
+ email: `test-${Date.now()}@example.com`,
630
+ });
631
+
632
+ const postRes = http.post(`${BASE_URL}/api/v1/users`, payload, {
633
+ headers: { 'Content-Type': 'application/json' },
634
+ });
635
+ check(postRes, {
636
+ 'POST status is 201': (r) => r.status === 201,
637
+ });
638
+ errorRate.add(postRes.status !== 201);
639
+ latency.add(postRes.timings.duration);
640
+
641
+ sleep(1);
642
+ }
643
+ ```
644
+
645
+ ### Load Test Pipeline
646
+
647
+ ```yaml
648
+ name: Load Testing
649
+
650
+ on:
651
+ workflow_dispatch:
652
+ inputs:
653
+ target_vus:
654
+ description: 'Target virtual users'
655
+ required: true
656
+ default: '100'
657
+ duration:
658
+ description: 'Test duration'
659
+ required: true
660
+ default: '5m'
661
+
662
+ jobs:
663
+ load-test:
664
+ runs-on: ubuntu-latest
665
+ steps:
666
+ - uses: actions/checkout@v4
667
+
668
+ - name: Run k6 load test
669
+ uses: grafana/k6-action@v0.3.1
670
+ with:
671
+ filename: load-tests/api-server.js
672
+ flags: --vus ${{ inputs.target_vus }} --duration ${{ inputs.duration }}
673
+ env:
674
+ BASE_URL: https://api.staging.example.com
675
+ K6_CLOUD_TOKEN: ${{ secrets.K6_CLOUD_TOKEN }}
676
+
677
+ - name: Upload results
678
+ uses: actions/upload-artifact@v3
679
+ with:
680
+ name: k6-results
681
+ path: k6-results.json
682
+ ```
683
+
684
+ ## Disaster Recovery Testing
685
+
686
+ ### Backup Verification
687
+
688
+ ```yaml
689
+ name: Backup Verification
690
+
691
+ on:
692
+ schedule:
693
+ - cron: '0 4 * * 0' # Weekly on Sunday at 04:00 UTC
694
+
695
+ jobs:
696
+ verify-backups:
697
+ runs-on: ubuntu-latest
698
+
699
+ steps:
700
+ - uses: actions/checkout@v4
701
+
702
+ - name: Configure AWS
703
+ uses: aws-actions/configure-aws-credentials@v4
704
+ with:
705
+ role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
706
+ aws-region: us-east-1
707
+
708
+ - name: List recent backups
709
+ run: |
710
+ aws s3 ls s3://company-backups/database/ --recursive | tail -10
711
+
712
+ - name: Restore to test environment
713
+ run: |
714
+ # Get latest backup
715
+ LATEST=$(aws s3 ls s3://company-backups/database/ --recursive | sort | tail -1 | awk '{print $4}')
716
+
717
+ # Download backup
718
+ aws s3 cp "s3://company-backups/$LATEST" backup.sql.gz
719
+
720
+ # Restore to test database
721
+ gunzip backup.sql.gz
722
+ PGPASSWORD=${{ secrets.TEST_DB_PASSWORD }} psql \
723
+ -h test-db.example.com \
724
+ -U postgres \
725
+ -d test_restore \
726
+ -f backup.sql
727
+
728
+ - name: Verify data integrity
729
+ run: |
730
+ # Run verification queries
731
+ PGPASSWORD=${{ secrets.TEST_DB_PASSWORD }} psql \
732
+ -h test-db.example.com \
733
+ -U postgres \
734
+ -d test_restore \
735
+ -c "SELECT COUNT(*) FROM users" | grep -q "[0-9]"
736
+
737
+ - name: Cleanup
738
+ if: always()
739
+ run: |
740
+ PGPASSWORD=${{ secrets.TEST_DB_PASSWORD }} psql \
741
+ -h test-db.example.com \
742
+ -U postgres \
743
+ -c "DROP DATABASE IF EXISTS test_restore"
744
+ ```
745
+
746
+ ### Failover Testing
747
+
748
+ ```yaml
749
+ name: Failover Test
750
+
751
+ on:
752
+ workflow_dispatch:
753
+
754
+ jobs:
755
+ failover-test:
756
+ runs-on: ubuntu-latest
757
+ environment: staging
758
+
759
+ steps:
760
+ - name: Record baseline metrics
761
+ run: |
762
+ # Capture the current request rate as a baseline
763
+ curl -s "http://prometheus.staging:9090/api/v1/query?query=rate(http_requests_total[5m])" > baseline.json
764
+
765
+ - name: Trigger failover
766
+ run: |
767
+ # Simulate a failure by deleting the api-server pods (without waiting for termination)
768
+ kubectl delete pod -l app.kubernetes.io/name=api-server --wait=false
769
+
770
+ - name: Wait for recovery
771
+ run: |
772
+ # Wait for pods to recover
773
+ kubectl rollout status deployment/api-server --timeout=300s
774
+
775
+ - name: Verify service health
776
+ run: |
777
+ # Check health endpoint
778
+ for i in {1..10}; do
779
+ curl -sf https://api.staging.example.com/health && break
780
+ sleep 5
781
+ done
782
+
783
+ - name: Compare metrics
784
+ run: |
785
+ # Verify error rate returned to normal
786
+ ERROR_RATE=$(curl -s "http://prometheus.staging:9090/api/v1/query" \
787
+ --data-urlencode 'query=rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m])' \
788
+ | jq -r '.data.result[0].value[1]')
789
+
790
+ if (( $(echo "$ERROR_RATE > 0.05" | bc -l) )); then
791
+ echo "Recovery failed: error rate still high ($ERROR_RATE)"
792
+ exit 1
793
+ fi
794
+
795
+ echo "Recovery successful: error rate $ERROR_RATE"
796
+ ```
797
+
798
+ ## Common Pitfalls
799
+
800
+ ### 1. Testing Only the Happy Path
801
+
802
+ ```go
803
+ // Bad - only tests success
804
+ func TestCreateUser(t *testing.T) {
805
+ user := createUser(validData)
806
+ assert.NotNil(t, user)
807
+ }
808
+
809
+ // Good - tests failures too
810
+ func TestCreateUser(t *testing.T) {
811
+ t.Run("valid data succeeds", func(t *testing.T) {
812
+ user := createUser(validData)
813
+ assert.NotNil(t, user)
814
+ })
815
+
816
+ t.Run("duplicate email fails", func(t *testing.T) {
817
+ _, err := createUser(duplicateEmail)
818
+ assert.ErrorIs(t, err, ErrDuplicateEmail)
819
+ })
820
+
821
+ t.Run("invalid data returns validation error", func(t *testing.T) {
822
+ _, err := createUser(invalidData)
823
+ assert.ErrorIs(t, err, ErrValidation)
824
+ })
825
+ }
826
+ ```
827
+
828
+ ### 2. Tests Depend on Order
829
+
830
+ ```go
831
+ // Bad - tests depend on each other
832
+ func TestA(t *testing.T) { globalState = "A" }
833
+ func TestB(t *testing.T) { assert.Equal(t, "A", globalState) } // Fails if run alone
834
+
835
+ // Good - isolated tests
836
+ func TestA(t *testing.T) {
837
+ state := setupState()
838
+ defer teardown(state)
839
+ // ...
840
+ }
841
+ ```
842
+
843
+ ### 3. Flaky Tests
844
+
845
+ ```go
846
+ // Bad - timing dependent
847
+ func TestAsync(t *testing.T) {
848
+ triggerAsyncJob()
849
+ time.Sleep(time.Second) // Might not be enough
850
+ assert.True(t, jobCompleted())
851
+ }
852
+
853
+ // Good - wait with timeout
854
+ func TestAsync(t *testing.T) {
855
+ triggerAsyncJob()
856
+ require.Eventually(t, func() bool {
857
+ return jobCompleted()
858
+ }, 30*time.Second, time.Second)
859
+ }
860
+ ```
861
+
862
+ ### 4. No Cleanup
863
+
864
+ ```go
865
+ // Bad - leaves resources behind
866
+ func TestEks(t *testing.T) {
867
+ terraform.Apply(t, opts)
868
+ // Test code...
869
+ // Forgot to destroy!
870
+ }
871
+
872
+ // Good - always cleanup
873
+ func TestEks(t *testing.T) {
874
+ defer terraform.Destroy(t, opts) // Register cleanup BEFORE Apply so it runs even if Apply fails
875
+ terraform.Apply(t, opts)
876
+ // Test code...
877
+ }
878
+ ```