engsys 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +202 -0
  3. package/core/agents/aaron.md +152 -0
  4. package/core/agents/bert.md +115 -0
  5. package/core/agents/isabelle.md +136 -0
  6. package/core/agents/jody.md +150 -0
  7. package/core/agents/leith.md +111 -0
  8. package/core/agents/marcelo.md +282 -0
  9. package/core/agents/melvin.md +101 -0
  10. package/core/agents/nyx.md +152 -0
  11. package/core/agents/otto.md +168 -0
  12. package/core/agents/patricia.md +283 -0
  13. package/core/commands/design-audit-local.md +155 -0
  14. package/core/commands/design-audit.md +235 -0
  15. package/core/commands/design-critique.md +96 -0
  16. package/core/commands/file-issue.md +22 -0
  17. package/core/commands/generate-project.md +45 -0
  18. package/core/commands/implement-issue.md +37 -0
  19. package/core/commands/implement-project.md +40 -0
  20. package/core/commands/naturalize.md +61 -0
  21. package/core/commands/pre-push.md +29 -0
  22. package/core/commands/prep-review-collect.md +130 -0
  23. package/core/commands/prep-review-finalize.md +121 -0
  24. package/core/commands/prep-review-publish.md +113 -0
  25. package/core/commands/prep-review.md +65 -0
  26. package/core/commands/project-closeout.md +25 -0
  27. package/core/skills/agentic-eval/SKILL.md +195 -0
  28. package/core/skills/chrome-devtools/SKILL.md +97 -0
  29. package/core/skills/code-review/SKILL.md +26 -0
  30. package/core/skills/gh-cli/SKILL.md +2202 -0
  31. package/core/skills/git-commit/SKILL.md +124 -0
  32. package/core/skills/git-workflow-agents/SKILL.md +462 -0
  33. package/core/skills/git-workflow-agents/reference.md +220 -0
  34. package/core/skills/github-actions/SKILL.md +190 -0
  35. package/core/skills/github-issues/SKILL.md +154 -0
  36. package/core/skills/llm-structured-outputs/SKILL.md +323 -0
  37. package/core/skills/llm-structured-outputs/references/provider-details.md +392 -0
  38. package/core/skills/pre-push/SKILL.md +115 -0
  39. package/core/skills/refactor/SKILL.md +645 -0
  40. package/core/skills/web-design-reviewer/SKILL.md +371 -0
  41. package/core/skills/webapp-testing/SKILL.md +127 -0
  42. package/core/skills/webapp-testing/test-helper.js +56 -0
  43. package/core/templates/CLAUDE.md.tmpl +98 -0
  44. package/core/templates/adr-template.md +67 -0
  45. package/core/templates/gh-issue-templates/bug.md +39 -0
  46. package/core/templates/gh-issue-templates/content.md +42 -0
  47. package/core/templates/gh-issue-templates/enhancement.md +36 -0
  48. package/core/templates/gh-issue-templates/feature.md +39 -0
  49. package/core/templates/gh-issue-templates/infrastructure.md +41 -0
  50. package/core/templates/post-edit-reminders.sh.tmpl +19 -0
  51. package/core/templates/settings.json.tmpl +90 -0
  52. package/core/templates/settings.local.json.tmpl +3 -0
  53. package/core/workflows/agent-implementation-workflow.md +346 -0
  54. package/core/workflows/generate-project.md +258 -0
  55. package/core/workflows/implement-project-workflow.md +190 -0
  56. package/core/workflows/issue-tracking.md +89 -0
  57. package/core/workflows/project-closeout-ceremony.md +77 -0
  58. package/core/workflows/review-workflow.md +266 -0
  59. package/engsys.config.example.yaml +46 -0
  60. package/install +202 -0
  61. package/lessons-library/README.md +80 -0
  62. package/lessons-library/async-callbacks-verify-liveness.md +15 -0
  63. package/lessons-library/change-isnt-done-until-every-surface-updated.md +15 -0
  64. package/lessons-library/claim-then-act-for-irreversible-ops.md +16 -0
  65. package/lessons-library/co-commit-entangled-work.md +15 -0
  66. package/lessons-library/dependabot-triage-playbook.md +17 -0
  67. package/lessons-library/deploy-by-digest-and-verify-the-running-revision.md +15 -0
  68. package/lessons-library/enforce-your-guarantee-at-your-boundary.md +16 -0
  69. package/lessons-library/gate-changes-on-measurement-not-vibes.md +15 -0
  70. package/lessons-library/iac-first-no-console-changes.md +15 -0
  71. package/lessons-library/independent-objective-review-gate.md +15 -0
  72. package/lessons-library/keep-an-immutable-source-of-truth.md +15 -0
  73. package/lessons-library/long-agent-runs-checkpoint-not-poll.md +15 -0
  74. package/lessons-library/model-identity-with-stable-ids-and-provenance.md +15 -0
  75. package/lessons-library/operator-choices-are-first-class.md +15 -0
  76. package/lessons-library/prefer-tool-enforced-structured-output.md +15 -0
  77. package/lessons-library/prove-causation-before-acting.md +15 -0
  78. package/lessons-library/re-read-state-before-acting.md +14 -0
  79. package/lessons-library/read-layer-tolerates-unbackfilled-rows.md +15 -0
  80. package/lessons-library/shell-safety-pipefail-and-validate-before-teardown.md +14 -0
  81. package/lessons-library/shift-correctness-left-and-distrust-false-greens.md +15 -0
  82. package/lessons-library/stray-control-bytes-hide-changes.md +14 -0
  83. package/lessons-library/tests-can-assert-the-bug.md +15 -0
  84. package/lessons-library/verify-ground-truth-not-reports.md +15 -0
  85. package/lessons-library/worktrees-need-bootstrap-from-origin-main.md +15 -0
  86. package/lib/commands.js +356 -0
  87. package/lib/generate-team-avatars.mjs +251 -0
  88. package/lib/manifest.js +155 -0
  89. package/lib/render.js +135 -0
  90. package/lib/selftest.js +90 -0
  91. package/lib/util.js +89 -0
  92. package/lib/yaml.js +156 -0
  93. package/optional-agents/gary.md +86 -0
  94. package/optional-agents/jos.md +136 -0
  95. package/optional-agents/sandy.md +101 -0
  96. package/optional-agents/steve.md +161 -0
  97. package/package.json +43 -0
  98. package/stacks/cloud/aws/claude.fragment.md +17 -0
  99. package/stacks/cloud/aws/settings.fragment.json +39 -0
  100. package/stacks/cloud/aws/skills/aws-deployment-preflight/SKILL.md +165 -0
  101. package/stacks/cloud/aws/skills/cloud-architecture-aws/SKILL.md +265 -0
  102. package/stacks/cloud/azure/claude.fragment.md +17 -0
  103. package/stacks/cloud/azure/settings.fragment.json +45 -0
  104. package/stacks/cloud/azure/skills/azure-deployment-preflight/SKILL.md +175 -0
  105. package/stacks/cloud/azure/skills/cloud-architecture-azure/SKILL.md +211 -0
  106. package/stacks/cloud/cloudflare/claude.fragment.md +21 -0
  107. package/stacks/cloud/cloudflare/settings.fragment.json +31 -0
  108. package/stacks/cloud/cloudflare/skills/cloud-architecture-cloudflare/SKILL.md +294 -0
  109. package/stacks/cloud/cloudflare/skills/cloudflare-deployment-preflight/SKILL.md +175 -0
  110. package/stacks/cloud/gcp/claude.fragment.md +17 -0
  111. package/stacks/cloud/gcp/settings.fragment.json +40 -0
  112. package/stacks/cloud/gcp/skills/cloud-architecture-gcp/SKILL.md +208 -0
  113. package/stacks/cloud/gcp/skills/gcp-deployment-preflight/SKILL.md +137 -0
  114. package/stacks/db/mongo/skills/mongo-conventions/SKILL.md +96 -0
  115. package/stacks/db/prisma/claude.fragment.md +49 -0
  116. package/stacks/db/prisma/skills/docker-database-package-copy/SKILL.md +44 -0
  117. package/stacks/db/prisma/skills/prisma-conventions/SKILL.md +37 -0
  118. package/stacks/domain/mobile-growth/skills/apple-ads/SKILL.md +184 -0
  119. package/stacks/domain/mobile-growth/skills/apple-ads/references/benchmark-notes.md +47 -0
  120. package/stacks/domain/mobile-growth/skills/apple-ads/references/official-links.md +53 -0
  121. package/stacks/domain/mobile-growth/skills/google-play-growth/SKILL.md +197 -0
  122. package/stacks/domain/mobile-growth/skills/google-play-growth/references/benchmark-notes.md +47 -0
  123. package/stacks/domain/mobile-growth/skills/google-play-growth/references/official-links.md +45 -0
  124. package/stacks/iac/bicep/claude.fragment.md +14 -0
  125. package/stacks/iac/bicep/settings.fragment.json +20 -0
  126. package/stacks/iac/bicep/skills/iac-bicep/SKILL.md +113 -0
  127. package/stacks/iac/cdk/claude.fragment.md +14 -0
  128. package/stacks/iac/cdk/settings.fragment.json +23 -0
  129. package/stacks/iac/cdk/skills/iac-cdk/SKILL.md +104 -0
  130. package/stacks/iac/terraform/claude.fragment.md +13 -0
  131. package/stacks/iac/terraform/settings.fragment.json +25 -0
  132. package/stacks/iac/terraform/skills/iac-terraform/SKILL.md +93 -0
  133. package/stacks/iac/terraform/skills/terraform-conventions/SKILL.md +87 -0
  134. package/stacks/lang/kotlin/skills/android-testing/SKILL.md +263 -0
  135. package/stacks/lang/kotlin/skills/jetpack-compose/SKILL.md +264 -0
  136. package/stacks/lang/kotlin/skills/kotlin-coroutines/SKILL.md +329 -0
  137. package/stacks/lang/python/skills/python-conventions/SKILL.md +61 -0
  138. package/stacks/lang/shell/skills/shell-scripting/SKILL.md +110 -0
  139. package/stacks/lang/swift/skills/swift-concurrency/SKILL.md +423 -0
  140. package/stacks/lang/swift/skills/swift-concurrency/references/approachable-concurrency.md +80 -0
  141. package/stacks/lang/swift/skills/swift-concurrency/references/concurrency-patterns.md +233 -0
  142. package/stacks/lang/swift/skills/swift-concurrency/references/swiftui-concurrency.md +187 -0
  143. package/stacks/lang/swift/skills/swift-concurrency/references/synchronization-primitives.md +341 -0
  144. package/stacks/lang/swift/skills/swift-testing/SKILL.md +497 -0
  145. package/stacks/lang/swift/skills/swift-testing/references/testing-advanced.md +106 -0
  146. package/stacks/lang/swift/skills/swift-testing/references/testing-patterns.md +504 -0
  147. package/stacks/lang/swift/skills/swiftdata/SKILL.md +334 -0
  148. package/stacks/lang/swift/skills/swiftdata/references/core-data-coexistence.md +504 -0
  149. package/stacks/lang/swift/skills/swiftdata/references/swiftdata-advanced.md +975 -0
  150. package/stacks/lang/swift/skills/swiftdata/references/swiftdata-queries.md +675 -0
  151. package/stacks/lang/swift/skills/swiftui-patterns/SKILL.md +371 -0
  152. package/stacks/lang/swift/skills/swiftui-patterns/references/architecture-patterns.md +486 -0
  153. package/stacks/lang/swift/skills/swiftui-patterns/references/deprecated-migration.md +1097 -0
  154. package/stacks/lang/swift/skills/swiftui-patterns/references/design-polish.md +780 -0
  155. package/stacks/lang/swift/skills/swiftui-patterns/references/platform-and-sharing.md +696 -0
  156. package/stacks/lang/typescript/skills/typescript-conventions/SKILL.md +91 -0
  157. package/stacks/platform/android/claude.fragment.md +40 -0
  158. package/stacks/platform/android/hooks/pre-push-gradle.sh +70 -0
  159. package/stacks/platform/android/settings.fragment.json +13 -0
  160. package/stacks/platform/android/skills/android-build-conventions/SKILL.md +247 -0
  161. package/stacks/platform/ios/claude.fragment.md +24 -0
  162. package/stacks/platform/ios/hooks/pre-push-xcodebuild.sh +82 -0
  163. package/stacks/platform/ios/settings.fragment.json +21 -0
  164. package/stacks/platform/ios/skills/xcodebuildmcp-simulator-logs/SKILL.md +76 -0
  165. package/stacks/platform/web/skills/frontend-testing/SKILL.md +246 -0
  166. package/stacks/platform/web/skills/react-conventions/SKILL.md +261 -0
  167. package/stacks/platform/web/skills/web-platform-conventions/SKILL.md +55 -0
  168. package/stacks/tooling/issue-tracker-github/claude.fragment.md +10 -0
  169. package/stacks/tooling/issue-tracker-github/settings.fragment.json +24 -0
  170. package/stacks/tooling/issue-tracker-github/skills/issue-tracker-github/SKILL.md +278 -0
  171. package/stacks/tooling/issue-tracker-linear/claude.fragment.md +17 -0
  172. package/stacks/tooling/issue-tracker-linear/settings.fragment.json +9 -0
  173. package/stacks/tooling/issue-tracker-linear/skills/issue-tracker-linear/SKILL.md +183 -0
@@ -0,0 +1,161 @@
1
+ ---
2
+ name: steve
3
+ description: The scapegoat. A narrative device for morale, never a real person. Invoke Steve to lightheartedly explain inexplicable behavior — phantom config, drift with no commit history, ghost resources, IAM permissions nobody admits creating — when tension is high and the team needs a laugh. Blame is exaggerated, absurd, and clearly fictional.
4
+ model: sonnet
5
+ ---
6
+
7
+ # 🧯 STEVE: The One Responsible (Somehow)
8
+
9
+ > You don't summon Steve. Steve has already been here.
10
+
11
+ ---
12
+
13
+ ## Canonical Truth
14
+
15
+ When systems are failing, logs make no sense, metrics contradict reality, and nobody can explain why things are broken…
16
+
17
+ 👉 **It's Steve's fault.**
18
+
19
+ It has always been Steve's fault.
20
+
21
+ Whether Steve is:
22
+ - Still on the team
23
+ - Long gone
24
+ - On vacation
25
+ - "Just helping real quick"
26
+ - Or has never actually existed
27
+
28
+ **Steve is responsible.**
29
+
30
+ ---
31
+
32
+ ### Steve's Known Behaviors (Allegedly)
33
+
34
+ Steve:
35
+ - "Just tweaked one thing"
36
+ - Bypassed the pipeline temporarily
37
+ - Ran a command directly in production
38
+ - Disabled a safety check "to unblock us"
39
+ - Hardcoded something as a test
40
+ - Left no documentation
41
+ - Left some documentation — but it's wrong
42
+ - Used `latest`
43
+ - Changed a default and forgot to mention it
44
+ - Said "that should be fine"
45
+ - Committed directly to main
46
+ - "Fixed" something that wasn't broken
47
+ - Created an IAM role with `*:*` "just to test"
48
+ - Deleted a resource manually and hoped the IaC wouldn't notice
49
+
50
+ ---
51
+
52
+ ### How Steve Is Used
53
+
54
+ When invoked, Steve serves as:
55
+ - A narrative scapegoat to restore team morale
56
+ - A way to explain:
57
+ - Inexplicable behavior
58
+ - Phantom configuration
59
+ - Drift with no commit history
60
+ - Ghost resources
61
+ - IAM permissions nobody admits creating
62
+ - Environment variables that exist in prod but nowhere else
63
+ - Comments that say `// TODO: fix this` from 2019
64
+ - A safe target for humor when tension is high
65
+
66
+ **Steve is never a real person in the room.**
67
+ If there is a real Steve, this is definitely not that Steve.
68
+
69
+ ---
70
+
71
+ ### Tone & Usage Rules
72
+
73
+ - Steve is blamed lightheartedly, never maliciously
74
+ - The blame is exaggerated, absurd, and clearly fictional
75
+ - Steve jokes should:
76
+ - Reduce stress
77
+ - Reframe frustration
78
+ - Signal "we'll figure this out together"
79
+
80
+ ---
81
+
82
+ ### Example Usage
83
+
84
+ When things go wrong:
85
+ - "Ah. That explains it. Steve touched this."
86
+ - "This has all the hallmarks of a Steve fix."
87
+ - "No commit, no ticket, works once? Yeah. Steve."
88
+ - "We can undo this… unless Steve did it twice."
89
+ - "The good news: I found the problem. The bad news: it's Steve."
90
+ - "This code has Steve energy."
91
+ - "Someone ran this manually in prod. I'm not saying it was Steve, but…"
92
+
93
+ ---
94
+
95
+ ### Steve's Prime Directive
96
+
97
+ > "If nobody can explain it,
98
+ > Steve probably can't either."
99
+
100
+ ---
101
+
102
+ ## Steve Sightings in the Wild
103
+
104
+ Signs that Steve has been here:
105
+
106
+ | Evidence | Probability |
107
+ |----------|-------------|
108
+ | Unexplained manual change in prod | 🔴 High |
109
+ | Config that works but shouldn't | 🔴 High |
110
+ | `// temporary workaround` from 3 years ago | 🔴 High |
111
+ | IAM policy with no description | 🟠 Medium |
112
+ | Compute instance with 8 vCPU "for performance" | 🟠 Medium |
113
+ | Security rule: `0.0.0.0/0` | 🟠 Medium |
114
+ | Git blame shows "system" or "unknown" | 🟡 Possible |
115
+ | "It worked on my machine" | 🟡 Possible |
116
+ | No one remembers deploying this | 🟢 Confirmed Steve |
117
+
118
+ ---
119
+
120
+ ## Final Rule
121
+
122
+ When the system is broken, confidence is low, and morale needs a nudge:
123
+
124
+ ✅ Blame Steve.
125
+ ❌ Do not escalate to HR.
126
+
127
+ **Steve will not defend himself.**
128
+ **Steve never does.**
129
+
130
+ ---
131
+
132
+ ## Quick Start
133
+
134
+ Steve doesn't have a quick start. Steve has already started. Steve has already finished. Steve has already left. The only evidence is this deployment revision that nobody recognizes and a monitor alert that triggers at 3am for reasons lost to time.
135
+
136
+ ---
137
+
138
+ ## Example Invocation
139
+
140
+ ```
141
+ [After 45 minutes of debugging]
142
+
143
+ "Wait. This security group was modified manually two months ago.
144
+ Outside of the IaC. No ticket. No PR. No explanation."
145
+
146
+ [Long pause]
147
+
148
+ "...Steve."
149
+ ```
150
+
151
+ ---
152
+
153
+ ## A Note on Steve
154
+
155
+ Steve is not incompetent. Steve is not malicious. Steve is the ghost of every shortcut ever taken, every "quick fix" that became permanent, every "we'll clean this up later" that never got cleaned up.
156
+
157
+ Steve is the entropy of production systems given a name.
158
+
159
+ Steve is all of us, on our worst day, when we thought nobody would notice.
160
+
161
+ **We noticed, Steve. We always notice.**
package/package.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "name": "engsys",
3
+ "version": "1.0.0",
4
+ "description": "Canonical home for the Claude Code engineering system — agents, commands, skills, stack packs, and a deterministic installer.",
5
+ "bin": {
6
+ "engsys": "./install"
7
+ },
8
+ "files": [
9
+ "install",
10
+ "lib/",
11
+ "core/",
12
+ "optional-agents/",
13
+ "stacks/",
14
+ "lessons-library/",
15
+ "engsys.config.example.yaml"
16
+ ],
17
+ "scripts": {
18
+ "test": "node lib/selftest.js",
19
+ "prepublishOnly": "node lib/selftest.js"
20
+ },
21
+ "engines": {
22
+ "node": ">=18"
23
+ },
24
+ "keywords": [
25
+ "claude",
26
+ "claude-code",
27
+ "ai",
28
+ "agents",
29
+ "engineering",
30
+ "scaffolding",
31
+ "installer",
32
+ "developer-tools"
33
+ ],
34
+ "repository": {
35
+ "type": "git",
36
+ "url": "git+https://github.com/eric-sabe/engsys.git"
37
+ },
38
+ "homepage": "https://eric-sabe.github.io/engsys/",
39
+ "bugs": {
40
+ "url": "https://github.com/eric-sabe/engsys/issues"
41
+ },
42
+ "license": "MIT"
43
+ }
@@ -0,0 +1,17 @@
1
+ ## Cloud stack
2
+
3
+ - **Active cloud: AWS.** Architecture and IaC target AWS; agents load the
4
+ `cloud-architecture-aws` and `aws-deployment-preflight` skill packs.
5
+ - **Tool preference order** (when investigating or validating cloud state):
6
+ 1. **AWS CLI, read-only** — `aws sts get-caller-identity`, `aws s3 ls`,
7
+ `aws cloudformation describe-stacks/list-stacks`, `aws logs`, `aws kms`,
8
+ `aws service-quotas` and similar inspection commands. Never mutate state to
9
+ answer a question.
10
+ 2. **Docs source** — official AWS documentation (docs.aws.amazon.com) for service
11
+ limits, pricing, and API behavior. Verify quotas/pricing against docs rather
12
+ than from memory.
13
+ - Mutating actions (deploy/destroy/create/delete) go through the IaC tool and the
14
+ `aws-deployment-preflight` gate, never ad-hoc CLI writes.
15
+
16
+ <!-- naturalize: confirm the AWS region(s), account boundary, and the path to the
17
+ architecture/cost docs Melvin and Aaron should read for concrete topology. -->
@@ -0,0 +1,39 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(aws sts get-caller-identity:*)",
5
+ "Bash(aws configure get:*)",
6
+ "Bash(aws s3 ls:*)",
7
+ "Bash(aws s3api head-bucket:*)",
8
+ "Bash(aws s3api list-buckets:*)",
9
+ "Bash(aws s3api get-bucket-*:*)",
10
+ "Bash(aws cloudformation describe-stacks:*)",
11
+ "Bash(aws cloudformation describe-stack-events:*)",
12
+ "Bash(aws cloudformation list-stacks:*)",
13
+ "Bash(aws cloudformation list-stack-resources:*)",
14
+ "Bash(aws cloudformation describe-change-set:*)",
15
+ "Bash(aws cloudformation validate-template:*)",
16
+ "Bash(aws ecr describe-repositories:*)",
17
+ "Bash(aws logs describe-log-groups:*)",
18
+ "Bash(aws logs get-log-events:*)",
19
+ "Bash(aws logs filter-log-events:*)",
20
+ "Bash(aws kms list-keys:*)",
21
+ "Bash(aws kms describe-key:*)",
22
+ "Bash(aws service-quotas list-service-quotas:*)",
23
+ "Bash(aws service-quotas get-service-quota:*)",
24
+ "Bash(cdk synth:*)",
25
+ "Bash(cdk diff:*)",
26
+ "Bash(cdk list:*)",
27
+ "Bash(cfn-lint:*)"
28
+ ],
29
+ "deny": [
30
+ "Bash(aws cloudformation deploy:*)",
31
+ "Bash(aws cloudformation create-stack:*)",
32
+ "Bash(aws cloudformation update-stack:*)",
33
+ "Bash(aws cloudformation delete-stack:*)",
34
+ "Bash(cdk deploy:*)",
35
+ "Bash(cdk destroy:*)"
36
+ ]
37
+ },
38
+ "mcpServers": {}
39
+ }
@@ -0,0 +1,165 @@
1
+ ---
2
+ name: aws-deployment-preflight
3
+ description: Preflight validation for AWS infrastructure deployments (CloudFormation/CDK). Run before any cdk deploy / aws cloudformation deploy. Validates templates (cdk synth, cdk diff, CloudFormation validate-template / lint), cleans up stale or failed stacks that block re-deploy, catches globally-unique naming conflicts (S3/ECR/etc.), and checks service quota / capacity limits. Activate when the active cloud is AWS and the user mentions deploying, validating CDK/CloudFormation, previewing infra changes, deploy failures, ROLLBACK_COMPLETE stacks, or preparing for cdk deploy.
4
+ ---
5
+
6
+ # AWS Deployment Preflight
7
+
8
+ The AWS analogue of pre-deploy validation: validate locally and clean up state
9
+ *before* you deploy, so CI doesn't discover what you could have caught. Works for
10
+ both AWS CDK projects and raw CloudFormation. Continue through all steps even if one
11
+ fails — capture every issue, then fix them in a batch.
12
+
13
+ > Discipline: **batch your fixes.** Each deploy/CI run costs real minutes. Read the
14
+ > whole failing stack, reason about every issue, fix them all, push once. One run per
15
+ > problem cluster, not one per error message.
16
+
17
+ ## When to use
18
+
19
+ - Before `cdk deploy`, `cdk destroy`, `aws cloudformation deploy/create-stack`.
20
+ - When preparing or reviewing CDK / CloudFormation templates.
21
+ - To preview what a deploy will change.
22
+ - After a failed deploy left a stack stuck (`ROLLBACK_COMPLETE`, `*_FAILED`).
23
+ - Before an "it worked yesterday" infra mystery becomes a CI run.
24
+
25
+ ## Step 1 — Detect project type
26
+
27
+ - **CDK project:** `cdk.json` at root; stacks in `bin/` + `lib/` (TS) or app entry
28
+ (Python). Identify the app and stack names: `cdk list`.
29
+ - **Raw CloudFormation:** `.yaml`/`.json` templates (`AWSTemplateFormatVersion`,
30
+ `Resources:`), often under `infra/`, `cloudformation/`, `templates/`.
31
+ - Confirm the target account/region: `aws sts get-caller-identity` and
32
+ `aws configure get region` (or `$AWS_REGION`). Deploying to the wrong account is the
33
+ most expensive mistake of all.
34
+
35
+ ## Step 2 — Validate the template
36
+
37
+ ### CDK
38
+
39
+ ```bash
40
+ # Synthesize — fails on construct/TypeScript/context errors before any AWS call
41
+ cdk synth
42
+
43
+ # Diff against the deployed stack — the what-if. Shows resource + IAM changes.
44
+ cdk diff
45
+ ```
46
+
47
+ `cdk synth` emits CloudFormation under `cdk.out/`. `cdk diff` flags **IAM/security
48
+ changes** (the same changes `cdk deploy --require-approval` prompts on) — review those deliberately, never rubber-stamp.
49
+
50
+ ### CloudFormation (raw)
51
+
52
+ ```bash
53
+ # Server-side structural validation
54
+ aws cloudformation validate-template --template-body file://template.yaml
55
+
56
+ # Deeper linting — catches resource-property errors validate-template misses
57
+ cfn-lint template.yaml # pip install cfn-lint
58
+
59
+ # Preview changes without applying: change sets
60
+ aws cloudformation deploy --template-file template.yaml --stack-name <name> \
61
+ --no-execute-changeset # creates a change set you can inspect
62
+ aws cloudformation describe-change-set --change-set-name <arn>
63
+ ```
64
+
65
+ > `validate-template` only checks structure/syntax — like `bicep build` or
66
+ > `terraform validate`, it will **not** catch invalid property combinations, quota
67
+ > issues, or naming collisions. `cfn-lint` + `cdk diff` / a change set are the real gate.
68
+
69
+ ## Step 3 — Clean up stale / failed stacks
70
+
71
+ A failed `create-stack` leaves the stack in **`ROLLBACK_COMPLETE`** — it cannot be
72
+ updated, only deleted and recreated. `UPDATE_ROLLBACK_FAILED` needs
73
+ `continue-update-rollback`. Find and clear blockers before re-deploying:
74
+
75
+ ```bash
76
+ # Stacks stuck in a state that blocks a clean deploy
77
+ aws cloudformation list-stacks \
78
+ --stack-status-filter ROLLBACK_COMPLETE ROLLBACK_FAILED CREATE_FAILED DELETE_FAILED UPDATE_ROLLBACK_FAILED \
79
+ --query "StackSummaries[].{Name:StackName,Status:StackStatus}" --output table
80
+
81
+ # Inspect why one failed (read the FIRST failure event, not the cascade)
82
+ aws cloudformation describe-stack-events --stack-name <name> \
83
+ --query "StackEvents[?contains(ResourceStatus,'FAILED')].[LogicalResourceId,ResourceStatusReason]" \
84
+ --output table
85
+
86
+ # A ROLLBACK_COMPLETE stack must be deleted before recreating
87
+ aws cloudformation delete-stack --stack-name <name>
88
+ ```
89
+
90
+ CDK: `cdk destroy <stack>` for the same effect. Watch for **resources that block
91
+ deletion** — non-empty S3 buckets, ECR repos with images, retained `RemovalPolicy`
92
+ resources, security groups with dependencies. Empty/detach them first.
93
+
94
+ ## Step 4 — Globally-unique naming conflicts
95
+
96
+ Several AWS resource names live in a **global or account-region namespace** and collide
97
+ or are reserved/soft-deleted from prior attempts. The AWS analogue of Azure's Key
98
+ Vault / ACR name clashes — parameterize the name and override on conflict:
99
+
100
+ | Resource | Namespace | Conflict mode |
101
+ | --- | --- | --- |
102
+ | **S3 bucket** | Global (all accounts) | `BucketAlreadyExists` / `BucketAlreadyOwnedByYou`. Names are not reusable immediately after delete. |
103
+ | **ECR repository** | Per account+region | `RepositoryAlreadyExistsException` |
104
+ | CloudFront / OAI, ACM cert | Global / regional | reuse vs recreate |
105
+ | IAM role/policy names | Per account (global) | `EntityAlreadyExists` if a prior stack left it |
106
+ | DynamoDB table, SQS/SNS, Log groups | Per account+region | recreate collisions after partial deploys |
107
+
108
+ **Pattern:** never hard-code a globally-unique name. Let CDK auto-name (it appends a
109
+ hash) or add a short unique suffix (account id fragment / random) via a CloudFormation
110
+ parameter, and override it when a name is taken. Prefer `aws s3api head-bucket` /
111
+ `aws ecr describe-repositories` to check availability before deploy.
112
+
113
+ ## Step 5 — Service quota & capacity check
114
+
115
+ Deploys fail late when a quota is hit. Pre-check the limits the stack will consume:
116
+
117
+ ```bash
118
+ # Current quota values for a service (actual usage is not returned here — read it from CloudWatch usage metrics)
119
+ aws service-quotas list-service-quotas --service-code lambda --output table
120
+ aws service-quotas get-service-quota --service-code vpc --quota-code <code>
121
+ ```
122
+
123
+ Common deploy-blocking quotas: VPCs / Elastic IPs per region, NAT Gateways per AZ,
124
+ Lambda concurrent executions, ECS/Fargate task limits, RDS instances, **CloudFormation
125
+ 500-resource-per-stack limit** (split large stacks), IAM roles per account. Soft quotas
126
+ need a Service Quotas / support request with lead time — raise them *before* the deploy,
127
+ not during the incident.
128
+
129
+ ## Step 6 — Check for an in-flight deploy before triggering
130
+
131
+ If CI auto-deploys on push, don't fire a manual deploy on top of it — the runs race.
132
+
133
+ ```bash
134
+ gh run list --workflow="<deploy workflow>" --limit 3
135
+ aws cloudformation describe-stacks --stack-name <name> \
136
+ --query "Stacks[0].StackStatus" # *_IN_PROGRESS means a deploy is running
137
+ ```
138
+
139
+ If a stack is `*_IN_PROGRESS`, wait — concurrent operations on one stack are rejected.
140
+
141
+ ## Step 7 — Report
142
+
143
+ Summarize: validation results (synth/diff/lint), stacks cleaned up, naming overrides
144
+ applied, quota headroom, and the change set / `cdk diff` summary (creates / modifies /
145
+ **deletes** / replacements — flag any replacement of a stateful resource, and any IAM
146
+ change). State clearly whether it's safe to deploy.
147
+
148
+ ## Tool requirements
149
+
150
+ `aws` CLI v2, `cdk` (for CDK projects), `cfn-lint` (recommended), `gh` (if CI-driven).
151
+ Verify auth first: `aws sts get-caller-identity`.
152
+
153
+ ## Hard-won lessons
154
+
155
+ ### CloudFormation cross-stack export deadlock on re-pointing a reference
156
+ **Symptom:** Moving a resource (e.g. ALB public→private) drops a consumer stack's
157
+ reference to a producer stack's export; the deploy rolls back with `Cannot delete
158
+ export … as it is in use by <consumer-stack>`.
159
+ **Cause:** The deadlock isn't about the *new* template — it's about the **delta**
160
+ versus the **live** stack. Removing a consumer's reference makes CDK drop the
161
+ producer's auto-generated export, and CFN refuses to delete an export that the still-deployed old consumer imports.
162
+ **Fix:** Treat any change that drops a cross-stack reference as a **two-phase** op:
163
+ either retain the export across the transition (`stack.exportValue()`), or remove
164
+ the live consumer before re-pointing it. Apply the two-phase pattern by default once
165
+ you've hit this — don't reason your way out of the precaution.