engsys 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +202 -0
  3. package/core/agents/aaron.md +152 -0
  4. package/core/agents/bert.md +115 -0
  5. package/core/agents/isabelle.md +136 -0
  6. package/core/agents/jody.md +150 -0
  7. package/core/agents/leith.md +111 -0
  8. package/core/agents/marcelo.md +282 -0
  9. package/core/agents/melvin.md +101 -0
  10. package/core/agents/nyx.md +152 -0
  11. package/core/agents/otto.md +168 -0
  12. package/core/agents/patricia.md +283 -0
  13. package/core/commands/design-audit-local.md +155 -0
  14. package/core/commands/design-audit.md +235 -0
  15. package/core/commands/design-critique.md +96 -0
  16. package/core/commands/file-issue.md +22 -0
  17. package/core/commands/generate-project.md +45 -0
  18. package/core/commands/implement-issue.md +37 -0
  19. package/core/commands/implement-project.md +40 -0
  20. package/core/commands/naturalize.md +61 -0
  21. package/core/commands/pre-push.md +29 -0
  22. package/core/commands/prep-review-collect.md +130 -0
  23. package/core/commands/prep-review-finalize.md +121 -0
  24. package/core/commands/prep-review-publish.md +113 -0
  25. package/core/commands/prep-review.md +65 -0
  26. package/core/commands/project-closeout.md +25 -0
  27. package/core/skills/agentic-eval/SKILL.md +195 -0
  28. package/core/skills/chrome-devtools/SKILL.md +97 -0
  29. package/core/skills/code-review/SKILL.md +26 -0
  30. package/core/skills/gh-cli/SKILL.md +2202 -0
  31. package/core/skills/git-commit/SKILL.md +124 -0
  32. package/core/skills/git-workflow-agents/SKILL.md +462 -0
  33. package/core/skills/git-workflow-agents/reference.md +220 -0
  34. package/core/skills/github-actions/SKILL.md +190 -0
  35. package/core/skills/github-issues/SKILL.md +154 -0
  36. package/core/skills/llm-structured-outputs/SKILL.md +323 -0
  37. package/core/skills/llm-structured-outputs/references/provider-details.md +392 -0
  38. package/core/skills/pre-push/SKILL.md +115 -0
  39. package/core/skills/refactor/SKILL.md +645 -0
  40. package/core/skills/web-design-reviewer/SKILL.md +371 -0
  41. package/core/skills/webapp-testing/SKILL.md +127 -0
  42. package/core/skills/webapp-testing/test-helper.js +56 -0
  43. package/core/templates/CLAUDE.md.tmpl +98 -0
  44. package/core/templates/adr-template.md +67 -0
  45. package/core/templates/gh-issue-templates/bug.md +39 -0
  46. package/core/templates/gh-issue-templates/content.md +42 -0
  47. package/core/templates/gh-issue-templates/enhancement.md +36 -0
  48. package/core/templates/gh-issue-templates/feature.md +39 -0
  49. package/core/templates/gh-issue-templates/infrastructure.md +41 -0
  50. package/core/templates/post-edit-reminders.sh.tmpl +19 -0
  51. package/core/templates/settings.json.tmpl +90 -0
  52. package/core/templates/settings.local.json.tmpl +3 -0
  53. package/core/workflows/agent-implementation-workflow.md +346 -0
  54. package/core/workflows/generate-project.md +258 -0
  55. package/core/workflows/implement-project-workflow.md +190 -0
  56. package/core/workflows/issue-tracking.md +89 -0
  57. package/core/workflows/project-closeout-ceremony.md +77 -0
  58. package/core/workflows/review-workflow.md +266 -0
  59. package/engsys.config.example.yaml +46 -0
  60. package/install +202 -0
  61. package/lessons-library/README.md +80 -0
  62. package/lessons-library/async-callbacks-verify-liveness.md +15 -0
  63. package/lessons-library/change-isnt-done-until-every-surface-updated.md +15 -0
  64. package/lessons-library/claim-then-act-for-irreversible-ops.md +16 -0
  65. package/lessons-library/co-commit-entangled-work.md +15 -0
  66. package/lessons-library/dependabot-triage-playbook.md +17 -0
  67. package/lessons-library/deploy-by-digest-and-verify-the-running-revision.md +15 -0
  68. package/lessons-library/enforce-your-guarantee-at-your-boundary.md +16 -0
  69. package/lessons-library/gate-changes-on-measurement-not-vibes.md +15 -0
  70. package/lessons-library/iac-first-no-console-changes.md +15 -0
  71. package/lessons-library/independent-objective-review-gate.md +15 -0
  72. package/lessons-library/keep-an-immutable-source-of-truth.md +15 -0
  73. package/lessons-library/long-agent-runs-checkpoint-not-poll.md +15 -0
  74. package/lessons-library/model-identity-with-stable-ids-and-provenance.md +15 -0
  75. package/lessons-library/operator-choices-are-first-class.md +15 -0
  76. package/lessons-library/prefer-tool-enforced-structured-output.md +15 -0
  77. package/lessons-library/prove-causation-before-acting.md +15 -0
  78. package/lessons-library/re-read-state-before-acting.md +14 -0
  79. package/lessons-library/read-layer-tolerates-unbackfilled-rows.md +15 -0
  80. package/lessons-library/shell-safety-pipefail-and-validate-before-teardown.md +14 -0
  81. package/lessons-library/shift-correctness-left-and-distrust-false-greens.md +15 -0
  82. package/lessons-library/stray-control-bytes-hide-changes.md +14 -0
  83. package/lessons-library/tests-can-assert-the-bug.md +15 -0
  84. package/lessons-library/verify-ground-truth-not-reports.md +15 -0
  85. package/lessons-library/worktrees-need-bootstrap-from-origin-main.md +15 -0
  86. package/lib/commands.js +356 -0
  87. package/lib/generate-team-avatars.mjs +251 -0
  88. package/lib/manifest.js +155 -0
  89. package/lib/render.js +135 -0
  90. package/lib/selftest.js +90 -0
  91. package/lib/util.js +89 -0
  92. package/lib/yaml.js +156 -0
  93. package/optional-agents/gary.md +86 -0
  94. package/optional-agents/jos.md +136 -0
  95. package/optional-agents/sandy.md +101 -0
  96. package/optional-agents/steve.md +161 -0
  97. package/package.json +43 -0
  98. package/stacks/cloud/aws/claude.fragment.md +17 -0
  99. package/stacks/cloud/aws/settings.fragment.json +39 -0
  100. package/stacks/cloud/aws/skills/aws-deployment-preflight/SKILL.md +165 -0
  101. package/stacks/cloud/aws/skills/cloud-architecture-aws/SKILL.md +265 -0
  102. package/stacks/cloud/azure/claude.fragment.md +17 -0
  103. package/stacks/cloud/azure/settings.fragment.json +45 -0
  104. package/stacks/cloud/azure/skills/azure-deployment-preflight/SKILL.md +175 -0
  105. package/stacks/cloud/azure/skills/cloud-architecture-azure/SKILL.md +211 -0
  106. package/stacks/cloud/cloudflare/claude.fragment.md +21 -0
  107. package/stacks/cloud/cloudflare/settings.fragment.json +31 -0
  108. package/stacks/cloud/cloudflare/skills/cloud-architecture-cloudflare/SKILL.md +294 -0
  109. package/stacks/cloud/cloudflare/skills/cloudflare-deployment-preflight/SKILL.md +175 -0
  110. package/stacks/cloud/gcp/claude.fragment.md +17 -0
  111. package/stacks/cloud/gcp/settings.fragment.json +40 -0
  112. package/stacks/cloud/gcp/skills/cloud-architecture-gcp/SKILL.md +208 -0
  113. package/stacks/cloud/gcp/skills/gcp-deployment-preflight/SKILL.md +137 -0
  114. package/stacks/db/mongo/skills/mongo-conventions/SKILL.md +96 -0
  115. package/stacks/db/prisma/claude.fragment.md +49 -0
  116. package/stacks/db/prisma/skills/docker-database-package-copy/SKILL.md +44 -0
  117. package/stacks/db/prisma/skills/prisma-conventions/SKILL.md +37 -0
  118. package/stacks/domain/mobile-growth/skills/apple-ads/SKILL.md +184 -0
  119. package/stacks/domain/mobile-growth/skills/apple-ads/references/benchmark-notes.md +47 -0
  120. package/stacks/domain/mobile-growth/skills/apple-ads/references/official-links.md +53 -0
  121. package/stacks/domain/mobile-growth/skills/google-play-growth/SKILL.md +197 -0
  122. package/stacks/domain/mobile-growth/skills/google-play-growth/references/benchmark-notes.md +47 -0
  123. package/stacks/domain/mobile-growth/skills/google-play-growth/references/official-links.md +45 -0
  124. package/stacks/iac/bicep/claude.fragment.md +14 -0
  125. package/stacks/iac/bicep/settings.fragment.json +20 -0
  126. package/stacks/iac/bicep/skills/iac-bicep/SKILL.md +113 -0
  127. package/stacks/iac/cdk/claude.fragment.md +14 -0
  128. package/stacks/iac/cdk/settings.fragment.json +23 -0
  129. package/stacks/iac/cdk/skills/iac-cdk/SKILL.md +104 -0
  130. package/stacks/iac/terraform/claude.fragment.md +13 -0
  131. package/stacks/iac/terraform/settings.fragment.json +25 -0
  132. package/stacks/iac/terraform/skills/iac-terraform/SKILL.md +93 -0
  133. package/stacks/iac/terraform/skills/terraform-conventions/SKILL.md +87 -0
  134. package/stacks/lang/kotlin/skills/android-testing/SKILL.md +263 -0
  135. package/stacks/lang/kotlin/skills/jetpack-compose/SKILL.md +264 -0
  136. package/stacks/lang/kotlin/skills/kotlin-coroutines/SKILL.md +329 -0
  137. package/stacks/lang/python/skills/python-conventions/SKILL.md +61 -0
  138. package/stacks/lang/shell/skills/shell-scripting/SKILL.md +110 -0
  139. package/stacks/lang/swift/skills/swift-concurrency/SKILL.md +423 -0
  140. package/stacks/lang/swift/skills/swift-concurrency/references/approachable-concurrency.md +80 -0
  141. package/stacks/lang/swift/skills/swift-concurrency/references/concurrency-patterns.md +233 -0
  142. package/stacks/lang/swift/skills/swift-concurrency/references/swiftui-concurrency.md +187 -0
  143. package/stacks/lang/swift/skills/swift-concurrency/references/synchronization-primitives.md +341 -0
  144. package/stacks/lang/swift/skills/swift-testing/SKILL.md +497 -0
  145. package/stacks/lang/swift/skills/swift-testing/references/testing-advanced.md +106 -0
  146. package/stacks/lang/swift/skills/swift-testing/references/testing-patterns.md +504 -0
  147. package/stacks/lang/swift/skills/swiftdata/SKILL.md +334 -0
  148. package/stacks/lang/swift/skills/swiftdata/references/core-data-coexistence.md +504 -0
  149. package/stacks/lang/swift/skills/swiftdata/references/swiftdata-advanced.md +975 -0
  150. package/stacks/lang/swift/skills/swiftdata/references/swiftdata-queries.md +675 -0
  151. package/stacks/lang/swift/skills/swiftui-patterns/SKILL.md +371 -0
  152. package/stacks/lang/swift/skills/swiftui-patterns/references/architecture-patterns.md +486 -0
  153. package/stacks/lang/swift/skills/swiftui-patterns/references/deprecated-migration.md +1097 -0
  154. package/stacks/lang/swift/skills/swiftui-patterns/references/design-polish.md +780 -0
  155. package/stacks/lang/swift/skills/swiftui-patterns/references/platform-and-sharing.md +696 -0
  156. package/stacks/lang/typescript/skills/typescript-conventions/SKILL.md +91 -0
  157. package/stacks/platform/android/claude.fragment.md +40 -0
  158. package/stacks/platform/android/hooks/pre-push-gradle.sh +70 -0
  159. package/stacks/platform/android/settings.fragment.json +13 -0
  160. package/stacks/platform/android/skills/android-build-conventions/SKILL.md +247 -0
  161. package/stacks/platform/ios/claude.fragment.md +24 -0
  162. package/stacks/platform/ios/hooks/pre-push-xcodebuild.sh +82 -0
  163. package/stacks/platform/ios/settings.fragment.json +21 -0
  164. package/stacks/platform/ios/skills/xcodebuildmcp-simulator-logs/SKILL.md +76 -0
  165. package/stacks/platform/web/skills/frontend-testing/SKILL.md +246 -0
  166. package/stacks/platform/web/skills/react-conventions/SKILL.md +261 -0
  167. package/stacks/platform/web/skills/web-platform-conventions/SKILL.md +55 -0
  168. package/stacks/tooling/issue-tracker-github/claude.fragment.md +10 -0
  169. package/stacks/tooling/issue-tracker-github/settings.fragment.json +24 -0
  170. package/stacks/tooling/issue-tracker-github/skills/issue-tracker-github/SKILL.md +278 -0
  171. package/stacks/tooling/issue-tracker-linear/claude.fragment.md +17 -0
  172. package/stacks/tooling/issue-tracker-linear/settings.fragment.json +9 -0
  173. package/stacks/tooling/issue-tracker-linear/skills/issue-tracker-linear/SKILL.md +183 -0
@@ -0,0 +1,175 @@
1
+ ---
2
+ name: cloudflare-deployment-preflight
3
+ description: Preflight validation for Cloudflare Workers/Pages deployments via Wrangler. Run before any wrangler deploy. Dry-run builds (wrangler deploy --dry-run), gradual rollout via versions upload + deployments, secrets via wrangler secret, D1 migrations (wrangler d1 migrations), account/auth check (wrangler whoami), bindings correctness in wrangler.toml (vars/KV/R2/D1/Durable Objects/Queues), and wrangler tail for logs. Activate when the active cloud is Cloudflare and the user mentions deploying a Worker/Pages, validating wrangler config, gradual rollout, secrets, D1 migrations, bindings, or preparing for wrangler deploy.
4
+ ---
5
+
6
+ # Cloudflare Deployment Preflight
7
+
8
+ The Cloudflare analogue of pre-deploy validation: build and validate locally, confirm the
9
+ target account and bindings, and stage the rollout *before* you push live, so users don't
10
+ discover what you could have caught. Works for Workers and Pages (both run on Wrangler).
11
+ Continue through all steps even if one fails — capture every issue, then fix them in a
12
+ batch.
13
+
14
+ > Discipline: **batch your fixes.** A Worker `deploy` is global within seconds — there's no
15
+ > per-region canary by default. Read the whole config, reason about every issue, fix them
16
+ > all, then deploy once behind a gradual rollout. One staged rollout, not one deploy per
17
+ > error.
18
+
19
+ ## When to use
20
+
21
+ - Before `wrangler deploy` / `wrangler versions upload` / `wrangler pages deploy`.
22
+ - When preparing or reviewing `wrangler.toml` (bindings, vars, compatibility settings).
23
+ - To preview what a deploy will produce (`--dry-run`).
24
+ - Before running D1 migrations against a production database.
25
+ - When a deploy "worked locally" but the live Worker errors on a missing binding/secret.
26
+
27
+ ## Step 1 — Confirm the target account & auth
28
+
29
+ Deploying to the wrong account is the most expensive mistake. Wrangler picks up auth from
30
+ `wrangler login` (OAuth) or a `CLOUDFLARE_API_TOKEN` env var, and the account from
31
+ `account_id` in `wrangler.toml` or `CLOUDFLARE_ACCOUNT_ID`.
32
+
33
+ ```bash
34
+ wrangler whoami # who am I, and which account(s) can I deploy to?
35
+ ```
36
+
37
+ Confirm the printed account matches the intended one and that the token has the needed
38
+ scopes (Workers Scripts, D1, R2, KV, etc.). If `whoami` shows multiple accounts, pin
39
+ `account_id` in `wrangler.toml` so a deploy can't silently land in the wrong account.
40
+
41
+ ## Step 2 — Dry-run the build
42
+
43
+ `--dry-run` runs the full bundle and reads the configured bindings **without uploading anything** — it
44
+ catches build errors, missing modules, oversized bundles, and (with `--outdir`) lets you
45
+ inspect the output.
46
+
47
+ ```bash
48
+ # Build + validate, upload NOTHING. The core preflight.
49
+ wrangler deploy --dry-run --outdir dist/
50
+
51
+ # Pages equivalent: build locally and inspect, no deploy
52
+ wrangler pages functions build # builds Functions to inspect
53
+ ```
54
+
55
+ This is the equivalent of `cdk synth` / `bicep build` / `terraform plan`'s build half —
56
+ it will **not** catch a binding that exists in config but not in the account (a KV
57
+ namespace / D1 / R2 bucket that was never created), or a missing secret. Those are Steps
58
+ 3–4. Watch the reported gzip **bundle size** against the plan limit (3 MB Free / 10 MB Paid, measured after compression).
59
+
60
+ ## Step 3 — Validate bindings in wrangler.toml
61
+
62
+ The #1 cause of "works in dev, 1101/exception in prod" is a binding that's declared but the
63
+ underlying resource doesn't exist, or an ID mismatch. Cross-check every binding in
64
+ `wrangler.toml` against what actually exists in the account:
65
+
66
+ | Binding | wrangler.toml | Verify the resource exists |
67
+ | --- | --- | --- |
68
+ | **Vars** (plaintext) | `[vars]` | non-secret config only — never put secrets here |
69
+ | **KV** | `[[kv_namespaces]]` `id` | `wrangler kv namespace list` |
70
+ | **R2** | `[[r2_buckets]]` `bucket_name` | `wrangler r2 bucket list` |
71
+ | **D1** | `[[d1_databases]]` `database_id` | `wrangler d1 list` / `wrangler d1 info <db>` |
72
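+ | **Durable Objects** | `[[durable_objects.bindings]]` + `[[migrations]]` | every DO class needs a `[[migrations]]` entry (`tag` + `new_classes` / `new_sqlite_classes`) in `wrangler.toml` — this is separate from the D1 SQL migrations in Step 5 |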
73
+ | **Queues** | `[[queues.producers]]` / `[[queues.consumers]]` | `wrangler queues list` |
74
+ | **Service bindings** | `[[services]]` | the target Worker must be deployed |
75
+
76
+ ```bash
77
+ wrangler kv namespace list # IDs must match [[kv_namespaces]].id
78
+ wrangler r2 bucket list # names must match [[r2_buckets]].bucket_name
79
+ wrangler d1 list # D1 databases + their IDs
80
+ wrangler d1 info <db-name> # size, region, details for one D1
81
+ wrangler queues list # queues must exist before a consumer deploys
82
+ ```
83
+
84
+ A mismatched `id`/`name`, or a `binding` name the code references that isn't in the toml,
85
+ is a runtime exception, not a build error — `--dry-run` won't catch it. Confirm the
86
+ `compatibility_date` and any `compatibility_flags` (e.g. `nodejs_compat`) are set, since a
87
+ stale compat date can change runtime behavior.
88
+
89
+ ## Step 4 — Secrets (never in wrangler.toml)
90
+
91
+ Secrets are set out-of-band and are **not** in `wrangler.toml` (only non-secret `[vars]`
92
+ go there). A Worker that reads `env.MY_SECRET` will get `undefined` and throw if the secret
93
+ was never uploaded to that environment.
94
+
95
+ ```bash
96
+ wrangler secret list # which secrets exist for this Worker/env
97
+ wrangler secret put MY_SECRET # set (prompts for value) — MUTATING, gated
98
+ wrangler secret put MY_SECRET --env prod # per-environment
99
+ ```
100
+
101
+ Preflight: run `wrangler secret list` (add `--env <name>` for a named environment) and confirm every secret the code reads is present for
102
+ the target environment. Setting secrets (`secret put`) is a mutating action — do it
103
+ deliberately, per environment, not as part of casual inspection.
104
+
105
+ ## Step 5 — D1 migrations
106
+
107
+ D1 schema changes go through Wrangler's migration system. Apply to **local first, then a
108
+ remote staging DB, then production** — never run an unreviewed migration straight at prod.
109
+
110
+ ```bash
111
+ wrangler d1 migrations create <db> <name> # scaffold a new migration file
112
+ wrangler d1 migrations list <db> # which migrations are applied vs pending
113
+ wrangler d1 migrations list <db> --remote # against the real remote DB
114
+ wrangler d1 migrations apply <db> --local # apply locally first
115
+ wrangler d1 migrations apply <db> --remote # apply to remote — MUTATING, gated
116
+ ```
117
+
118
+ > `migrations list` is read-only (safe preflight — see exactly what's pending). `apply`
119
+ > and any `d1 execute` are mutating and gated. Remember D1 is **SQLite** with a **10GB
120
+ > ceiling** and primary-region writes — a migration that rewrites a large table can be slow
121
+ > and bills `rows_written`. Review the SQL; back up / export if it's destructive.
122
+
123
+ ## Step 6 — Stage the rollout (versions + gradual deployment)
124
+
125
+ `wrangler deploy` publishes **globally within seconds** with no built-in canary. For
126
+ anything risky, use the **versions** workflow to upload a version without serving it, then
127
+ ramp traffic gradually and roll back instantly if metrics turn.
128
+
129
+ ```bash
130
+ # Upload a new version WITHOUT routing any traffic to it
131
+ wrangler versions upload
132
+
133
+ wrangler versions list # see versions + which is serving
134
+ wrangler versions view <version-id> # inspect one
135
+
136
+ # Roll out gradually: split traffic across versions (e.g. 10% new / 90% old)
137
+ wrangler versions deploy # interactive percentage split
138
+
139
+ # Watch the new version under real traffic, then ramp to 100% — or roll back
140
+ # by deploying 100% of the previous version.
141
+ ```
142
+
143
+ This is the Cloudflare answer to a canary / change set: ship the version, point a slice of
144
+ production at it, watch `wrangler tail` + analytics, then complete or revert. Far safer
145
+ than a bare `wrangler deploy` for a change touching a hot path.
146
+
147
+ ## Step 7 — Verify after rollout (tail the logs)
148
+
149
+ Once a version is taking traffic, watch live requests for new exceptions before ramping to
150
+ 100%:
151
+
152
+ ```bash
153
+ wrangler tail # live request log stream
154
+ wrangler tail --status error # only errored invocations
155
+ wrangler tail --format json | jq '.exceptions' # structured, filter exceptions
156
+ ```
157
+
158
+ Look for `Exceeded CPU`/`Exceeded Memory` (1102), subrequest-limit errors, missing-binding
159
+ exceptions, and elevated 5xx/exception rate on the new version. If clean, complete the
160
+ rollout; if not, roll back to the prior version immediately.
161
+
162
+ ## Step 8 — Report
163
+
164
+ Summarize: account confirmed (`whoami`), dry-run build result + bundle size vs limit,
165
+ binding/resource cross-check (every KV/R2/D1/DO/Queue binding resolved to a real resource),
166
+ secrets present for the target env, D1 migrations pending/applied, and the rollout plan
167
+ (version id, traffic split, rollback path). Flag any **destructive D1 migration**, any
168
+ **missing binding or secret**, and any **bundle over the plan limit**. State clearly whether
169
+ it's safe to deploy and at what initial traffic percentage.
170
+
171
+ ## Tool requirements
172
+
173
+ `wrangler` CLI (v3.60+ for the space-separated `kv namespace` syntax used above; older v3 releases use `kv:namespace`), authenticated via `wrangler login` or `CLOUDFLARE_API_TOKEN`. Verify
174
+ auth + account first: `wrangler whoami`. `jq` optional for parsing `wrangler tail --format
175
+ json`.
@@ -0,0 +1,17 @@
1
+ ## Cloud stack
2
+
3
+ - **Active cloud: GCP.** Architecture and IaC target Google Cloud; agents load the
4
+ `cloud-architecture-gcp` and `gcp-deployment-preflight` skill packs.
5
+ - **Tool preference order** (when investigating or validating cloud state):
6
+ 1. **gcloud / gsutil, read-only** — `gcloud config get-value project`,
7
+ `gcloud auth list`, `gcloud run services list`, `gcloud sql instances describe`,
8
+ `gcloud logging read`, `gcloud compute regions describe`,
9
+ `gcloud services list --enabled`, `gsutil ls` and similar inspection commands.
10
+ Never mutate state to answer a question.
11
+ 2. **Docs source** — official Google Cloud documentation (cloud.google.com/docs) for
12
+ quotas, pricing, and API behavior. Verify against docs rather than from memory.
13
+ - Mutating actions (deploy/apply/delete) go through the IaC tool and the
14
+ `gcp-deployment-preflight` gate, never ad-hoc CLI writes.
15
+
16
+ <!-- naturalize: confirm the GCP project ID(s), region(s), and the path to the
17
+ architecture/cost docs Melvin and Aaron should read for concrete topology. -->
@@ -0,0 +1,40 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(gcloud config get-value:*)",
5
+ "Bash(gcloud auth list:*)",
6
+ "Bash(gcloud projects list:*)",
7
+ "Bash(gcloud projects describe:*)",
8
+ "Bash(gcloud services list:*)",
9
+ "Bash(gcloud run services list:*)",
10
+ "Bash(gcloud run services describe:*)",
11
+ "Bash(gcloud run revisions list:*)",
12
+ "Bash(gcloud sql instances list:*)",
13
+ "Bash(gcloud sql instances describe:*)",
14
+ "Bash(gcloud pubsub topics list:*)",
15
+ "Bash(gcloud pubsub subscriptions describe:*)",
16
+ "Bash(gcloud compute regions describe:*)",
17
+ "Bash(gcloud compute project-info describe:*)",
18
+ "Bash(gcloud artifacts repositories describe:*)",
19
+ "Bash(gcloud deployment-manager deployments list:*)",
20
+ "Bash(gcloud deployment-manager deployments describe:*)",
21
+ "Bash(gcloud deployment-manager operations list:*)",
22
+ "Bash(gcloud logging read:*)",
23
+ "Bash(gsutil ls:*)",
24
+ "Bash(terraform fmt:*)",
25
+ "Bash(terraform validate:*)",
26
+ "Bash(terraform plan:*)",
27
+ "Bash(terraform show:*)"
28
+ ],
29
+ "deny": [
30
+ "Bash(gcloud run deploy:*)",
31
+ "Bash(gcloud sql instances delete:*)",
32
+ "Bash(gcloud deployment-manager deployments create:*)",
33
+ "Bash(gcloud deployment-manager deployments update:*)",
34
+ "Bash(gcloud deployment-manager deployments delete:*)",
35
+ "Bash(terraform apply:*)",
36
+ "Bash(terraform destroy:*)"
37
+ ]
38
+ },
39
+ "mcpServers": {}
40
+ }
@@ -0,0 +1,208 @@
1
+ ---
2
+ name: cloud-architecture-gcp
3
+ description: GCP service-level architecture knowledge — compute (Cloud Run/GKE/Cloud Functions/GCE), data (Cloud SQL/Spanner/Firestore/Bigtable), messaging (Pub/Sub/Cloud Tasks/Workflows), analytics (BigQuery), edge (Cloud CDN/Load Balancing/API Gateway), storage + secrets (GCS/Secret Manager/Artifact Registry), and Vertex AI. Cost models, quotas, failure modes, and cold-start gotchas. Activate when the active cloud is GCP and the work involves designing, scaling, costing, or diagnosing GCP architecture (Cloud Run cold starts, Cloud SQL connection limits, Spanner hot spots, egress, Pub/Sub backlog).
4
+ ---
5
+
6
+ # GCP Architecture Knowledge
7
+
8
+ Service-level detail for a GCP-backed project. Pairs with Melvin's cloud-agnostic
9
+ diagnostic checklist (traffic pattern, state location, SLAs, blast radius, cost
10
+ explosion, coordination, limits, observability) — this pack supplies the GCP-specific
11
+ answers. For concrete topology, cost tiers, and stack context, read the architecture
12
+ docs named in `CLAUDE.md`.
13
+
14
+ ## Compute
15
+
16
+ ### Cloud Run
17
+
18
+ - Serverless containers, request- or instance-billed, **scales to zero**. The
19
+ cost win and the latency trap: a scaled-to-zero service pays a **cold start** (image
20
+ pull + container start) on the next request. Set **`min-instances >= 1`** to keep a
21
+ warm instance for latency-sensitive paths — the Cloud Run analogue of provisioned
22
+ concurrency.
23
+ - **Concurrency:** one Cloud Run instance serves *multiple* concurrent requests
24
+ (default up to 80, tunable) — unlike Lambda's one-request-per-instance. The concurrency
25
+ setting massively affects cost and tail latency; CPU-bound work wants lower concurrency,
26
+ IO-bound can go higher. `--cpu-throttling` (CPU only during requests) vs always-on CPU
27
+ (for background work) is a real cost lever.
28
+ - **Limits:** per-service max instances (set it to cap blast radius and spend), request
29
+ timeout up to 60 min, memory/CPU per instance. Cloud Run **jobs** for run-to-completion
30
+ batch (vs services for request serving).
31
+ - **Good for:** HTTP APIs, event consumers (Eventarc/Pub/Sub push), web apps. Prefer it
32
+ over GKE unless you need Kubernetes primitives.
33
+
34
+ ### GKE
35
+
36
+ - Managed Kubernetes. **Autopilot** (Google manages nodes, pay per pod resource — less
37
+ ops, good default) vs **Standard** (you manage node pools — more control, GPUs, custom
38
+ scheduling). Reach for GKE only when you need the K8s ecosystem (operators, mesh,
39
+ complex scheduling, multi-tenant platform); otherwise Cloud Run is far less overhead.
40
+
41
+ ### Cloud Functions
42
+
43
+ - Event-driven serverless (2nd gen runs on Cloud Run + Eventarc under the hood — same
44
+ cold-start and concurrency model). Triggers: HTTP, Pub/Sub, GCS, Firestore, Eventarc.
45
+ Good for glue and event handlers; for sustained or latency-critical work, Cloud Run
46
+ with min-instances.
47
+
48
+ ### Compute Engine (GCE)
49
+
50
+ - VMs: any machine type (general/compute/memory/GPU/TPU), **Spot/Preemptible** for
51
+ fault-tolerant batch (deep discount, can be reclaimed), **Committed Use Discounts**
52
+ (1- or 3-year terms) and Sustained Use Discounts for steady baseline. The escape hatch when
53
+ managed compute can't meet a hardware/licensing/latency need.
54
+
55
+ ## Data
56
+
57
+ ### Cloud SQL (PostgreSQL / MySQL)
58
+
59
+ - Managed relational. **Connection limits** scale with tier and are the classic
60
+ bottleneck — serverless/many-instance apps exhaust them. Use the **Cloud SQL Auth
61
+ Proxy** + a pooler (PgBouncer) or built-in connection pooling; serverless callers
62
+ should pool aggressively. HA = regional (synchronous standby; failover drops
63
+ connections — apps must reconnect/retry). Read replicas scale reads, not writes.
64
+ - IOPS/throughput scale with disk size and tier; watch them on write-heavy workloads.
65
+ Right-size the tier from metrics, don't over-provision.
66
+
67
+ ### Spanner
68
+
69
+ - Horizontally-scalable, strongly-consistent relational (global). **Schema / primary-key
70
+ design is everything** — monotonically increasing keys (timestamps, sequential IDs)
71
+ create **hot spots** on one split; use hashed/UUID or bit-reversed keys for even
72
+ distribution. Billed by **node/processing-unit + storage**; not cheap — justify it
73
+ with genuine horizontal-scale or global-consistency needs over Cloud SQL.
74
+
75
+ ### Firestore
76
+
77
+ - Serverless document DB, auto-scaling, real-time listeners. Strong consistency,
78
+ multi-region options. **Cost is per-operation** (reads/writes/deletes) + storage —
79
+ read-heavy fan-out and unbounded queries get expensive; **avoid N+1 read patterns**
80
+ and design for composite indexes. Great for app/user data with realtime needs.
81
+
82
+ ### Bigtable
83
+
84
+ - Wide-column NoSQL for massive scale + low-latency (time-series, IoT, analytics
85
+ serving). **Row-key design is everything** — sequential keys hot-spot a single node;
86
+ design for even distribution. Billed per node + storage. Reach for it at very high
87
+ throughput where Firestore/Spanner don't fit.
88
+
89
+ ## Messaging & Orchestration
90
+
91
+ ### Pub/Sub
92
+
93
+ - Global, auto-scaling messaging. **Push** (delivers to an endpoint — Cloud Run/Functions)
94
+ vs **pull** (consumer fetches). At-least-once delivery → **idempotent consumers
95
+ required**; **ordering keys** for ordered delivery (lower throughput per key).
96
+ - Configure **ack deadline** > processing time or you get redelivery, and a **dead-letter
97
+ topic** with max delivery attempts for poison messages. Subscription **backlog**
98
+ (`num_undelivered_messages` / `oldest_unacked_message_age`) is the metric that predicts pain.
99
+ - **Pub/Sub Lite** is a cheaper, zonal, capacity-provisioned variant for high-volume
100
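+ cost-sensitive streaming — fewer features, you manage capacity. Google has announced its deprecation, so verify the current status in the docs before choosing it for a new design.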
101
+
102
+ ### Cloud Tasks / Workflows
103
+
104
+ - **Cloud Tasks:** managed task queues with rate limiting + scheduled/deferred dispatch
105
+ to HTTP targets — good for decoupling and controlled throughput to a downstream.
106
+ - **Workflows:** serverless orchestration (YAML/JSON) of service calls with retries,
107
+ error handling, and parallel steps — the explicit-state-machine option (vs hiding
108
+ coordination in code). **Eventarc** routes events (GCS, Pub/Sub, audit logs) to Cloud
109
+ Run/Functions/Workflows.
110
+
111
+ ## Analytics
112
+
113
+ ### BigQuery
114
+
115
+ - Serverless data warehouse. **Two pricing models:** **on-demand** (per TB *scanned* —
116
+ a `SELECT *` or unpartitioned full scan is a budget event) vs **capacity/slots**
117
+ (reserved compute, predictable). Control cost with **partitioning + clustering**,
118
+ selecting only needed columns, and `--maximum_bytes_billed` guards. Streaming inserts
119
+ and storage are separate line items. Not an OLTP store — it's analytics.
120
+
121
+ ## Edge & Networking
122
+
123
+ ### Cloud Load Balancing + Cloud CDN
124
+
125
+ - Global external HTTP(S) load balancer (Anycast, single global IP) with **Cloud CDN**
126
+ edge caching and **Cloud Armor** WAF/DDoS. Cache key + TTL design drive hit ratio.
127
+ Use it for global entry, edge caching, and WAF.
128
+
129
+ ### API Gateway / Apigee
130
+
131
+ - **API Gateway** (lightweight, managed, for serverless backends) vs **Apigee** (full
132
+ enterprise API management — policies, monetization, developer portal; heavier and
133
+ pricier). Pick by how much API-management you actually need.
134
+
135
+ ### VPC / Networking — cost landmines
136
+
137
+ - **Cloud NAT** for egress from private instances (per-GB + hourly), and **egress data
138
+ transfer** — inter-zone, inter-region, and internet egress are all per-GB (internet
139
+ and cross-region the most). Use **Private Google Access** / **Private Service Connect**
140
+ to reach Google APIs/services without public egress, and **VPC Service Controls** for
141
+ data-exfil boundaries. Default to private; public only where required.
142
+
143
+ ## Storage, Secrets & Registry
144
+
145
+ ### Cloud Storage (GCS)
146
+
147
+ - Object storage. Classes: Standard → Nearline → Coldline → Archive (retrieval cost +
148
+ minimum storage duration). Lifecycle rules to age data down. **Globally-unique bucket
149
+ names** (see preflight). Cost = storage + **operations** (per-1000 Class A/B) + egress.
150
+ Uniform bucket-level access + IAM; encryption at rest by default (CMEK via Cloud KMS).
151
+
152
+ ### Secret Manager
153
+
154
+ - Versioned secrets with IAM access control; reference from Cloud Run/Functions as
155
+ mounted secrets or env. Keep secrets out of images and config. Pair with **Workload
156
+ Identity** so services authenticate without long-lived keys.
157
+
158
+ ### Artifact Registry
159
+
160
+ - Container images + language packages (successor to Container Registry). Use Workload
161
+ Identity / service-account auth for pulls; regional repos to avoid cross-region pull
162
+ egress.
163
+
164
+ ## Vertex AI
165
+
166
+ - Managed ML + GenAI: **Model Garden** (Gemini, plus third-party and open models),
167
+ online **prediction endpoints** (deploy a model behind an autoscaling endpoint — set
168
+ min replicas to avoid cold start, max to cap cost), batch prediction, training
169
+ pipelines, **Vector Search** for managed RAG.
170
+ - **Quotas matter:** per-model/region requests-per-minute (RPM/QPM) and tokens-per-minute (TPM)
171
+ limits will throttle a naive high-volume pipeline — request increases early and add
172
+ backoff. Provisioned Throughput for guaranteed capacity (committed spend). Cost is
173
+ token/prediction-driven; model availability varies by region. Right-size the model per
174
+ task.
175
+
176
+ ## Cost realism (where GCP bills explode)
177
+
178
+ 1. **Egress** — internet + cross-region + inter-zone data transfer, per-GB.
179
+ 2. **Cloud NAT** — per-GB processing + hourly. Use Private Google Access / PSC.
180
+ 3. **BigQuery on-demand scans** — per-TB; partition/cluster and limit columns.
181
+ 4. **Cloud Run / GKE over-provisioning** — instance-seconds on idle min-instances /
182
+ always-on CPU; over-large GKE node pools.
183
+ 5. **Spanner / Bigtable nodes** — billed even when idle.
184
+ 6. **Firestore operations** — per read/write at high fan-out.
185
+ 7. **Vertex AI tokens / always-on endpoints**.
186
+
187
+ Levers: Committed Use Discounts + Sustained Use Discounts (steady baseline), Spot/
188
+ Preemptible (fault-tolerant batch), right-sizing from Cloud Monitoring, Private Google
189
+ Access, GCS lifecycle tiering, BigQuery partitioning + `--maximum_bytes_billed`, budgets +
190
+ alerts.
191
+
192
+ ## Quotas (request increases *before* they bite)
193
+
194
+ Per-project/region: Compute CPUs + GPUs/TPUs per family, in-use external IPs, Cloud Run
195
+ max instances + CPU quota, Cloud Functions instances, Cloud SQL connections + instances,
196
+ Spanner nodes, Pub/Sub publish/throughput, BigQuery concurrent queries + slots, Vertex
197
+ AI per-model RPM/TPM, GCS request rate. Many are soft (raise via IAM & Admin → Quotas
198
+ with lead time); some are hard. Check `gcloud compute regions describe <region>` / the Quotas page
199
+ and plan around the hard ones.
200
+
201
+ ## Observability
202
+
203
+ Cloud **Operations Suite**: Cloud Monitoring (metrics + alerting policies), Cloud Logging
204
+ (Log Explorer + log-based metrics), Cloud Trace (distributed tracing), Error Reporting,
205
+ Profiler. Alert on the predictors of pain: Cloud Run instance count + request latency p99
206
+ + container startup, Pub/Sub subscription backlog + oldest-unacked-age, Cloud SQL
207
+ connections + CPU + replica lag, Spanner CPU + hot-split, BigQuery bytes-billed,
208
+ Cloud NAT allocation/dropped connections.
@@ -0,0 +1,137 @@
1
+ ---
2
+ name: gcp-deployment-preflight
3
+ description: Preflight validation for GCP infrastructure deployments (Terraform / gcloud / Deployment Manager / Config Connector). Run before any deploy. Validates templates (terraform validate/plan or gcloud deployment preview), cleans up stale/failed deployments and stuck resources, catches globally-unique naming conflicts (GCS bucket, project IDs, Artifact Registry), and checks quota/capacity limits. Activate when the active cloud is GCP and the user mentions deploying, validating, previewing infra changes, gcloud deploy, terraform plan/apply on GCP, or deploy failures.
4
+ ---
5
+
6
+ # GCP Deployment Preflight
7
+
8
+ Validate GCP infrastructure changes locally and clear blocking state *before* you
9
+ deploy, so CI doesn't discover what you could have caught. Continue through all steps
10
+ even if one fails — capture every issue, then fix them in a batch.
11
+
12
+ > Discipline: **batch your fixes.** Each deploy/CI run costs real minutes. Read the
13
+ > whole failing config, reason about every issue, fix them all, push once. One run per
14
+ > problem cluster, not one per error message.
15
+
16
+ ## When to use
17
+
18
+ - Before `terraform apply`, `gcloud run deploy`, `gcloud deployment-manager deployments
19
+ create/update`, or Config Connector / KCC applies.
20
+ - When preparing or reviewing GCP IaC (Terraform google provider, Deployment Manager,
21
+ Config Connector manifests).
22
+ - To preview what a deploy will change.
23
+ - After a failed deploy left a deployment or resource stuck.
24
+
25
+ ## Step 1 — Detect project type & confirm context
26
+
27
+ - **Terraform (most common for GCP IaC):** `*.tf` with the `google`/`google-beta`
28
+ provider, backend in GCS. → use the Terraform flow (Step 2a).
29
+ - **gcloud / Deployment Manager:** `*.yaml`/`*.jinja` DM configs, or imperative
30
+ `gcloud` deploys. → Step 2b.
31
+ - **Config Connector / KCC:** Kubernetes-style GCP-resource manifests applied to a
32
+ cluster. → validate with `kubectl apply --dry-run=server -f <manifests>` / `kubectl diff -f <manifests>`.
33
+ - Confirm context — wrong project is the expensive mistake:
34
+
35
+ ```bash
36
+ gcloud config get-value project
37
+ gcloud auth list # active account
38
+ gcloud config get-value compute/region
39
+ ```
40
+
41
+ ## Step 2 — Validate & preview
42
+
43
+ ### 2a — Terraform
44
+
45
+ ```bash
46
+ terraform fmt -check
47
+ terraform validate
48
+ terraform plan -out=tfplan # the what-if; review creates/changes/destroys
49
+ terraform show -no-color tfplan # inspect details
50
+ ```
51
+
52
+ Review the plan for **destroys / replacements of stateful resources** (Cloud SQL,
53
+ Spanner, GCS buckets, persistent disks) and any IAM-binding changes. Never rubber-stamp
54
+ a replacement of a data store.
55
+
56
+ ### 2b — gcloud / Deployment Manager
57
+
58
+ ```bash
59
+ # Deployment Manager dry-run preview
60
+ gcloud deployment-manager deployments update <name> --config config.yaml --preview
61
+ gcloud deployment-manager deployments describe <name> # inspect the preview
62
+
63
+ # Cloud Run: deploy a real revision that receives no traffic (existing services only; --no-traffic is rejected on a first deploy)
64
+ gcloud run deploy <svc> --image <img> --no-traffic --tag preflight # revision w/o traffic
65
+ ```
66
+
67
+ ## Step 3 — Clean up stale / failed deployments & stuck resources
68
+
69
+ ```bash
70
+ # Deployment Manager: failed deployments block re-create
71
+ gcloud deployment-manager deployments list \
72
+ --filter="operation.status!=DONE OR operation.error:*"
73
+ gcloud deployment-manager deployments delete <name> # if stuck/failed
74
+
75
+ # Recent failed operations (root cause — read the first error, not the cascade)
76
+ gcloud logging read 'severity>=ERROR' --limit 20 --freshness=1h
77
+ ```
78
+
79
+ Terraform: a partial apply can leave real resources that are missing from, or only partly recorded in, state. Use
80
+ `terraform state list` + `terraform import` to reconcile, or re-run the apply scoped with `-target`.
81
+ Watch for **resources that block deletion** — non-empty GCS buckets, Artifact Registry
82
+ repos with images, resources with `deletion_protection = true` (Cloud SQL, Spanner),
83
+ in-use IPs/networks. Clear them first.
84
+
85
+ ## Step 4 — Globally-unique naming conflicts
86
+
87
+ | Resource | Namespace | Conflict mode |
88
+ | --- | --- | --- |
89
+ | **GCS bucket** | Global (all of GCP) | name taken / recently-deleted names reserved |
90
+ | **Project ID** | Global, **immutable**, not reusable | must be unique forever |
91
+ | **Artifact Registry repo** | Per project+location | already-exists |
92
+ | Cloud Run service, Pub/Sub topic, etc. | Per project (+region) | recreate collisions after partial deploys |
93
+
94
+ **Pattern:** never hard-code a globally-unique name — add a short unique suffix
95
+ (project-id fragment / random) via a Terraform variable or `random_id`, and check
96
+ availability before deploy (`gsutil ls -b gs://<name>` / `gcloud artifacts repositories
97
+ describe`). Project IDs especially are permanent — pick deliberately.
98
+
99
+ ## Step 5 — Quota & capacity check
100
+
101
+ Deploys fail late when a quota is hit. Pre-check what the change will consume:
102
+
103
+ ```bash
104
+ gcloud compute regions describe <region> \
105
+ --format="table(quotas.metric, quotas.limit, quotas.usage)"
106
+ gcloud compute project-info describe \
107
+ --format="table(quotas.metric, quotas.limit, quotas.usage)"
108
+ ```
109
+
110
+ Common deploy-blocking quotas: Compute CPUs / in-use external IPs per region, GPUs/TPUs,
111
+ Cloud Run CPU + max-instances quota, Cloud SQL instances, Spanner nodes, VPC networks/
112
+ subnets, and **API enablement** — a deploy fails if the required API isn't enabled
113
+ (`gcloud services list --enabled`; enable with `gcloud services enable`). Soft quotas
114
+ need an IAM & Admin → Quotas request with lead time — raise them before the deploy.
115
+
116
+ ## Step 6 — Check for an in-flight deploy before triggering
117
+
118
+ If CI auto-deploys on push, don't fire a manual deploy on top — the runs race.
119
+
120
+ ```bash
121
+ gh run list --workflow="<deploy workflow>" --limit 3
122
+ gcloud deployment-manager operations list --filter="status!=DONE"
123
+ ```
124
+
125
+ If a deploy is in progress, wait.
126
+
127
+ ## Step 7 — Report
128
+
129
+ Summarize: validate/plan results (creates / modifies / **destroys** / replacements),
130
+ deployments cleaned up, naming overrides applied, required-API + quota headroom, and
131
+ whether it's safe to deploy. Flag any replacement of a stateful resource and any IAM
132
+ change.
133
+
134
+ ## Tool requirements
135
+
136
+ `gcloud` CLI, `terraform` (for TF projects), `gsutil`, `gh` (if CI-driven). Verify auth:
137
+ `gcloud auth list` / `gcloud config get-value project`.