cpflow 5.0.4 → 5.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/.github/actions/cpflow-wait-for-health/action.yml +11 -4
  3. data/.github/workflows/cpflow-promote-staging-to-production.yml +269 -43
  4. data/.github/workflows/rspec-shared.yml +8 -1
  5. data/CHANGELOG.md +28 -1
  6. data/Gemfile.lock +1 -1
  7. data/README.md +36 -11
  8. data/docs/ai-github-flow-prompt.md +1 -1
  9. data/docs/assets/logo/favicon.ico +0 -0
  10. data/docs/assets/logo/icon-1024.png +0 -0
  11. data/docs/assets/logo/icon-128.png +0 -0
  12. data/docs/assets/logo/icon-16.png +0 -0
  13. data/docs/assets/logo/icon-192.png +0 -0
  14. data/docs/assets/logo/icon-24.png +0 -0
  15. data/docs/assets/logo/icon-32.png +0 -0
  16. data/docs/assets/logo/icon-48.png +0 -0
  17. data/docs/assets/logo/icon-512.png +0 -0
  18. data/docs/assets/logo/icon-64.png +0 -0
  19. data/docs/assets/logo/icon-tile.svg +17 -0
  20. data/docs/assets/logo/mark-transparent.svg +16 -0
  21. data/docs/ci-automation.md +137 -47
  22. data/docs/commands.md +13 -3
  23. data/docs/postgres.md +6 -0
  24. data/docs/rds-private-networking.md +649 -0
  25. data/docs/secrets-and-env-values.md +49 -0
  26. data/docs/tips.md +256 -10
  27. data/examples/controlplane.yml +8 -0
  28. data/lib/command/ai_github_flow_prompt.rb +1 -1
  29. data/lib/command/apply_template.rb +3 -0
  30. data/lib/command/base.rb +69 -0
  31. data/lib/command/cleanup_stale_apps.rb +1 -1
  32. data/lib/command/delete.rb +85 -10
  33. data/lib/command/deploy_image.rb +30 -8
  34. data/lib/command/generate_github_actions.rb +6 -0
  35. data/lib/command/maintenance_off.rb +1 -0
  36. data/lib/command/maintenance_on.rb +1 -0
  37. data/lib/command/run.rb +25 -5
  38. data/lib/command/setup_app.rb +11 -2
  39. data/lib/core/config.rb +81 -0
  40. data/lib/core/controlplane.rb +15 -5
  41. data/lib/core/maintenance_mode.rb +93 -6
  42. data/lib/core/template_parser.rb +4 -0
  43. data/lib/cpflow/version.rb +1 -1
  44. data/lib/generator_templates/controlplane.yml +7 -0
  45. data/lib/generator_templates_sqlite/controlplane.yml +7 -0
  46. data/lib/github_flow_templates/.github/cpflow-help.md +48 -13
  47. data/lib/github_flow_templates/.github/workflows/cpflow-promote-staging-to-production.yml +768 -15
  48. data/lib/github_flow_templates/bin/pin-cpflow-github-ref +17 -3
  49. data/lib/github_flow_templates/bin/test-cpflow-github-flow +61 -9
  50. metadata +15 -2
@@ -9,22 +9,775 @@ on:
9
9
  type: string
10
10
 
11
11
  permissions:
12
- # The upstream reusable workflow's create-github-release job needs
13
- # contents: write, and callers must grant the union of callee permissions.
14
- contents: write
12
+ contents: read
13
+
14
+ env:
15
+ # Override these by editing this file or by setting the matching repository variable.
16
+ # Worst-case wall time per attempt is HEALTH_CHECK_INTERVAL plus the curl --max-time below
17
+ # (10s), so the defaults give a ~10 minute window (24 × (15 + 10) = 600s) — enough for
18
+ # most Rails cold boots (asset precompile + db:migrate + workload readiness).
19
+ HEALTH_CHECK_RETRIES: ${{ vars.HEALTH_CHECK_RETRIES || '24' }}
20
+ HEALTH_CHECK_INTERVAL: ${{ vars.HEALTH_CHECK_INTERVAL || '15' }}
21
+ # Space-separated list of HTTP statuses considered healthy. The default accepts 301/302
22
+ # because `curl` is invoked without `-L`, so a root `/` that redirects to a login page
23
+ # (common for Rails apps that auth-gate `/`) would otherwise be reported as unhealthy
24
+ # despite the workload itself being up.
25
+ #
26
+ # Strongly recommended: expose a dedicated `/health` endpoint that returns `200` and set
27
+ # HEALTH_CHECK_ACCEPTED_STATUSES to `"200"` in repository variables. The 301/302 default
28
+ # trades correctness for ergonomics — a maintenance-mode redirect or an auth-gate redirect
29
+ # to a login page can pass this check even when the underlying app is broken. Override
30
+ # via the HEALTH_CHECK_ACCEPTED_STATUSES repo variable to tighten this for apps that
31
+ # expose a dedicated health endpoint (e.g. "200" for a plain /health, or "200 401 403"
32
+ # for apps that auth-gate / without redirecting).
33
+ HEALTH_CHECK_ACCEPTED_STATUSES: ${{ vars.HEALTH_CHECK_ACCEPTED_STATUSES || '200 301 302' }}
34
+ COPY_IMAGE_RETRIES: ${{ vars.COPY_IMAGE_RETRIES || '3' }}
35
+ COPY_IMAGE_RETRY_INTERVAL: ${{ vars.COPY_IMAGE_RETRY_INTERVAL || '20' }}
36
+ ROLLBACK_READINESS_RETRIES: ${{ vars.ROLLBACK_READINESS_RETRIES || '24' }}
37
+ ROLLBACK_READINESS_INTERVAL: ${{ vars.ROLLBACK_READINESS_INTERVAL || '15' }}
38
+
39
+ concurrency:
40
+ # Single global group: only one production promotion may run at a time across the
41
+ # whole repo. Independent of staging deploys and review-app workflows (different
42
+ # GVCs / different concurrency keys), so those can still run in parallel.
43
+ group: cpflow-promote-staging-to-production
44
+ # Don't cancel an in-flight promotion: a half-finished `cpflow deploy-image` plus a
45
+ # rollback can leave production in a worse state than letting the first run finish.
46
+ cancel-in-progress: false
15
47
 
16
48
  jobs:
17
49
  promote-to-production:
18
50
  if: github.event.inputs.confirm_promotion == 'promote'
19
- uses: shakacode/control-plane-flow/.github/workflows/cpflow-promote-staging-to-production.yml@__CPFLOW_GITHUB_ACTIONS_REF__
20
- with:
21
- # Keep CPLN_TOKEN_PRODUCTION as a secret on this protected GitHub
22
- # Environment. The caller passes the environment name, the upstream
23
- # reusable workflow runs its production job in that environment, and
24
- # GitHub exposes environment secrets only after required reviewers approve.
25
- production_environment: production
26
- # Only pass the staging token explicitly. CPLN_TOKEN_PRODUCTION must live on
27
- # the protected production Environment, where GitHub exposes it only after
28
- # the required reviewers approve this job.
29
- secrets:
30
- CPLN_TOKEN_STAGING: ${{ secrets.CPLN_TOKEN_STAGING }}
51
+ runs-on: ubuntu-latest
52
+ # This normal caller-repo job declares the protected production Environment
53
+ # directly, so GitHub exposes environment secrets in this job after the
54
+ # environment gate. Do not move production promotion back behind a
55
+ # cross-repo reusable workflow; environment secrets are not available there.
56
+ environment: production
57
+ timeout-minutes: 45
58
+ outputs:
59
+ staging_app_name: ${{ steps.release-context.outputs.staging_app_name }}
60
+ production_app_name: ${{ steps.release-context.outputs.production_app_name }}
61
+
62
+ steps:
63
+ - name: Checkout repository
64
+ uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
65
+ with:
66
+ persist-credentials: false
67
+
68
+ - name: Checkout control-plane-flow actions
69
+ uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
70
+ with:
71
+ repository: shakacode/control-plane-flow
72
+ ref: __CPFLOW_GITHUB_ACTIONS_REF__
73
+ path: .cpflow
74
+ persist-credentials: false
75
+
76
+ - name: Validate production token
77
+ shell: bash
78
+ env:
79
+ # GitHub does not expose which secret scope supplied this value.
80
+ # Keep CPLN_TOKEN_PRODUCTION absent from repo/org secrets so the
81
+ # protected production Environment is the only configured source.
82
+ CPLN_TOKEN_PRODUCTION: ${{ secrets.CPLN_TOKEN_PRODUCTION }}
83
+ PRODUCTION_ENVIRONMENT: production
84
+ run: |
85
+ set -euo pipefail
86
+
87
+ if [[ -z "${CPLN_TOKEN_PRODUCTION}" ]]; then
88
+ echo "::error::CPLN_TOKEN_PRODUCTION is not set. Add it as a secret on the '${PRODUCTION_ENVIRONMENT}' GitHub Environment."
89
+ exit 1
90
+ fi
91
+
92
+ - name: Validate required secrets and variables
93
+ uses: ./.cpflow/.github/actions/cpflow-validate-config
94
+ # Pass secrets via env so the composite action checks indirect shell
95
+ # variables instead of interpolating secret values into a run script.
96
+ env:
97
+ CPLN_TOKEN_STAGING: ${{ secrets.CPLN_TOKEN_STAGING }}
98
+ CPLN_TOKEN_PRODUCTION: ${{ secrets.CPLN_TOKEN_PRODUCTION }}
99
+ CPLN_ORG_STAGING: ${{ vars.CPLN_ORG_STAGING }}
100
+ CPLN_ORG_PRODUCTION: ${{ vars.CPLN_ORG_PRODUCTION }}
101
+ STAGING_APP_NAME: ${{ vars.STAGING_APP_NAME }}
102
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
103
+ with:
104
+ required: |
105
+ secret:CPLN_TOKEN_STAGING
106
+ secret:CPLN_TOKEN_PRODUCTION
107
+ variable:CPLN_ORG_STAGING
108
+ variable:CPLN_ORG_PRODUCTION
109
+ variable:STAGING_APP_NAME
110
+ variable:PRODUCTION_APP_NAME
111
+
112
+ - name: Normalize Control Plane org names
113
+ id: cpln-orgs
114
+ env:
115
+ CPLN_ORG_STAGING: ${{ vars.CPLN_ORG_STAGING }}
116
+ CPLN_ORG_PRODUCTION: ${{ vars.CPLN_ORG_PRODUCTION }}
117
+ shell: bash
118
+ run: |
119
+ set -euo pipefail
120
+
121
+ sanitize_control_plane_name() {
122
+ local label="$1"
123
+ local value="$2"
124
+
125
+ value="${value#"${value%%[![:space:]]*}"}"
126
+ value="${value%"${value##*[![:space:]]}"}"
127
+
128
+ if [[ "${value}" == *$'\r'* || "${value}" == *$'\n'* ]]; then
129
+ echo "::error::${label} contains embedded line endings; remove them from the repository variable instead of relying on normalization." >&2
130
+ exit 1
131
+ fi
132
+
133
+ printf '%s' "${value}"
134
+ }
135
+
136
+ validate_control_plane_org() {
137
+ local label="$1"
138
+ local value="$2"
139
+
140
+ if ! [[ "${value}" =~ ^[a-z0-9]([a-z0-9-]*[a-z0-9])?$ ]]; then
141
+ local display_value
142
+ display_value="$(printf '%q' "${value}")"
143
+ echo "::error::${label} (${display_value}) must be a valid Control Plane org name; use lowercase alphanumeric characters and hyphens only, with no leading or trailing hyphen." >&2
144
+ exit 1
145
+ fi
146
+ }
147
+
148
+ staging_org="$(sanitize_control_plane_name "CPLN_ORG_STAGING" "${CPLN_ORG_STAGING}")"
149
+ production_org="$(sanitize_control_plane_name "CPLN_ORG_PRODUCTION" "${CPLN_ORG_PRODUCTION}")"
150
+
151
+ validate_control_plane_org "CPLN_ORG_STAGING" "${staging_org}"
152
+ validate_control_plane_org "CPLN_ORG_PRODUCTION" "${production_org}"
153
+
154
+ {
155
+ echo "staging=${staging_org}"
156
+ echo "production=${production_org}"
157
+ } >> "$GITHUB_OUTPUT"
158
+
159
+ - name: Capture release context
160
+ id: release-context
161
+ env:
162
+ STAGING_APP_NAME: ${{ vars.STAGING_APP_NAME }}
163
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
164
+ shell: bash
165
+ run: |
166
+ set -euo pipefail
167
+
168
+ {
169
+ echo "staging_app_name=${STAGING_APP_NAME}"
170
+ echo "production_app_name=${PRODUCTION_APP_NAME}"
171
+ } >> "$GITHUB_OUTPUT"
172
+
173
+ - name: Setup production environment
174
+ uses: ./.cpflow/.github/actions/cpflow-setup-environment
175
+ with:
176
+ token: ${{ secrets.CPLN_TOKEN_PRODUCTION }}
177
+ org: ${{ steps.cpln-orgs.outputs.production }}
178
+ working_directory: .cpflow
179
+ cpln_cli_version: ${{ vars.CPLN_CLI_VERSION }}
180
+ cpflow_version: ${{ vars.CPFLOW_VERSION }}
181
+ # The setup action validates CPFLOW_VERSION against this full workflow ref.
182
+ control_plane_flow_ref: shakacode/control-plane-flow/.github/workflows/cpflow-promote-staging-to-production.yml@__CPFLOW_GITHUB_ACTIONS_REF__
183
+
184
+ # Runs after Setup production environment so the pinned Ruby (>= 3.1) is on PATH.
185
+ # YAML.load_file(..., aliases: true) is not supported on Ruby 3.0 (system Ruby on ubuntu-22.04).
186
+ - name: Resolve production app workloads
187
+ id: workloads
188
+ env:
189
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
190
+ PRIMARY_WORKLOAD: ${{ vars.PRIMARY_WORKLOAD }}
191
+ shell: bash
192
+ run: |
193
+ set -euo pipefail
194
+
195
+ ruby - "${PRODUCTION_APP_NAME}" "${PRIMARY_WORKLOAD}" "${GITHUB_OUTPUT}" <<'RUBY'
196
+ require "yaml"
197
+
198
+ app = ARGV.fetch(0)
199
+ requested_primary = ARGV.fetch(1, "").to_s.strip
200
+ output_path = ARGV.fetch(2)
201
+ data = YAML.safe_load(File.read(".controlplane/controlplane.yml"), aliases: true)
202
+ apps = data["apps"] || {}
203
+ app_config = apps[app]
204
+
205
+ unless app_config
206
+ warn "Error: app '#{app}' is not defined under `apps:` in `.controlplane/controlplane.yml`."
207
+ warn " Fix the PRODUCTION_APP_NAME repository variable or add the app to controlplane.yml."
208
+ exit 1
209
+ end
210
+
211
+ workloads = Array(app_config["app_workloads"]).map(&:to_s).reject(&:empty?)
212
+ workloads = ["rails"] if workloads.empty?
213
+
214
+ primary =
215
+ if requested_primary.empty?
216
+ if workloads.length == 1
217
+ workloads.first
218
+ elsif workloads.include?("rails")
219
+ "rails"
220
+ else
221
+ puts "::error::PRIMARY_WORKLOAD is not configured and app '#{app}' has multiple workloads: #{workloads.join(', ')}."
222
+ warn " Set the PRIMARY_WORKLOAD repository variable to one of these workloads."
223
+ exit 1
224
+ end
225
+ elsif workloads.include?(requested_primary)
226
+ requested_primary
227
+ else
228
+ puts "::error::PRIMARY_WORKLOAD '#{requested_primary}' is not one of: #{workloads.join(', ')}."
229
+ exit 1
230
+ end
231
+
232
+ File.open(output_path, "a") do |output|
233
+ output.puts "names=#{workloads.join(',')}"
234
+ output.puts "primary=#{primary}"
235
+ end
236
+ RUBY
237
+
238
+ - name: Detect release phase support
239
+ id: release-phase
240
+ uses: ./.cpflow/.github/actions/cpflow-detect-release-phase
241
+ with:
242
+ app_name: ${{ vars.PRODUCTION_APP_NAME }}
243
+
244
+ - name: Verify production environment variables
245
+ env:
246
+ CPLN_TOKEN_STAGING: ${{ secrets.CPLN_TOKEN_STAGING }}
247
+ CPLN_TOKEN_PRODUCTION: ${{ secrets.CPLN_TOKEN_PRODUCTION }}
248
+ STAGING_APP_NAME: ${{ vars.STAGING_APP_NAME }}
249
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
250
+ CPLN_ORG_STAGING: ${{ steps.cpln-orgs.outputs.staging }}
251
+ CPLN_ORG_PRODUCTION: ${{ steps.cpln-orgs.outputs.production }}
252
+ WORKLOAD_NAMES: ${{ steps.workloads.outputs.names }}
253
+ shell: bash
254
+ run: |
255
+ set -euo pipefail
256
+
257
+ list_gvc_env_names() {
258
+ local token="$1"
259
+ local org="$2"
260
+ local app="$3"
261
+
262
+ CPLN_TOKEN="${token}" cpln gvc get "${app}" --org "${org}" -o json |
263
+ jq -r '.spec.env // [] | .[] | .name // empty' |
264
+ sort -u
265
+ }
266
+
267
+ list_workload_env_names() {
268
+ local token="$1"
269
+ local org="$2"
270
+ local app="$3"
271
+ local workload="$4"
272
+
273
+ CPLN_TOKEN="${token}" cpln workload get "${workload}" --gvc "${app}" --org "${org}" -o json |
274
+ jq -r '.spec.containers // [] | .[] | (.env // [])[]? | .name // empty' |
275
+ sort -u
276
+ }
277
+
278
+ check_required_vars() {
279
+ local staging_scope="$1"
280
+ local production_scope="$2"
281
+ local missing_message="$3"
282
+ local staging_vars="$4"
283
+ local production_vars="$5"
284
+ local missing_vars
285
+ local production_only_vars
286
+
287
+ if [[ -z "${staging_vars}" ]]; then
288
+ echo "Staging ${staging_scope} exposes no environment variables; skipping parity check."
289
+ return
290
+ fi
291
+
292
+ # Treat staging as the promotion source of truth: fail when a variable
293
+ # present in staging is missing in production. Production-only variables
294
+ # are allowed, but surface them so teams can spot drift.
295
+ missing_vars="$(comm -23 <(printf '%s\n' "${staging_vars}") <(printf '%s\n' "${production_vars}"))"
296
+ production_only_vars="$(comm -13 <(printf '%s\n' "${staging_vars}") <(printf '%s\n' "${production_vars}"))"
297
+
298
+ if [[ -n "${production_only_vars}" ]]; then
299
+ echo "::warning::Production ${production_scope} has environment variables that are not present in staging:"
300
+ echo "${production_only_vars}"
301
+ fi
302
+
303
+ if [[ -n "${missing_vars}" ]]; then
304
+ echo "::error::${missing_message}"
305
+ echo "${missing_vars}"
306
+ env_check_failed=1
307
+ fi
308
+ }
309
+
310
+ # check_required_vars intentionally mutates env_check_failed in this
311
+ # shell; keep calls outside subshells so failures aggregate before the
312
+ # final exit.
313
+ env_check_failed=0
314
+
315
+ staging_vars="$(list_gvc_env_names "${CPLN_TOKEN_STAGING}" "${CPLN_ORG_STAGING}" "${STAGING_APP_NAME}")"
316
+ production_vars="$(list_gvc_env_names "${CPLN_TOKEN_PRODUCTION}" "${CPLN_ORG_PRODUCTION}" "${PRODUCTION_APP_NAME}")"
317
+ check_required_vars \
318
+ "GVC '${STAGING_APP_NAME}'" \
319
+ "GVC '${PRODUCTION_APP_NAME}'" \
320
+ "Production GVC '${PRODUCTION_APP_NAME}' is missing environment variables that exist in staging" \
321
+ "${staging_vars}" \
322
+ "${production_vars}"
323
+
324
+ while IFS= read -r workload_name; do
325
+ [[ -n "${workload_name}" ]] || continue
326
+
327
+ staging_workload_vars="$(list_workload_env_names "${CPLN_TOKEN_STAGING}" "${CPLN_ORG_STAGING}" "${STAGING_APP_NAME}" "${workload_name}")"
328
+ production_workload_vars="$(list_workload_env_names "${CPLN_TOKEN_PRODUCTION}" "${CPLN_ORG_PRODUCTION}" "${PRODUCTION_APP_NAME}" "${workload_name}")"
329
+ check_required_vars \
330
+ "workload '${workload_name}'" \
331
+ "workload '${workload_name}'" \
332
+ "Production workload '${workload_name}' is missing environment variables that exist in staging" \
333
+ "${staging_workload_vars}" \
334
+ "${production_workload_vars}"
335
+ done < <(tr ',' '\n' <<< "${WORKLOAD_NAMES}")
336
+
337
+ exit "${env_check_failed}"
338
+
339
+ - name: Capture current production image
340
+ id: capture-current
341
+ env:
342
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
343
+ CPLN_ORG_PRODUCTION: ${{ steps.cpln-orgs.outputs.production }}
344
+ WORKLOAD_NAMES: ${{ steps.workloads.outputs.names }}
345
+ PRIMARY_WORKLOAD: ${{ steps.workloads.outputs.primary }}
346
+ shell: bash
347
+ run: |
348
+ set -euo pipefail
349
+
350
+ selected_workload="${PRIMARY_WORKLOAD}"
351
+ selected_image=""
352
+ selected_version=""
353
+ rollback_state='{}'
354
+
355
+ # Record every workload's containers, image, and version in rollback_state, and
356
+ # remember the primary workload's image/version as the current production release.
357
+ while IFS= read -r workload_name; do
358
+ [[ -n "${workload_name}" ]] || continue
359
+
360
+ workload_json="$(cpln workload get "${workload_name}" --gvc "${PRODUCTION_APP_NAME}" --org "${CPLN_ORG_PRODUCTION}" -o json)"
361
+ workload_image="$(echo "${workload_json}" | jq -r '.spec.containers[0].image // empty')"
362
+ workload_containers="$(echo "${workload_json}" | jq -c '.spec.containers | map({name, image})')"
363
+ workload_version="$(echo "${workload_json}" | jq -r '.version // empty')"  # absent version -> "" (not the string "null") so the empty check after the loop catches it
364
+
365
+ if [[ "${workload_name}" == "${selected_workload}" ]]; then
366
+ selected_image="${workload_image}"
367
+ selected_version="${workload_version}"
368
+ fi
369
+
370
+ rollback_state="$(
371
+ jq -c \
372
+ --arg workload "${workload_name}" \
373
+ --arg image "${workload_image}" \
374
+ --arg version "${workload_version}" \
375
+ --argjson containers "${workload_containers}" \
376
+ '. + {($workload): {image: $image, version: $version, containers: $containers}}' \
377
+ <<< "${rollback_state}"
378
+ )"
379
+ done < <(tr ',' '\n' <<< "${WORKLOAD_NAMES}")
380
+
381
+ if [[ -z "${selected_image}" || -z "${selected_version}" ]]; then
382
+ echo "::error::Could not capture current image/version for primary workload '${selected_workload}'." >&2
383
+ exit 1
384
+ fi
385
+
386
+ echo "current_image=${selected_image}" >> "$GITHUB_OUTPUT"
387
+ echo "current_version=${selected_version}" >> "$GITHUB_OUTPUT"
388
+ # Randomize the heredoc delimiter so a stray "EOF" line inside rollback_state can't terminate it early.
389
+ delim="EOF_$(openssl rand -hex 8)"
390
+ {
391
+ echo "rollback_state<<${delim}"
392
+ echo "${rollback_state}"
393
+ echo "${delim}"
394
+ } >> "$GITHUB_OUTPUT"
395
+
396
+ - name: Capture deployed staging image
397
+ id: staging-image
398
+ env:
399
+ CPLN_TOKEN_STAGING: ${{ secrets.CPLN_TOKEN_STAGING }}
400
+ STAGING_APP_NAME: ${{ vars.STAGING_APP_NAME }}
401
+ CPLN_ORG_STAGING: ${{ steps.cpln-orgs.outputs.staging }}
402
+ WORKLOAD_NAMES: ${{ steps.workloads.outputs.names }}
403
+ PRIMARY_WORKLOAD: ${{ steps.workloads.outputs.primary }}
404
+ shell: bash
405
+ run: |
406
+ set -euo pipefail
407
+
408
+ selected_workload="${PRIMARY_WORKLOAD}"
409
+ selected_image=""
410
+
411
+ while IFS= read -r workload_name; do
412
+ [[ -n "${workload_name}" ]] || continue
413
+
414
+ workload_json="$(CPLN_TOKEN="${CPLN_TOKEN_STAGING}" cpln workload get "${workload_name}" --gvc "${STAGING_APP_NAME}" --org "${CPLN_ORG_STAGING}" -o json)"
415
+ workload_image="$(echo "${workload_json}" | jq -r '.spec.containers[0].image // empty')"
416
+
417
+ if [[ -z "${workload_image}" ]]; then
418
+ echo "::error::Could not find an image on staging workload '${workload_name}'." >&2
419
+ exit 1
420
+ fi
421
+
422
+ if [[ "${workload_name}" == "${selected_workload}" ]]; then
423
+ selected_image="${workload_image}"
424
+ fi
425
+ done < <(tr ',' '\n' <<< "${WORKLOAD_NAMES}")
426
+
427
+ staging_image_ref="${selected_image}"
428
+ if [[ -z "${staging_image_ref}" ]]; then
429
+ echo "::error::Could not determine the deployed staging image for primary workload '${selected_workload}'." >&2
430
+ exit 1
431
+ fi
432
+
433
+ if [[ "${staging_image_ref}" == /org/*/image/* ]]; then
434
+ staging_image="${staging_image_ref##*/image/}"
435
+ elif [[ "${staging_image_ref}" == *.registry.cpln.io/* ]]; then
436
+ staging_image="${staging_image_ref#*.registry.cpln.io/}"
437
+ else
438
+ staging_image="${staging_image_ref}"
439
+ fi
440
+
441
+ echo "image=${staging_image}" >> "$GITHUB_OUTPUT"
442
+
443
+ - name: Set up Docker Buildx
444
+ uses: docker/setup-buildx-action@d7f5e7f509e45cec5c76c4d5afdd7de93d0b3df5
445
+
446
+ - name: Copy image from staging
447
+ id: copy-image
448
+ env:
449
+ CPLN_TOKEN_STAGING: ${{ secrets.CPLN_TOKEN_STAGING }}
450
+ CPLN_TOKEN_PRODUCTION: ${{ secrets.CPLN_TOKEN_PRODUCTION }}
451
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
452
+ CPLN_ORG_STAGING: ${{ steps.cpln-orgs.outputs.staging }}
453
+ CPLN_ORG_PRODUCTION: ${{ steps.cpln-orgs.outputs.production }}
454
+ STAGING_IMAGE: ${{ steps.staging-image.outputs.image }}
455
+ shell: bash
456
+ run: |
457
+ set -euo pipefail
458
+
459
+ if ! [[ "${COPY_IMAGE_RETRIES}" =~ ^[0-9]+$ ]]; then
460
+ echo "::error::COPY_IMAGE_RETRIES must be a non-negative integer."
461
+ exit 1
462
+ fi
463
+
464
+ if ! [[ "${COPY_IMAGE_RETRY_INTERVAL}" =~ ^[0-9]+$ ]]; then
465
+ echo "::error::COPY_IMAGE_RETRY_INTERVAL must be a non-negative integer."
466
+ exit 1
467
+ fi
468
+
469
+ copy_image_retries=$((10#${COPY_IMAGE_RETRIES}))
470
+ copy_image_attempts=$((copy_image_retries + 1))
471
+ copy_image_retry_interval=$((10#${COPY_IMAGE_RETRY_INTERVAL}))
472
+
473
+ staging_image="${STAGING_IMAGE}"
474
+ if [[ -z "${staging_image}" ]]; then
475
+ echo "::error::STAGING_IMAGE is not set or is empty."
476
+ exit 1
477
+ fi
478
+
479
+ if ! CPLN_TOKEN="${CPLN_TOKEN_STAGING}" cpln image get "${staging_image}" --org "${CPLN_ORG_STAGING}" -o json >/dev/null; then
480
+ echo "::error::Staging image '${STAGING_IMAGE}' was not found in org '${CPLN_ORG_STAGING}'; aborting promotion."
481
+ exit 1
482
+ fi
483
+
484
+ staging_tag=""
485
+ if [[ "${staging_image}" == *@* ]]; then
486
+ staging_tag="${staging_image##*@}"
487
+ elif [[ "${staging_image}" == *:* ]]; then
488
+ staging_tag="${staging_image##*:}"
489
+ fi
490
+ staging_commit=""
491
+ if [[ "${staging_tag}" == *_* ]]; then
492
+ staging_commit="${staging_tag##*_}"
493
+ else
494
+ echo "::warning::Staging image '${staging_image}' did not include a '_<commit>' suffix; production image tag will omit the commit suffix."
495
+ fi
496
+
497
+ # The workflow-level concurrency group serializes this sequence so two
498
+ # production promotions cannot derive and publish the same next tag.
499
+ # See the top-level concurrency group: cpflow-promote-staging-to-production.
500
+ latest_number="$(
501
+ cpln image query --org "${CPLN_ORG_PRODUCTION}" --prop "name~${PRODUCTION_APP_NAME}:" --max 0 -o json |
502
+ jq -r --arg prefix "${PRODUCTION_APP_NAME}:" \
503
+ '[.items[].name | select(startswith($prefix)) | (try capture("^[^:]+:(?<number>[0-9]+)") catch empty) | .number | tonumber] | max // 0'
504
+ )"
505
+ if ! [[ "${latest_number}" =~ ^[0-9]+$ ]]; then
506
+ echo "::error::Could not determine the next production image number for app '${PRODUCTION_APP_NAME}' in org '${CPLN_ORG_PRODUCTION}'."
507
+ exit 1
508
+ fi
509
+
510
+ production_image="${PRODUCTION_APP_NAME}:$((latest_number + 1))"
511
+ if [[ -n "${staging_commit}" ]]; then
512
+ production_image="${production_image}_${staging_commit}"
513
+ fi
514
+
515
+ staging_registry="${CPLN_ORG_STAGING}.registry.cpln.io"
516
+ production_registry="${CPLN_ORG_PRODUCTION}.registry.cpln.io"
517
+ source_image_ref="${staging_registry}/${STAGING_IMAGE}"
518
+ production_image_ref="${production_registry}/${production_image}"
519
+
520
+ docker_config_dir="$(mktemp -d)"
521
+ cleanup_copy_credentials() {
522
+ rm -rf "${docker_config_dir}"
523
+ }
524
+ trap cleanup_copy_credentials EXIT
525
+
526
+ export DOCKER_CONFIG="${docker_config_dir}"
527
+
528
+ if ! printf '%s' "${CPLN_TOKEN_STAGING}" |
529
+ docker login "${staging_registry}" -u '<token>' --password-stdin >/dev/null; then
530
+ echo "::error::Failed to authenticate to staging registry '${staging_registry}'."
531
+ exit 1
532
+ fi
533
+
534
+ if ! printf '%s' "${CPLN_TOKEN_PRODUCTION}" |
535
+ docker login "${production_registry}" -u '<token>' --password-stdin >/dev/null; then
536
+ echo "::error::Failed to authenticate to production registry '${production_registry}'."
537
+ exit 1
538
+ fi
539
+
540
+ if docker buildx imagetools inspect "${production_image_ref}" >/dev/null 2>&1; then
541
+ echo "::error::Production image '${production_image}' already exists in org '${CPLN_ORG_PRODUCTION}'; aborting to avoid overwriting it."
542
+ exit 1
543
+ fi
544
+
545
+ copy_status=1
546
+ for attempt in $(seq 1 "${copy_image_attempts}"); do
547
+ if docker buildx imagetools inspect "${source_image_ref}" >/dev/null &&
548
+ docker buildx imagetools create --prefer-index=false --tag "${production_image_ref}" "${source_image_ref}"; then
549
+ copy_status=0
550
+ break
551
+ else
552
+ copy_status=$?
553
+ fi
554
+
555
+ if [[ "${attempt}" -lt "${copy_image_attempts}" ]]; then
556
+ echo "::warning::Image copy attempt ${attempt}/${copy_image_attempts} failed with exit ${copy_status}; retrying in ${copy_image_retry_interval}s."
557
+ sleep "${copy_image_retry_interval}"
558
+ else
559
+ echo "::warning::Image copy attempt ${attempt}/${copy_image_attempts} failed with exit ${copy_status}; no attempts remain."
560
+ fi
561
+ done
562
+
563
+ if [[ "${copy_status}" -ne 0 ]]; then
564
+ echo "::error::Could not copy staging image '${STAGING_IMAGE}' from '${CPLN_ORG_STAGING}' to '${CPLN_ORG_PRODUCTION}' after ${copy_image_attempts} attempt(s)."
565
+ exit "${copy_status}"
566
+ fi
567
+
568
+ echo "image=${production_image}" >> "$GITHUB_OUTPUT"
569
+
570
+ - name: Deploy image to production
571
+ env:
572
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
573
+ CPLN_ORG_PRODUCTION: ${{ steps.cpln-orgs.outputs.production }}
574
+ RELEASE_PHASE_FLAG: ${{ steps.release-phase.outputs.flag }}
575
+ shell: bash
576
+ run: |
577
+ set -euo pipefail
578
+
579
+ deploy_args=(-a "${PRODUCTION_APP_NAME}")
580
+ if [[ -n "${RELEASE_PHASE_FLAG}" ]]; then
581
+ deploy_args+=("${RELEASE_PHASE_FLAG}")
582
+ fi
583
+ # `cpflow deploy-image` deploys the latest image for the app. The
584
+ # workflow-level concurrency group keeps production promotion copy and
585
+ # deploy steps coupled across workflow runs.
586
+ deploy_args+=(--org "${CPLN_ORG_PRODUCTION}" --verbose)
587
+
588
+ cpflow deploy-image "${deploy_args[@]}"
589
+
590
+ - name: Wait for deployment health
591
+ id: health-check
592
+ uses: ./.cpflow/.github/actions/cpflow-wait-for-health
593
+ with:
594
+ workload_name: ${{ steps.workloads.outputs.primary }}
595
+ app_name: ${{ vars.PRODUCTION_APP_NAME }}
596
+ org: ${{ steps.cpln-orgs.outputs.production }}
597
+ max_retries: ${{ env.HEALTH_CHECK_RETRIES }}
598
+ interval_seconds: ${{ env.HEALTH_CHECK_INTERVAL }}
599
+ accepted_statuses: ${{ env.HEALTH_CHECK_ACCEPTED_STATUSES }}
600
+
601
+ - name: Roll back on failure
602
+ if: failure() && steps.capture-current.outcome == 'success'
603
+ env:
604
+ ROLLBACK_STATE: ${{ steps.capture-current.outputs.rollback_state }}
605
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
606
+ CPLN_ORG_PRODUCTION: ${{ steps.cpln-orgs.outputs.production }}
607
+ shell: bash
608
+ run: |
609
+ # Best-effort rollback: try every workload, aggregate failures, exit non-zero at the end
610
+ # if any failed. A single cpln hiccup shouldn't leave other workloads mid-promotion.
611
+ # Keep -e disabled here so rollback can aggregate failures across workloads.
612
+ set -uo pipefail
613
+
614
+ rollback_failures=0
615
+ if ! rollback_entries="$(echo "${ROLLBACK_STATE}" | jq -r 'to_entries[] | "\(.key)\t\(.value.containers | @json)"')"; then
616
+ echo "::error::Could not parse rollback state; manual recovery may be required." >&2
617
+ exit 1
618
+ fi
619
+
620
+ while IFS=$'\t' read -r workload_name previous_containers; do
621
+ rollback_args=()
622
+ if ! current_names="$(cpln workload get "${workload_name}" --gvc "${PRODUCTION_APP_NAME}" --org "${CPLN_ORG_PRODUCTION}" -o json | jq -c '.spec.containers | map(.name)')"; then
623
+ echo "::warning::Could not retrieve current containers for workload '${workload_name}'; skipping rollback for this workload." >&2
624
+ rollback_failures=$((rollback_failures + 1))
625
+ continue
626
+ fi
627
+ if ! previous_names="$(echo "${previous_containers}" | jq -c 'map(.name)')"; then
628
+ echo "::warning::Could not parse captured containers for workload '${workload_name}'; skipping rollback for this workload." >&2
629
+ rollback_failures=$((rollback_failures + 1))
630
+ continue
631
+ fi
632
+
633
+ if [[ "$(echo "${current_names}" | jq -c 'sort')" != "$(echo "${previous_names}" | jq -c 'sort')" ]]; then
634
+ echo "::error::Container set changed for workload '${workload_name}'; refusing rollback." >&2
635
+ rollback_failures=$((rollback_failures + 1))
636
+ continue
637
+ fi
638
+
639
+ if ! rollback_container_entries="$(jq -r '.[] | "\(.name)\t\(.image)"' <<< "${previous_containers}")"; then
640
+ echo "::warning::Could not build rollback image list for workload '${workload_name}'; skipping rollback for this workload." >&2
641
+ rollback_failures=$((rollback_failures + 1))
642
+ continue
643
+ fi
644
+
645
+ while IFS=$'\t' read -r container_name image; do
646
+ rollback_args+=(--set "spec.containers.${container_name}.image=${image}")
647
+ done <<< "${rollback_container_entries}"
648
+
649
+ if ! cpln workload update "${workload_name}" \
650
+ --gvc "${PRODUCTION_APP_NAME}" \
651
+ --org "${CPLN_ORG_PRODUCTION}" \
652
+ "${rollback_args[@]}"; then
653
+ echo "::warning::Rollback failed for workload '${workload_name}'; continuing with remaining workloads." >&2
654
+ rollback_failures=$((rollback_failures + 1))
655
+ fi
656
+ done <<< "${rollback_entries}"
657
+
658
+ if [[ "${rollback_failures}" -gt 0 ]]; then
659
+ echo "::error::${rollback_failures} workload(s) failed to roll back; inspect the logs above." >&2
660
+ exit 1
661
+ fi
662
+
663
# Runs only after an earlier step failed and the pre-deploy state was captured.
# Polls each workload listed in ROLLBACK_STATE until both `.status.ready` and
# `.status.readyLatest` are true, and emits a warning for every workload that
# never gets there. This step reports; it does not fail the job by itself.
- name: Wait for rollback readiness
  if: failure() && steps.capture-current.outcome == 'success'
  env:
    ROLLBACK_STATE: ${{ steps.capture-current.outputs.rollback_state }}
    PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
    CPLN_ORG_PRODUCTION: ${{ steps.cpln-orgs.outputs.production }}
  shell: bash
  run: |
    set -euo pipefail

    # NOTE(review): ROLLBACK_READINESS_RETRIES and ROLLBACK_READINESS_INTERVAL
    # are not set in this step's `env`; presumably they come from workflow- or
    # job-level `env` -- confirm, since `set -u` aborts if they are missing.

    # The keys of the ROLLBACK_STATE JSON object are the workload names.
    mapfile -t workloads < <(echo "${ROLLBACK_STATE}" | jq -r 'keys[]')

    # Poll workloads in parallel so the worst-case wall time during a
    # production incident is `retries × interval` rather than scaling
    # linearly with the number of workloads. Each per-workload retry
    # loop runs in a backgrounded subshell that writes its final state
    # to a status file; the parent waits for all of them before
    # aggregating warnings, keeping the warnings ordered and deterministic.
    status_dir="$(mktemp -d)"
    trap 'rm -rf "${status_dir}"' EXIT

    pids=()
    for workload_name in "${workloads[@]}"; do
      [[ -n "${workload_name}" ]] || continue
      # Slashes would create sub-paths under status_dir; flatten them.
      status_name="${workload_name//\//_}"

      echo "Polling rollback readiness for workload '${workload_name}'..."
      (
        set -euo pipefail
        ready=false
        for attempt in $(seq 1 "${ROLLBACK_READINESS_RETRIES}"); do
          # Start every attempt pessimistic. A failed `cpln` call or an
          # unparsable response counts as "not ready yet" and is retried,
          # instead of tripping `set -e` and abandoning the remaining
          # attempts for this workload.
          deployment_ready=false
          latest_ready=false
          if workload_status="$(cpln workload get "${workload_name}" --gvc "${PRODUCTION_APP_NAME}" --org "${CPLN_ORG_PRODUCTION}" -o json)"; then
            deployment_ready="$(jq -r '.status.ready // false' <<< "${workload_status}" || echo false)"
            latest_ready="$(jq -r '.status.readyLatest // false' <<< "${workload_status}" || echo false)"
          else
            echo "Attempt ${attempt}/${ROLLBACK_READINESS_RETRIES}: could not fetch status for workload '${workload_name}'." >&2
          fi

          if [[ "${deployment_ready}" == "true" && "${latest_ready}" == "true" ]]; then
            ready=true
            break
          fi

          # No point sleeping after the final attempt.
          if [[ "${attempt}" -lt "${ROLLBACK_READINESS_RETRIES}" ]]; then
            sleep "${ROLLBACK_READINESS_INTERVAL}"
          fi
        done

        if [[ "${ready}" == "true" ]]; then
          printf 'ready\n' > "${status_dir}/${status_name}"
        else
          printf 'not_ready\n' > "${status_dir}/${status_name}"
        fi
      ) &
      pids+=("$!")
    done

    # `|| true` so a poller that still exits non-zero (e.g. an invalid
    # retry count passed to `seq`) doesn't abort the parent before the
    # others finish. Missing or non-`ready` status files are surfaced in
    # the aggregation loop below, so the failure is still visible to
    # operators.
    for pid in "${pids[@]}"; do
      wait "${pid}" || true
    done

    for workload_name in "${workloads[@]}"; do
      [[ -n "${workload_name}" ]] || continue
      status_name="${workload_name//\//_}"
      status_file="${status_dir}/${status_name}"
      if [[ ! -f "${status_file}" ]] || [[ "$(<"${status_file}")" != "ready" ]]; then
        echo "::warning::Workload '${workload_name}' did not report ready after rollback."
      fi
    done
732
+
733
# Always appends a short Markdown report to the job summary, whether the
# promotion succeeded or not.
- name: Promotion summary
  if: always()
  env:
    HEALTHY: ${{ steps.health-check.outputs.healthy }}
    PREVIOUS_IMAGE: ${{ steps.capture-current.outputs.current_image }}
    PREVIOUS_VERSION: ${{ steps.capture-current.outputs.current_version }}
    COPIED_IMAGE: ${{ steps.copy-image.outputs.image }}
  shell: bash
  run: |
    # Only the literal string "true" from the health check counts as success;
    # an empty or missing output falls through to the failure branch.
    # NOTE(review): the failure branch reports the previous image as deployed
    # without checking whether the rollback actually succeeded -- confirm that
    # is the intended wording.
    case "${HEALTHY}" in
      true)
        status_line="✅ Status: deployment successful"
        final_image="${COPIED_IMAGE}"
        ;;
      *)
        status_line="❌ Status: deployment failed"
        final_image="${PREVIOUS_IMAGE}"
        ;;
    esac

    # One argument per output line; the empty arguments are the blank lines.
    printf '%s\n' \
      "## Promotion Summary" \
      "" \
      "${status_line}" \
      "" \
      "Previous image: \`${PREVIOUS_IMAGE}\`" \
      "Previous version: ${PREVIOUS_VERSION}" \
      "Deployed image: \`${final_image}\`" \
      >> "$GITHUB_STEP_SUMMARY"
757
+
758
# Tags a GitHub release after a successful promotion. Skipped whenever the
# promote-to-production job did not finish with result `success`.
create-github-release:
  needs: promote-to-production
  if: needs.promote-to-production.result == 'success'
  runs-on: ubuntu-latest
  permissions:
    # `gh release create` needs write access to repository contents.
    contents: write

  steps:
    - name: Create GitHub release
      env:
        GH_REPO: ${{ github.repository }}
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        GITHUB_RUN_ID: ${{ github.run_id }}
        STAGING_APP_NAME: ${{ needs.promote-to-production.outputs.staging_app_name }}
        PRODUCTION_APP_NAME: ${{ needs.promote-to-production.outputs.production_app_name }}
      shell: bash
      run: |
        set -euo pipefail

        # Read the clock once and split the result. Two separate `date`
        # calls can straddle midnight and pair the old date with the new
        # day's time, producing a tag roughly 24 hours off.
        now="$(date '+%Y-%m-%d %H%M%S')"
        release_date="${now% *}"
        timestamp="${now#* }"
        # The run id suffix keeps tags unique even within the same second.
        release_tag="production-${release_date}-${timestamp}-${GITHUB_RUN_ID}"

        gh release create "${release_tag}" \
          --title "Production Release ${release_date} ${timestamp}" \
          --notes "Promoted ${STAGING_APP_NAME} to ${PRODUCTION_APP_NAME} on ${release_date} at ${timestamp}."