cpflow 5.0.4 → 5.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,22 +9,590 @@ on:
9
9
  type: string
10
10
 
11
11
  permissions:
12
- # The upstream reusable workflow's create-github-release job needs
13
- # contents: write, and callers must grant the union of callee permissions.
14
- contents: write
12
+ contents: read
13
+
14
+ env:
15
+ # Override these by editing this file or by setting the matching repository variable.
16
+ # Worst-case wall time per attempt is HEALTH_CHECK_INTERVAL plus the curl --max-time below
17
+ # (10s), so the defaults give a ~10 minute window (24 × (15 + 10) = 600s) — enough for
18
+ # most Rails cold boots (asset precompile + db:migrate + workload readiness).
19
+ HEALTH_CHECK_RETRIES: ${{ vars.HEALTH_CHECK_RETRIES || '24' }}
20
+ HEALTH_CHECK_INTERVAL: ${{ vars.HEALTH_CHECK_INTERVAL || '15' }}
21
+ # Space-separated list of HTTP statuses considered healthy. The default accepts 301/302
22
+ # because `curl` is invoked without `-L`, so a root `/` that redirects to a login page
23
+ # (common for Rails apps that auth-gate `/`) would otherwise be reported as unhealthy
24
+ # despite the workload itself being up.
25
+ #
26
+ # Strongly recommended: expose a dedicated `/health` endpoint that returns `200` and set
27
+ # HEALTH_CHECK_ACCEPTED_STATUSES to `"200"` in repository variables. The 301/302 default
28
+ # trades correctness for ergonomics — a maintenance-mode redirect or an auth-gate redirect
29
+ # to a login page can pass this check even when the underlying app is broken. Override
30
+ # via the HEALTH_CHECK_ACCEPTED_STATUSES repo variable to tighten this for apps that
31
+ # expose a dedicated health endpoint (e.g. "200" for a plain /health, or "200 401 403"
32
+ # for apps that auth-gate / without redirecting).
33
+ HEALTH_CHECK_ACCEPTED_STATUSES: ${{ vars.HEALTH_CHECK_ACCEPTED_STATUSES || '200 301 302' }}
34
+ COPY_IMAGE_RETRIES: ${{ vars.COPY_IMAGE_RETRIES || '3' }}
35
+ COPY_IMAGE_RETRY_INTERVAL: ${{ vars.COPY_IMAGE_RETRY_INTERVAL || '20' }}
36
+ ROLLBACK_READINESS_RETRIES: ${{ vars.ROLLBACK_READINESS_RETRIES || '24' }}
37
+ ROLLBACK_READINESS_INTERVAL: ${{ vars.ROLLBACK_READINESS_INTERVAL || '15' }}
38
+
39
+ concurrency:
40
+ # Single global group: only one production promotion may run at a time across the
41
+ # whole repo. Independent of staging deploys and review-app workflows (different
42
+ # GVCs / different concurrency keys), so those can still run in parallel.
43
+ group: cpflow-promote-staging-to-production
44
+ # Don't cancel an in-flight promotion: a half-finished `cpflow deploy-image` plus a
45
+ # rollback can leave production in a worse state than letting the first run finish.
46
+ cancel-in-progress: false
15
47
 
16
48
  jobs:
17
49
  promote-to-production:
18
50
  if: github.event.inputs.confirm_promotion == 'promote'
19
- uses: shakacode/control-plane-flow/.github/workflows/cpflow-promote-staging-to-production.yml@__CPFLOW_GITHUB_ACTIONS_REF__
20
- with:
21
- # Keep CPLN_TOKEN_PRODUCTION as a secret on this protected GitHub
22
- # Environment. The caller passes the environment name, the upstream
23
- # reusable workflow runs its production job in that environment, and
24
- # GitHub exposes environment secrets only after required reviewers approve.
25
- production_environment: production
26
- # Only pass the staging token explicitly. CPLN_TOKEN_PRODUCTION must live on
27
- # the protected production Environment, where GitHub exposes it only after
28
- # the required reviewers approve this job.
29
- secrets:
30
- CPLN_TOKEN_STAGING: ${{ secrets.CPLN_TOKEN_STAGING }}
51
+ runs-on: ubuntu-latest
52
+ # This normal caller-repo job declares the protected production Environment
53
+ # directly, so GitHub exposes environment secrets in this job after the
54
+ # environment gate. Do not move production promotion back behind a
55
+ # cross-repo reusable workflow; environment secrets are not available there.
56
+ environment: production
57
+ timeout-minutes: 45
58
+ outputs:
59
+ staging_app_name: ${{ steps.release-context.outputs.staging_app_name }}
60
+ production_app_name: ${{ steps.release-context.outputs.production_app_name }}
61
+
62
+ steps:
63
+ - name: Checkout repository
64
+ uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
65
+ with:
66
+ persist-credentials: false
67
+
68
+ - name: Checkout control-plane-flow actions
69
+ uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
70
+ with:
71
+ repository: shakacode/control-plane-flow
72
+ ref: __CPFLOW_GITHUB_ACTIONS_REF__
73
+ path: .cpflow
74
+ persist-credentials: false
75
+
76
+ - name: Validate production token
77
+ shell: bash
78
+ env:
79
+ # GitHub does not expose which secret scope supplied this value.
80
+ # Keep CPLN_TOKEN_PRODUCTION absent from repo/org secrets so the
81
+ # protected production Environment is the only configured source.
82
+ CPLN_TOKEN_PRODUCTION: ${{ secrets.CPLN_TOKEN_PRODUCTION }}
83
+ PRODUCTION_ENVIRONMENT: production
84
+ run: |
85
+ set -euo pipefail
86
+
87
+ if [[ -z "${CPLN_TOKEN_PRODUCTION}" ]]; then
88
+ echo "::error::CPLN_TOKEN_PRODUCTION is not set. Add it as a secret on the '${PRODUCTION_ENVIRONMENT}' GitHub Environment."
89
+ exit 1
90
+ fi
91
+
92
+ - name: Validate required secrets and variables
93
+ uses: ./.cpflow/.github/actions/cpflow-validate-config
94
+ # Pass secrets via env so the composite action checks indirect shell
95
+ # variables instead of interpolating secret values into a run script.
96
+ env:
97
+ CPLN_TOKEN_STAGING: ${{ secrets.CPLN_TOKEN_STAGING }}
98
+ CPLN_TOKEN_PRODUCTION: ${{ secrets.CPLN_TOKEN_PRODUCTION }}
99
+ CPLN_ORG_STAGING: ${{ vars.CPLN_ORG_STAGING }}
100
+ CPLN_ORG_PRODUCTION: ${{ vars.CPLN_ORG_PRODUCTION }}
101
+ STAGING_APP_NAME: ${{ vars.STAGING_APP_NAME }}
102
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
103
+ with:
104
+ required: |
105
+ secret:CPLN_TOKEN_STAGING
106
+ secret:CPLN_TOKEN_PRODUCTION
107
+ variable:CPLN_ORG_STAGING
108
+ variable:CPLN_ORG_PRODUCTION
109
+ variable:STAGING_APP_NAME
110
+ variable:PRODUCTION_APP_NAME
111
+
112
+ - name: Capture release context
113
+ id: release-context
114
+ env:
115
+ STAGING_APP_NAME: ${{ vars.STAGING_APP_NAME }}
116
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
117
+ shell: bash
118
+ run: |
119
+ set -euo pipefail
120
+
121
+ {
122
+ echo "staging_app_name=${STAGING_APP_NAME}"
123
+ echo "production_app_name=${PRODUCTION_APP_NAME}"
124
+ } >> "$GITHUB_OUTPUT"
125
+
126
+ - name: Setup production environment
127
+ uses: ./.cpflow/.github/actions/cpflow-setup-environment
128
+ with:
129
+ token: ${{ secrets.CPLN_TOKEN_PRODUCTION }}
130
+ org: ${{ vars.CPLN_ORG_PRODUCTION }}
131
+ working_directory: .cpflow
132
+ cpln_cli_version: ${{ vars.CPLN_CLI_VERSION }}
133
+ cpflow_version: ${{ vars.CPFLOW_VERSION }}
134
+ # The setup action validates CPFLOW_VERSION against this full workflow ref.
135
+ control_plane_flow_ref: shakacode/control-plane-flow/.github/workflows/cpflow-promote-staging-to-production.yml@__CPFLOW_GITHUB_ACTIONS_REF__
136
+
137
+ # Runs after Setup production environment so the pinned Ruby (>= 3.1) is on PATH.
138
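+ # The script below parses controlplane.yml with YAML.safe_load(..., aliases: true) rather than YAML.load_file; YAML.load_file(..., aliases: true) is not supported on Ruby 3.0 (system Ruby on ubuntu-22.04).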
139
+ - name: Resolve production app workloads
140
+ id: workloads
141
+ env:
142
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
143
+ PRIMARY_WORKLOAD: ${{ vars.PRIMARY_WORKLOAD }}
144
+ shell: bash
145
+ run: |
146
+ set -euo pipefail
147
+
148
+ ruby - "${PRODUCTION_APP_NAME}" "${PRIMARY_WORKLOAD}" "${GITHUB_OUTPUT}" <<'RUBY'
149
+ require "yaml"
150
+
151
+ app = ARGV.fetch(0)
152
+ requested_primary = ARGV.fetch(1, "").to_s.strip
153
+ output_path = ARGV.fetch(2)
154
+ data = YAML.safe_load(File.read(".controlplane/controlplane.yml"), aliases: true)
155
+ apps = data["apps"] || {}
156
+ app_config = apps[app]
157
+
158
+ unless app_config
159
+ warn "Error: app '#{app}' is not defined under `apps:` in `.controlplane/controlplane.yml`."
160
+ warn " Fix the PRODUCTION_APP_NAME repository variable or add the app to controlplane.yml."
161
+ exit 1
162
+ end
163
+
164
+ workloads = Array(app_config["app_workloads"]).map(&:to_s).reject(&:empty?)
165
+ workloads = ["rails"] if workloads.empty?
166
+
167
+ primary =
168
+ if requested_primary.empty?
169
+ if workloads.length == 1
170
+ workloads.first
171
+ elsif workloads.include?("rails")
172
+ "rails"
173
+ else
174
+ puts "::error::PRIMARY_WORKLOAD is not configured and app '#{app}' has multiple workloads: #{workloads.join(', ')}."
175
+ warn " Set the PRIMARY_WORKLOAD repository variable to one of these workloads."
176
+ exit 1
177
+ end
178
+ elsif workloads.include?(requested_primary)
179
+ requested_primary
180
+ else
181
+ puts "::error::PRIMARY_WORKLOAD '#{requested_primary}' is not one of: #{workloads.join(', ')}."
182
+ exit 1
183
+ end
184
+
185
+ File.open(output_path, "a") do |output|
186
+ output.puts "names=#{workloads.join(',')}"
187
+ output.puts "primary=#{primary}"
188
+ end
189
+ RUBY
190
+
191
+ - name: Detect release phase support
192
+ id: release-phase
193
+ uses: ./.cpflow/.github/actions/cpflow-detect-release-phase
194
+ with:
195
+ app_name: ${{ vars.PRODUCTION_APP_NAME }}
196
+
197
+ - name: Verify production environment variables
198
+ env:
199
+ CPLN_TOKEN_STAGING: ${{ secrets.CPLN_TOKEN_STAGING }}
200
+ CPLN_TOKEN_PRODUCTION: ${{ secrets.CPLN_TOKEN_PRODUCTION }}
201
+ STAGING_APP_NAME: ${{ vars.STAGING_APP_NAME }}
202
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
203
+ CPLN_ORG_STAGING: ${{ vars.CPLN_ORG_STAGING }}
204
+ CPLN_ORG_PRODUCTION: ${{ vars.CPLN_ORG_PRODUCTION }}
205
+ shell: bash
206
+ run: |
207
+ set -euo pipefail
208
+
209
+ staging_vars="$(CPLN_TOKEN="${CPLN_TOKEN_STAGING}" cpln gvc get "${STAGING_APP_NAME}" --org "${CPLN_ORG_STAGING}" -o json | jq -r '.spec.env // [] | .[].name' | sort)"
210
+ production_vars="$(CPLN_TOKEN="${CPLN_TOKEN_PRODUCTION}" cpln gvc get "${PRODUCTION_APP_NAME}" --org "${CPLN_ORG_PRODUCTION}" -o json | jq -r '.spec.env // [] | .[].name' | sort)"
211
+
212
+ if [[ -z "${staging_vars}" ]]; then
213
+ echo "Staging GVC exposes no environment variables; skipping parity check."
214
+ exit 0
215
+ fi
216
+
217
+ # Treat staging as the promotion source of truth: fail when a variable
218
+ # present in staging is missing in production. Production-only variables
219
+ # are allowed, but surface them so teams can spot drift.
220
+ missing_vars="$(comm -23 <(printf '%s\n' "${staging_vars}") <(printf '%s\n' "${production_vars}"))"
221
+ production_only_vars="$(comm -13 <(printf '%s\n' "${staging_vars}") <(printf '%s\n' "${production_vars}"))"
222
+
223
+ if [[ -n "${production_only_vars}" ]]; then
224
+ echo "::warning::Production has environment variables that are not present in staging:"
225
+ echo "${production_only_vars}"
226
+ fi
227
+
228
+ if [[ -n "${missing_vars}" ]]; then
229
+ echo "::error::Production is missing environment variables that exist in staging"
230
+ echo "${missing_vars}"
231
+ exit 1
232
+ fi
233
+
234
+ - name: Capture current production image
235
+ id: capture-current
236
+ env:
237
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
238
+ CPLN_ORG_PRODUCTION: ${{ vars.CPLN_ORG_PRODUCTION }}
239
+ WORKLOAD_NAMES: ${{ steps.workloads.outputs.names }}
240
+ PRIMARY_WORKLOAD: ${{ steps.workloads.outputs.primary }}
241
+ shell: bash
242
+ run: |
243
+ set -euo pipefail
244
+
245
+ selected_workload="${PRIMARY_WORKLOAD}"
246
+ selected_image=""
247
+ selected_version=""
248
+ rollback_state='{}'
249
+
250
+ # Record every workload's image, version, and containers in rollback_state, and
251
+ # remember the primary workload's image and version as this step's current_image/current_version.
252
+ while IFS= read -r workload_name; do
253
+ [[ -n "${workload_name}" ]] || continue
254
+
255
+ workload_json="$(cpln workload get "${workload_name}" --gvc "${PRODUCTION_APP_NAME}" --org "${CPLN_ORG_PRODUCTION}" -o json)"
256
+ workload_image="$(echo "${workload_json}" | jq -r '.spec.containers[0].image // empty')"
257
+ workload_containers="$(echo "${workload_json}" | jq -c '.spec.containers | map({name, image})')"
258
+ workload_version="$(echo "${workload_json}" | jq -r '.version')"
259
+
260
+ if [[ "${workload_name}" == "${selected_workload}" ]]; then
261
+ selected_image="${workload_image}"
262
+ selected_version="${workload_version}"
263
+ fi
264
+
265
+ rollback_state="$(
266
+ jq -c \
267
+ --arg workload "${workload_name}" \
268
+ --arg image "${workload_image}" \
269
+ --arg version "${workload_version}" \
270
+ --argjson containers "${workload_containers}" \
271
+ '. + {($workload): {image: $image, version: $version, containers: $containers}}' \
272
+ <<< "${rollback_state}"
273
+ )"
274
+ done < <(tr ',' '\n' <<< "${WORKLOAD_NAMES}")
275
+
276
+ if [[ -z "${selected_image}" || -z "${selected_version}" ]]; then
277
+ echo "::error::Could not capture current image/version for primary workload '${selected_workload}'." >&2
278
+ exit 1
279
+ fi
280
+
281
+ echo "current_image=${selected_image}" >> "$GITHUB_OUTPUT"
282
+ echo "current_version=${selected_version}" >> "$GITHUB_OUTPUT"
283
+ # Randomize the heredoc delimiter so a stray "EOF" line inside rollback_state can't terminate it early.
284
+ delim="EOF_$(openssl rand -hex 8)"
285
+ {
286
+ echo "rollback_state<<${delim}"
287
+ echo "${rollback_state}"
288
+ echo "${delim}"
289
+ } >> "$GITHUB_OUTPUT"
290
+
291
+ - name: Capture deployed staging image
292
+ id: staging-image
293
+ env:
294
+ CPLN_TOKEN_STAGING: ${{ secrets.CPLN_TOKEN_STAGING }}
295
+ STAGING_APP_NAME: ${{ vars.STAGING_APP_NAME }}
296
+ CPLN_ORG_STAGING: ${{ vars.CPLN_ORG_STAGING }}
297
+ WORKLOAD_NAMES: ${{ steps.workloads.outputs.names }}
298
+ PRIMARY_WORKLOAD: ${{ steps.workloads.outputs.primary }}
299
+ shell: bash
300
+ run: |
301
+ set -euo pipefail
302
+
303
+ selected_workload="${PRIMARY_WORKLOAD}"
304
+ selected_image=""
305
+
306
+ while IFS= read -r workload_name; do
307
+ [[ -n "${workload_name}" ]] || continue
308
+
309
+ workload_json="$(CPLN_TOKEN="${CPLN_TOKEN_STAGING}" cpln workload get "${workload_name}" --gvc "${STAGING_APP_NAME}" --org "${CPLN_ORG_STAGING}" -o json)"
310
+ workload_image="$(echo "${workload_json}" | jq -r '.spec.containers[0].image // empty')"
311
+
312
+ if [[ -z "${workload_image}" ]]; then
313
+ echo "::error::Could not find an image on staging workload '${workload_name}'." >&2
314
+ exit 1
315
+ fi
316
+
317
+ if [[ "${workload_name}" == "${selected_workload}" ]]; then
318
+ selected_image="${workload_image}"
319
+ fi
320
+ done < <(tr ',' '\n' <<< "${WORKLOAD_NAMES}")
321
+
322
+ staging_image_ref="${selected_image}"
323
+ if [[ -z "${staging_image_ref}" ]]; then
324
+ echo "::error::Could not determine the deployed staging image for primary workload '${selected_workload}'." >&2
325
+ exit 1
326
+ fi
327
+
328
+ if [[ "${staging_image_ref}" == /org/*/image/* ]]; then
329
+ staging_image="${staging_image_ref##*/image/}"
330
+ elif [[ "${staging_image_ref}" == *.registry.cpln.io/* ]]; then
331
+ staging_image="${staging_image_ref#*.registry.cpln.io/}"
332
+ else
333
+ staging_image="${staging_image_ref}"
334
+ fi
335
+
336
+ echo "image=${staging_image}" >> "$GITHUB_OUTPUT"
337
+
338
+ - name: Copy image from staging
339
+ env:
340
+ # Pass the upstream token via env rather than `-t` so it doesn't appear in /proc/<pid>/cmdline.
341
+ CPLN_TOKEN_STAGING: ${{ secrets.CPLN_TOKEN_STAGING }}
342
+ CPLN_UPSTREAM_TOKEN: ${{ secrets.CPLN_TOKEN_STAGING }}
343
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
344
+ CPLN_ORG_STAGING: ${{ vars.CPLN_ORG_STAGING }}
345
+ CPLN_ORG_PRODUCTION: ${{ vars.CPLN_ORG_PRODUCTION }}
346
+ STAGING_IMAGE: ${{ steps.staging-image.outputs.image }}
347
+ shell: bash
348
+ run: |
349
+ set -euo pipefail
350
+
351
+ if ! [[ "${COPY_IMAGE_RETRIES}" =~ ^[0-9]+$ ]]; then
352
+ echo "::error::COPY_IMAGE_RETRIES must be a non-negative integer."
353
+ exit 1
354
+ fi
355
+
356
+ if ! [[ "${COPY_IMAGE_RETRY_INTERVAL}" =~ ^[0-9]+$ ]]; then
357
+ echo "::error::COPY_IMAGE_RETRY_INTERVAL must be a non-negative integer."
358
+ exit 1
359
+ fi
360
+
361
+ copy_image_retries=$((10#${COPY_IMAGE_RETRIES}))
362
+ copy_image_attempts=$((copy_image_retries + 1))
363
+ copy_image_retry_interval=$((10#${COPY_IMAGE_RETRY_INTERVAL}))
364
+
365
+ if ! CPLN_TOKEN="${CPLN_TOKEN_STAGING}" cpln image get "${STAGING_IMAGE}" --org "${CPLN_ORG_STAGING}" -o json >/dev/null; then
366
+ echo "::error::Staging image '${STAGING_IMAGE}' was not found in org '${CPLN_ORG_STAGING}'; aborting promotion."
367
+ exit 1
368
+ fi
369
+
370
+ copy_status=1
371
+ for attempt in $(seq 1 "${copy_image_attempts}"); do
372
+ if cpflow copy-image-from-upstream -a "${PRODUCTION_APP_NAME}" --org "${CPLN_ORG_PRODUCTION}" --image "${STAGING_IMAGE}"; then
373
+ copy_status=0
374
+ break
375
+ else
376
+ copy_status=$?
377
+ fi
378
+
379
+ if [[ "${attempt}" -lt "${copy_image_attempts}" ]]; then
380
+ echo "::warning::Image copy attempt ${attempt}/${copy_image_attempts} failed with exit ${copy_status}; retrying in ${copy_image_retry_interval}s."
381
+ sleep "${copy_image_retry_interval}"
382
+ else
383
+ echo "::warning::Image copy attempt ${attempt}/${copy_image_attempts} failed with exit ${copy_status}; no attempts remain."
384
+ fi
385
+ done
386
+
387
+ if [[ "${copy_status}" -ne 0 ]]; then
388
+ echo "::error::Could not copy staging image '${STAGING_IMAGE}' from '${CPLN_ORG_STAGING}' to '${CPLN_ORG_PRODUCTION}' after ${copy_image_attempts} attempt(s)."
389
+ exit "${copy_status}"
390
+ fi
391
+
392
+ - name: Deploy image to production
393
+ env:
394
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
395
+ CPLN_ORG_PRODUCTION: ${{ vars.CPLN_ORG_PRODUCTION }}
396
+ RELEASE_PHASE_FLAG: ${{ steps.release-phase.outputs.flag }}
397
+ shell: bash
398
+ run: |
399
+ set -euo pipefail
400
+
401
+ deploy_args=(-a "${PRODUCTION_APP_NAME}")
402
+ if [[ -n "${RELEASE_PHASE_FLAG}" ]]; then
403
+ deploy_args+=("${RELEASE_PHASE_FLAG}")
404
+ fi
405
+ deploy_args+=(--org "${CPLN_ORG_PRODUCTION}" --verbose)
406
+
407
+ cpflow deploy-image "${deploy_args[@]}"
408
+
409
+ - name: Wait for deployment health
410
+ id: health-check
411
+ uses: ./.cpflow/.github/actions/cpflow-wait-for-health
412
+ with:
413
+ workload_name: ${{ steps.workloads.outputs.primary }}
414
+ app_name: ${{ vars.PRODUCTION_APP_NAME }}
415
+ org: ${{ vars.CPLN_ORG_PRODUCTION }}
416
+ max_retries: ${{ env.HEALTH_CHECK_RETRIES }}
417
+ interval_seconds: ${{ env.HEALTH_CHECK_INTERVAL }}
418
+ accepted_statuses: ${{ env.HEALTH_CHECK_ACCEPTED_STATUSES }}
419
+
420
+ - name: Roll back on failure
421
+ if: failure() && steps.capture-current.outcome == 'success'
422
+ env:
423
+ ROLLBACK_STATE: ${{ steps.capture-current.outputs.rollback_state }}
424
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
425
+ CPLN_ORG_PRODUCTION: ${{ vars.CPLN_ORG_PRODUCTION }}
426
+ shell: bash
427
+ run: |
428
+ # Best-effort rollback: try every workload, aggregate failures, exit non-zero at the end
429
+ # if any failed. A single cpln hiccup shouldn't leave other workloads mid-promotion.
430
+ # Keep -e disabled here so rollback can aggregate failures across workloads.
431
+ set -uo pipefail
432
+
433
+ rollback_failures=0
434
+ if ! rollback_entries="$(echo "${ROLLBACK_STATE}" | jq -r 'to_entries[] | "\(.key)\t\(.value.containers | @json)"')"; then
435
+ echo "::error::Could not parse rollback state; manual recovery may be required." >&2
436
+ exit 1
437
+ fi
438
+
439
+ while IFS=$'\t' read -r workload_name previous_containers; do
440
+ rollback_args=()
441
+ if ! current_names="$(cpln workload get "${workload_name}" --gvc "${PRODUCTION_APP_NAME}" --org "${CPLN_ORG_PRODUCTION}" -o json | jq -c '.spec.containers | map(.name)')"; then
442
+ echo "::warning::Could not retrieve current containers for workload '${workload_name}'; skipping rollback for this workload." >&2
443
+ rollback_failures=$((rollback_failures + 1))
444
+ continue
445
+ fi
446
+ if ! previous_names="$(echo "${previous_containers}" | jq -c 'map(.name)')"; then
447
+ echo "::warning::Could not parse captured containers for workload '${workload_name}'; skipping rollback for this workload." >&2
448
+ rollback_failures=$((rollback_failures + 1))
449
+ continue
450
+ fi
451
+
452
+ if [[ "$(echo "${current_names}" | jq -c 'sort')" != "$(echo "${previous_names}" | jq -c 'sort')" ]]; then
453
+ echo "::error::Container set changed for workload '${workload_name}'; refusing rollback." >&2
454
+ rollback_failures=$((rollback_failures + 1))
455
+ continue
456
+ fi
457
+
458
+ if ! rollback_container_entries="$(jq -r '.[] | "\(.name)\t\(.image)"' <<< "${previous_containers}")"; then
459
+ echo "::warning::Could not build rollback image list for workload '${workload_name}'; skipping rollback for this workload." >&2
460
+ rollback_failures=$((rollback_failures + 1))
461
+ continue
462
+ fi
463
+
464
+ while IFS=$'\t' read -r container_name image; do
465
+ rollback_args+=(--set "spec.containers.${container_name}.image=${image}")
466
+ done <<< "${rollback_container_entries}"
467
+
468
+ if ! cpln workload update "${workload_name}" \
469
+ --gvc "${PRODUCTION_APP_NAME}" \
470
+ --org "${CPLN_ORG_PRODUCTION}" \
471
+ "${rollback_args[@]}"; then
472
+ echo "::warning::Rollback failed for workload '${workload_name}'; continuing with remaining workloads." >&2
473
+ rollback_failures=$((rollback_failures + 1))
474
+ fi
475
+ done <<< "${rollback_entries}"
476
+
477
+ if [[ "${rollback_failures}" -gt 0 ]]; then
478
+ echo "::error::${rollback_failures} workload(s) failed to roll back; inspect the logs above." >&2
479
+ exit 1
480
+ fi
481
+
482
+ - name: Wait for rollback readiness
483
+ if: failure() && steps.capture-current.outcome == 'success'
484
+ env:
485
+ ROLLBACK_STATE: ${{ steps.capture-current.outputs.rollback_state }}
486
+ PRODUCTION_APP_NAME: ${{ vars.PRODUCTION_APP_NAME }}
487
+ CPLN_ORG_PRODUCTION: ${{ vars.CPLN_ORG_PRODUCTION }}
488
+ shell: bash
489
+ run: |
490
+ set -euo pipefail
491
+
492
+ mapfile -t workloads < <(echo "${ROLLBACK_STATE}" | jq -r 'keys[]')
493
+
494
+ # Poll workloads in parallel so the worst-case wall time during a
495
+ # production incident is `retries × interval` rather than scaling
496
+ # linearly with the number of workloads. Each per-workload retry
497
+ # loop runs in a backgrounded subshell that writes its final state
498
+ # to a status file; the parent waits for all of them before
499
+ # aggregating warnings, keeping output ordered and deterministic.
500
+ status_dir="$(mktemp -d)"
501
+ trap 'rm -rf "${status_dir}"' EXIT
502
+
503
+ pids=()
504
+ for workload_name in "${workloads[@]}"; do
505
+ [[ -n "${workload_name}" ]] || continue
506
+ status_name="${workload_name//\//_}"
507
+
508
+ echo "Polling rollback readiness for workload '${workload_name}'..."
509
+ (
510
+ set -euo pipefail
511
+ ready=false
512
+ for attempt in $(seq 1 "${ROLLBACK_READINESS_RETRIES}"); do
513
+ deployment_ready="$(cpln workload get "${workload_name}" --gvc "${PRODUCTION_APP_NAME}" --org "${CPLN_ORG_PRODUCTION}" -o json | jq -r '.status.ready // false')"
514
+ if [[ "${deployment_ready}" == "true" ]]; then
515
+ ready=true
516
+ break
517
+ fi
518
+
519
+ if [[ "${attempt}" -lt "${ROLLBACK_READINESS_RETRIES}" ]]; then
520
+ sleep "${ROLLBACK_READINESS_INTERVAL}"
521
+ fi
522
+ done
523
+
524
+ if [[ "${ready}" == "true" ]]; then
525
+ printf 'ready\n' > "${status_dir}/${status_name}"
526
+ else
527
+ printf 'not_ready\n' > "${status_dir}/${status_name}"
528
+ fi
529
+ ) &
530
+ pids+=("$!")
531
+ done
532
+
533
+ # `|| true` so a single workload that fails to poll (e.g. transient
534
+ # cpln API error) doesn't abort the parent before the others finish.
535
+ # Missing or non-`ready` status files are surfaced in the aggregation
536
+ # loop below, so the failure is still visible to operators.
537
+ for pid in "${pids[@]}"; do
538
+ wait "${pid}" || true
539
+ done
540
+
541
+ for workload_name in "${workloads[@]}"; do
542
+ [[ -n "${workload_name}" ]] || continue
543
+ status_name="${workload_name//\//_}"
544
+ status_file="${status_dir}/${status_name}"
545
+ if [[ ! -f "${status_file}" ]] || [[ "$(<"${status_file}")" != "ready" ]]; then
546
+ echo "::warning::Workload '${workload_name}' did not report ready after rollback."
547
+ fi
548
+ done
549
+
550
+ - name: Promotion summary
551
+ if: always()
552
+ env:
553
+ HEALTHY: ${{ steps.health-check.outputs.healthy }}
554
+ PREVIOUS_IMAGE: ${{ steps.capture-current.outputs.current_image }}
555
+ PREVIOUS_VERSION: ${{ steps.capture-current.outputs.current_version }}
556
+ DEPLOYED_IMAGE: ${{ steps.staging-image.outputs.image }}
557
+ shell: bash
558
+ run: |
559
+ {
560
+ echo "## Promotion Summary"
561
+ echo
562
+ if [[ "${HEALTHY}" == "true" ]]; then
563
+ echo "✅ Status: deployment successful"
564
+ else
565
+ echo "❌ Status: deployment failed"
566
+ fi
567
+ echo
568
+ echo "Previous image: \`${PREVIOUS_IMAGE}\`"
569
+ echo "Previous version: ${PREVIOUS_VERSION}"
570
+ echo "Deployed image: \`${DEPLOYED_IMAGE}\`"
571
+ } >> "$GITHUB_STEP_SUMMARY"
572
+
573
+ create-github-release:
574
+ needs: promote-to-production
575
+ if: needs.promote-to-production.result == 'success'
576
+ runs-on: ubuntu-latest
577
+ permissions:
578
+ contents: write
579
+
580
+ steps:
581
+ - name: Create GitHub release
582
+ env:
583
+ GH_REPO: ${{ github.repository }}
584
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
585
+ GITHUB_RUN_ID: ${{ github.run_id }}
586
+ STAGING_APP_NAME: ${{ needs.promote-to-production.outputs.staging_app_name }}
587
+ PRODUCTION_APP_NAME: ${{ needs.promote-to-production.outputs.production_app_name }}
588
+ shell: bash
589
+ run: |
590
+ set -euo pipefail
591
+
592
+ release_date="$(date '+%Y-%m-%d')"
593
+ timestamp="$(date '+%H%M%S')"
594
+ release_tag="production-${release_date}-${timestamp}-${GITHUB_RUN_ID}"
595
+
596
+ gh release create "${release_tag}" \
597
+ --title "Production Release ${release_date} ${timestamp}" \
598
+ --notes "Promoted ${STAGING_APP_NAME} to ${PRODUCTION_APP_NAME} on ${release_date} at ${timestamp}."
@@ -8,15 +8,25 @@ USAGE = <<~USAGE
8
8
 
9
9
  Use a release tag for normal operation, e.g. v5.0.0.
10
10
  Use a full 40-character commit SHA for temporary unreleased upstream testing.
11
- This only updates generated reusable-workflow `uses:` refs. The called
12
- workflows load their own matching shared actions from that same workflow
13
- commit automatically. Regenerate from the cpflow gem when templates changed.
11
+ This updates generated reusable-workflow `uses:` refs plus the production
12
+ workflow's pinned control-plane-flow checkout and setup validation ref.
13
+ Regenerate from the cpflow gem when templates changed.
14
14
  Use --allow-moving-ref only for short-lived local branch/ref experiments.
15
15
  USAGE
16
16
 
17
17
  ALLOWED_OPTIONS = ["--allow-moving-ref"].freeze
18
18
  FULL_COMMIT_SHA = /\A[0-9a-f]{40}\z/i
19
19
  RELEASE_TAG = /\Av\d+\.\d+\.\d+(?:[-.][0-9A-Za-z][0-9A-Za-z.-]*)?\z/
20
+ PRODUCTION_WORKFLOW_REF = "shakacode/control-plane-flow/.github/workflows/" \
21
+ "cpflow-promote-staging-to-production.yml"
22
+ CPFLOW_CHECKOUT_REF_PATTERN = %r{
23
+ (^\s*-\s+name:\s+Checkout\ control-plane-flow\ actions\s*\n
24
+ (?:(?!^\s*-\s+name:).)*?
25
+ ^\s+repository:\s+shakacode/control-plane-flow\s*\n
26
+ (?:(?!^\s*-\s+name:).)*?
27
+ ^\s+ref:\s+)
28
+ [^\s]+
29
+ }mx
20
30
 
21
31
  options, positional = ARGV.partition { |arg| arg.start_with?("--") }
22
32
  unknown_options = options - ALLOWED_OPTIONS
@@ -57,8 +67,12 @@ end
57
67
  changed = []
58
68
  workflow_paths.each do |path|
59
69
  text = File.read(path)
70
+ production_setup_ref_pattern =
71
+ /(control_plane_flow_ref:\s+#{Regexp.escape(PRODUCTION_WORKFLOW_REF)}@)[^\s]+/
60
72
  updated = text
61
73
  .gsub(%r{(uses:\s+shakacode/control-plane-flow/\.github/workflows/[^@\s]+@)[^\s]+}, "\\1#{ref}")
74
+ .gsub(production_setup_ref_pattern, "\\1#{ref}")
75
+ .gsub(CPFLOW_CHECKOUT_REF_PATTERN, "\\1#{ref}")
62
76
 
63
77
  next if updated == text
64
78