@layr-labs/ecloud-sdk 1.0.0-dev.2 → 1.0.0-devep1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/compute.js CHANGED
@@ -148,6 +148,7 @@ var ENV_SOURCE_SCRIPT_NAME = "compute-source-env.sh";
148
148
  var KMS_CLIENT_BINARY_NAME = "kms-client";
149
149
  var KMS_SIGNING_KEY_NAME = "kms-signing-public-key.pem";
150
150
  var TLS_KEYGEN_BINARY_NAME = "tls-keygen";
151
+ var DRAIN_WATCHER_BINARY_NAME = "ecloud-drain-watcher";
151
152
  var CADDYFILE_NAME = "Caddyfile";
152
153
  var LAYERED_BUILD_DIR_PREFIX = "ecloud-layered-build";
153
154
 
@@ -421,7 +422,7 @@ var PushPermissionError = class extends Error {
421
422
  import Handlebars from "handlebars";
422
423
 
423
424
  // src/client/common/templates/Dockerfile.layered.tmpl
424
- var Dockerfile_layered_default = '{{#if includeTLS}}\n# Get Caddy from official image\nFROM caddy:2.10.2-alpine AS caddy\n{{/if}}\n\nFROM {{baseImage}}\n\n{{#if originalUser}}\n# Switch to root to perform setup (base image has non-root USER: {{originalUser}})\nUSER root\n{{/if}}\n\n# Copy core TEE components\nCOPY compute-source-env.sh /usr/local/bin/\nCOPY kms-client /usr/local/bin/\nCOPY kms-signing-public-key.pem /usr/local/bin/\n\n{{#if includeTLS}}\n# Copy Caddy from official image\nCOPY --from=caddy /usr/bin/caddy /usr/local/bin/caddy\n\n# Copy TLS components\nCOPY tls-keygen /usr/local/bin/\nCOPY Caddyfile /etc/caddy/\n{{/if}}\n\n{{#if originalUser}}\n# Make binaries executable (755 for executables, 644 for keys)\nRUN chmod 755 /usr/local/bin/compute-source-env.sh \\\n && chmod 755 /usr/local/bin/kms-client{{#if includeTLS}} \\\n && chmod 755 /usr/local/bin/tls-keygen \\\n && chmod 755 /usr/local/bin/caddy{{/if}} \\\n && chmod 644 /usr/local/bin/kms-signing-public-key.pem\n\n# Store original user - entrypoint will drop privileges to this user after TEE setup\nENV __ECLOUD_ORIGINAL_USER={{originalUser}}\n{{else}}\n# Make binaries executable (preserve existing permissions, just add execute)\nRUN chmod +x /usr/local/bin/compute-source-env.sh \\\n && chmod +x /usr/local/bin/kms-client{{#if includeTLS}} \\\n && chmod +x /usr/local/bin/tls-keygen{{/if}}\n{{/if}}\n\n{{#if logRedirect}}\n\nLABEL tee.launch_policy.log_redirect={{logRedirect}}\n{{/if}}\n{{#if resourceUsageAllow}}\n\nLABEL tee.launch_policy.monitoring_memory_allow={{resourceUsageAllow}}\n{{/if}}\n\nLABEL eigenx_cli_version={{ecloudCLIVersion}}\nLABEL eigenx_vm_image=eigen\n\n{{#if includeTLS}}\n# Expose both HTTP and HTTPS ports for Caddy\nEXPOSE 80 443\n{{/if}}\n\nENTRYPOINT ["/usr/local/bin/compute-source-env.sh"]\nCMD {{{originalCmd}}}\n';
425
+ var Dockerfile_layered_default = '{{#if includeTLS}}\n# Get Caddy from official image\nFROM caddy:2.10.2-alpine AS caddy\n{{/if}}\n\nFROM {{baseImage}}\n\n{{#if originalUser}}\n# Switch to root to perform setup (base image has non-root USER: {{originalUser}})\nUSER root\n{{/if}}\n\n# Copy core TEE components\nCOPY compute-source-env.sh /usr/local/bin/\nCOPY kms-client /usr/local/bin/\nCOPY kms-signing-public-key.pem /usr/local/bin/\n{{#if includeDrainWatcher}}\nCOPY ecloud-drain-watcher /usr/local/bin/\n{{/if}}\n\n{{#if includeTLS}}\n# Copy Caddy from official image\nCOPY --from=caddy /usr/bin/caddy /usr/local/bin/caddy\n\n# Copy TLS components\nCOPY tls-keygen /usr/local/bin/\nCOPY Caddyfile /etc/caddy/\n{{/if}}\n\n{{#if originalUser}}\n# Make binaries executable (755 for executables, 644 for keys)\nRUN chmod 755 /usr/local/bin/compute-source-env.sh \\\n && chmod 755 /usr/local/bin/kms-client{{#if includeDrainWatcher}} \\\n && chmod 755 /usr/local/bin/ecloud-drain-watcher{{/if}}{{#if includeTLS}} \\\n && chmod 755 /usr/local/bin/tls-keygen \\\n && chmod 755 /usr/local/bin/caddy{{/if}} \\\n && chmod 644 /usr/local/bin/kms-signing-public-key.pem\n\n# Store original user - entrypoint will drop privileges to this user after TEE setup\nENV __ECLOUD_ORIGINAL_USER={{originalUser}}\n{{else}}\n# Make binaries executable (preserve existing permissions, just add execute)\nRUN chmod +x /usr/local/bin/compute-source-env.sh \\\n && chmod +x /usr/local/bin/kms-client{{#if includeDrainWatcher}} \\\n && chmod +x /usr/local/bin/ecloud-drain-watcher{{/if}}{{#if includeTLS}} \\\n && chmod +x /usr/local/bin/tls-keygen{{/if}}\n{{/if}}\n\n{{#if logRedirect}}\n\nLABEL tee.launch_policy.log_redirect={{logRedirect}}\n{{/if}}\n{{#if resourceUsageAllow}}\n\nLABEL tee.launch_policy.monitoring_memory_allow={{resourceUsageAllow}}\n{{/if}}\n\n# Allow-list the envvars the ecloud-platform sets via GCE `tee-env-*`\n# metadata. 
Without this label, Confidential Space\'s launcher rejects\n# any `tee-env-*` override at container-start with\n# "env var {...} is not allowed to be overridden on this image" and\n# exits with code 1 \u2014 which terminates the VM before the entrypoint\n# ever runs. ECLOUD_PD_EXPECTED is set on PD-backed apps so the\n# entrypoint (compute-source-env.sh) knows to wait for the persistent\n# disk before exec\'ing the user workload. User-supplied env vars\n# flow through KMS (not tee-env-*) and don\'t need to be listed here.\nLABEL tee.launch_policy.allow_env_override=ECLOUD_PD_EXPECTED\n\nLABEL eigenx_cli_version={{ecloudCLIVersion}}\nLABEL eigenx_vm_image=eigen\nLABEL eigenx_container_contract=v1\n\n{{#if includeTLS}}\n# Expose both HTTP and HTTPS ports for Caddy\nEXPOSE 80 443\n{{/if}}\n\nENTRYPOINT ["/usr/local/bin/compute-source-env.sh"]\nCMD {{{originalCmd}}}\n';
425
426
 
426
427
  // src/client/common/templates/dockerfileTemplate.ts
427
428
  function processDockerfileTemplate(data) {
@@ -434,6 +435,49 @@ import Handlebars2 from "handlebars";
434
435
 
435
436
  // src/client/common/templates/compute-source-env.sh.tmpl
436
437
  var compute_source_env_sh_default = `#!/bin/sh
438
+ # EigenCompute container entrypoint script
439
+ # This script handles KMS secret fetching, TLS setup, and privilege dropping
440
+ # before executing the user's application.
441
+ #
442
+ # Handlebars template variables (replaced at build time by the CLI):
443
+ # kmsServerURL - URL of the KMS server
444
+ # userAPIURL - URL of the user API (ecloud-platform)
445
+ # The KMS signing public key is copied into the image as
446
+ # /usr/local/bin/kms-signing-public-key.pem at layer-build time by the CLI.
447
+ #
448
+ # ecloud-platform divergence from compute-tee:
449
+ # This script emits ECLOUD_READY / ECLOUD_FAIL / ECLOUD_AWAITING_USERDATA /
450
+ # ECLOUD_DETACHED markers to stdout at key lifecycle points. The GCP
451
+ # provisioner's serial-console watcher in ecloud-platform
452
+ # (pkg/services/infraService/providers/gcp/compute.go) parses those
453
+ # markers to gate "VM ready" and to coordinate the prewarm-detach
454
+ # upgrade flow. Without the markers, the platform's waitForStartupReady
455
+ # times out at ~10 minutes per deploy, rollback fires, and the VM is
456
+ # deleted \u2014 seen in dev on 2026-05-04 with an older copy of this
457
+ # template that lacked the markers.
458
+ #
459
+ # Prewarm-detach contract:
460
+ # - If ECLOUD_PD_EXPECTED=1 and /mnt/disks/userdata is not present at boot,
461
+ # emit ECLOUD_AWAITING_USERDATA and wait until the disk is attached.
462
+ # - On SIGTERM (drain-requested), forward to child, wait for exit, sync
463
+ # + unmount /mnt/disks/userdata, emit ECLOUD_DETACHED, exit.
464
+ # - ECLOUD_READY is emitted once runtime is bootstrapped (same as before).
465
+ # - ECLOUD_FAIL is emitted on any unrecoverable setup error.
466
+ # Keep the markers on any line that resolves a lifecycle outcome.
467
+ #
468
+ # This file is kept in lockstep with
469
+ # ecloud-platform/pkg/services/buildService/assets/compute-source-env.sh.tmpl
470
+ # \u2014 if you change one, change the other. Differences vs the platform copy
471
+ # are intentionally minimal:
472
+ # - Handlebars placeholders use the CLI's naming (kmsServerURL,
473
+ # userAPIURL) rather than the platform's (KMS_SERVER_URL,
474
+ # USER_API_URL). (See top of file for real placeholder syntax \u2014
475
+ # not repeated here so Handlebars doesn't expand it in this comment.)
476
+ # - KMS signing key is read from a file the CLI copies into the image,
477
+ # not heredoc-embedded in the script, because the CLI's image
478
+ # layering writes it as a separate file (kms-signing-public-key.pem).
479
+ # - TLS binary is \`tls-keygen\` (CLI-bundled) not \`tls-client\`.
480
+
437
481
  echo "compute-source-env.sh: Running setup script..."
438
482
 
439
483
  # Fetch and source environment variables from KMS
@@ -449,92 +493,93 @@ if /usr/local/bin/kms-client \\
449
493
  else
450
494
  echo "compute-source-env.sh: ERROR - Failed to fetch environment variables from KMS"
451
495
  echo "compute-source-env.sh: Exiting - cannot start user workload without KMS secrets"
496
+ echo "ECLOUD_FAIL kms_bootstrap"
452
497
  exit 1
453
498
  fi
454
499
 
455
- # Setup TLS if tls-keygen is present (which means TLS was configured at build time)
500
+ # Setup TLS if tls-keygen is present and DOMAIN is configured
456
501
  setup_tls() {
457
502
  # If tls-keygen isn't present, TLS wasn't configured during build
458
503
  if [ ! -x /usr/local/bin/tls-keygen ]; then
459
504
  echo "compute-source-env.sh: TLS not configured (no tls-keygen binary)"
460
505
  return 0
461
506
  fi
462
-
507
+
463
508
  local domain="\${DOMAIN:-}"
464
509
  local mnemonic="\${MNEMONIC:-}"
465
-
466
- # Since tls-keygen is present, TLS is expected - validate requirements
510
+
511
+ # If DOMAIN is not set or is localhost, skip TLS setup
467
512
  if [ -z "$domain" ] || [ "$domain" = "localhost" ]; then
468
- echo "compute-source-env.sh: ERROR - TLS binary present but DOMAIN not configured or is localhost"
469
- echo "compute-source-env.sh: Set DOMAIN environment variable to a valid domain"
470
- exit 1
513
+ echo "compute-source-env.sh: TLS skipped (DOMAIN not set or is localhost)"
514
+ return 0
471
515
  fi
472
-
516
+
473
517
  if [ -z "$mnemonic" ]; then
474
- echo "compute-source-env.sh: ERROR - TLS binary present but MNEMONIC not available"
518
+ echo "compute-source-env.sh: ERROR - TLS requested but MNEMONIC not available"
475
519
  echo "compute-source-env.sh: Cannot obtain TLS certificate without mnemonic"
520
+ echo "ECLOUD_FAIL tls_mnemonic_missing"
476
521
  exit 1
477
522
  fi
478
-
523
+
479
524
  if [ ! -x /usr/local/bin/caddy ]; then
480
- echo "compute-source-env.sh: ERROR - TLS binary present but Caddy not found"
525
+ echo "compute-source-env.sh: ERROR - TLS requested but Caddy not found"
526
+ echo "ECLOUD_FAIL tls_caddy_missing"
481
527
  exit 1
482
528
  fi
483
-
529
+
484
530
  echo "compute-source-env.sh: Setting up TLS for domain: $domain"
485
-
531
+
486
532
  # Obtain TLS certificate using ACME
487
- # Default to http-01, but allow override via ACME_CHALLENGE env var
488
533
  local challenge="\${ACME_CHALLENGE:-http-01}"
489
-
534
+
490
535
  # Check if we should use staging (for testing)
491
536
  local staging_flag=""
492
537
  if [ "\${ACME_STAGING:-false}" = "true" ]; then
493
538
  staging_flag="-staging"
494
- echo "compute-source-env.sh: Using Let's Encrypt STAGING environment (certificates won't be trusted)"
539
+ echo "compute-source-env.sh: Using Let's Encrypt STAGING environment"
495
540
  fi
496
-
541
+
497
542
  echo "compute-source-env.sh: Obtaining TLS certificate using $challenge challenge..."
498
- # Pass the API URL for certificate persistence
499
543
  if ! MNEMONIC="$mnemonic" DOMAIN="$domain" API_URL="{{userAPIURL}}" /usr/local/bin/tls-keygen \\
500
544
  -challenge "$challenge" \\
501
545
  $staging_flag; then
502
546
  echo "compute-source-env.sh: ERROR - Failed to obtain TLS certificate"
503
- echo "compute-source-env.sh: Certificate issuance failed for $domain"
547
+ echo "ECLOUD_FAIL tls_setup"
504
548
  exit 1
505
549
  fi
506
-
550
+
507
551
  echo "compute-source-env.sh: TLS certificate obtained successfully"
508
-
552
+
509
553
  # Validate Caddyfile before starting
510
- if ! /usr/local/bin/caddy validate --config /etc/caddy/Caddyfile --adapter caddyfile 2>/dev/null; then
511
- echo "compute-source-env.sh: ERROR - Invalid Caddyfile"
512
- echo "compute-source-env.sh: TLS was requested (DOMAIN=$domain) but setup failed"
513
- exit 1
514
- fi
515
-
516
- # Start Caddy in background
517
- echo "compute-source-env.sh: Starting Caddy reverse proxy..."
518
-
519
- # Check if Caddy logs should be enabled
520
- if [ "\${ENABLE_CADDY_LOGS:-false}" = "true" ]; then
521
- if ! /usr/local/bin/caddy start --config /etc/caddy/Caddyfile --adapter caddyfile 2>&1; then
522
- echo "compute-source-env.sh: ERROR - Failed to start Caddy"
523
- echo "compute-source-env.sh: TLS was requested (DOMAIN=$domain) but setup failed"
554
+ if [ -f /etc/caddy/Caddyfile ]; then
555
+ if ! /usr/local/bin/caddy validate --config /etc/caddy/Caddyfile --adapter caddyfile 2>/dev/null; then
556
+ echo "compute-source-env.sh: ERROR - Invalid Caddyfile"
557
+ echo "ECLOUD_FAIL tls_invalid_caddyfile"
524
558
  exit 1
525
559
  fi
526
- else
527
- # Redirect Caddy output to /dev/null to silence logs
528
- if ! /usr/local/bin/caddy start --config /etc/caddy/Caddyfile --adapter caddyfile >/dev/null 2>&1; then
529
- echo "compute-source-env.sh: ERROR - Failed to start Caddy"
530
- echo "compute-source-env.sh: TLS was requested (DOMAIN=$domain) but setup failed"
531
- exit 1
560
+
561
+ # Start Caddy in background
562
+ echo "compute-source-env.sh: Starting Caddy reverse proxy..."
563
+ if [ "\${ENABLE_CADDY_LOGS:-false}" = "true" ]; then
564
+ if ! /usr/local/bin/caddy start --config /etc/caddy/Caddyfile --adapter caddyfile 2>&1; then
565
+ echo "compute-source-env.sh: ERROR - Failed to start Caddy"
566
+ echo "ECLOUD_FAIL tls_caddy_start"
567
+ exit 1
568
+ fi
569
+ else
570
+ if ! /usr/local/bin/caddy start --config /etc/caddy/Caddyfile --adapter caddyfile >/dev/null 2>&1; then
571
+ echo "compute-source-env.sh: ERROR - Failed to start Caddy"
572
+ echo "ECLOUD_FAIL tls_caddy_start"
573
+ exit 1
574
+ fi
532
575
  fi
576
+
577
+ sleep 2
578
+ echo "compute-source-env.sh: Caddy started successfully"
579
+ else
580
+ echo "compute-source-env.sh: No Caddyfile found, skipping Caddy"
533
581
  fi
534
-
535
- # Give Caddy a moment to fully initialize
536
- sleep 2
537
- echo "compute-source-env.sh: Caddy started successfully"
582
+
538
583
  return 0
539
584
  }
540
585
 
@@ -545,15 +590,233 @@ setup_tls
545
590
  export KMS_SERVER_URL="{{kmsServerURL}}"
546
591
  export KMS_PUBLIC_KEY="$(cat /usr/local/bin/kms-signing-public-key.pem)"
547
592
 
593
+ # \u2500\u2500 Prewarm-detach: wait for PD if expected \u2500\u2500
594
+ # Orchestrator sets ECLOUD_PD_EXPECTED=1 on apps using StorageBackend=pd.
595
+ # When the prewarm path is used, the new VM boots WITHOUT the disk; we
596
+ # signal awaiting-userdata and poll until the disk is attached.
597
+ USERDATA_MOUNT="/mnt/disks/userdata"
598
+ USERDATA_DEV="/dev/disk/by-id/google-persistent_storage_1"
599
+
600
+ wait_for_userdata() {
601
+ if [ "\${ECLOUD_PD_EXPECTED:-0}" != "1" ]; then
602
+ return 0
603
+ fi
604
+ if mountpoint -q "$USERDATA_MOUNT" 2>/dev/null; then
605
+ echo "compute-source-env.sh: userdata already mounted at $USERDATA_MOUNT"
606
+ return 0
607
+ fi
608
+ # Refuse to proceed if the tools we need for safe first-attach
609
+ # detection are missing. Without blkid we cannot tell an empty new
610
+ # disk from an already-formatted one \u2014 running mkfs.ext4 on the
611
+ # latter would destroy data.
612
+ if ! command -v blkid >/dev/null 2>&1; then
613
+ echo "ECLOUD_FAIL pd_tools_missing"
614
+ exit 1
615
+ fi
616
+ echo "ECLOUD_AWAITING_USERDATA"
617
+ echo "compute-source-env.sh: waiting for PD at $USERDATA_DEV..."
618
+ # Poll for up to 10 minutes (120 * 5s). The orchestrator's overall
619
+ # attach timeout is shorter; the ceiling here just bounds the wait
620
+ # for manual / diagnostic scenarios.
621
+ local i=0
622
+ local mount_failures=0
623
+ while [ "$i" -lt 120 ]; do
624
+ if [ -e "$USERDATA_DEV" ]; then
625
+ mkdir -p "$USERDATA_MOUNT"
626
+ if mount -o noatime "$USERDATA_DEV" "$USERDATA_MOUNT" 2>/dev/null; then
627
+ echo "compute-source-env.sh: PD mounted at $USERDATA_MOUNT"
628
+ return 0
629
+ fi
630
+ # Disk present but mount failed. Check whether it has a
631
+ # recognized filesystem. \`blkid -s TYPE -o value\` prints the
632
+ # FS type (empty if none). We mkfs only when that output is
633
+ # empty. NOTE(review): blkid's exit status is not checked and its
634
+ # stderr is discarded, so a blkid failure (e.g. "device busy")
635
+ # also yields an empty TYPE and reaches mkfs \u2014 confirm intent.
636
+ local fstype
637
+ fstype=$(blkid -s TYPE -o value "$USERDATA_DEV" 2>/dev/null)
638
+ if [ -z "$fstype" ]; then
639
+ echo "compute-source-env.sh: formatting $USERDATA_DEV (first attach)"
640
+ mkfs.ext4 -F -L eclouddata "$USERDATA_DEV" >/dev/null 2>&1 || {
641
+ echo "ECLOUD_FAIL pd_mkfs_failed"
642
+ exit 1
643
+ }
644
+ mount -o noatime "$USERDATA_DEV" "$USERDATA_MOUNT" || {
645
+ echo "ECLOUD_FAIL pd_mount_after_format_failed"
646
+ exit 1
647
+ }
648
+ return 0
649
+ fi
650
+ # Disk has a filesystem but mount still failed. Give it a
651
+ # few retries to cover transient cases (device busy, udev
652
+ # still settling), but don't pretend this is an attach
653
+ # timeout if it persists.
654
+ mount_failures=$((mount_failures + 1))
655
+ if [ "$mount_failures" -ge 6 ]; then
656
+ echo "ECLOUD_FAIL pd_mount_failed"
657
+ exit 1
658
+ fi
659
+ else
660
+ # Device disappeared (e.g. udev re-enumeration between
661
+ # attach and mount). Reset the consecutive-failure counter
662
+ # so only true back-to-back mount failures trip
663
+ # pd_mount_failed; a device blip should not steal retries.
664
+ mount_failures=0
665
+ fi
666
+ i=$((i + 1))
667
+ sleep 5
668
+ done
669
+ echo "ECLOUD_FAIL pd_attach_timeout"
670
+ exit 1
671
+ }
672
+
673
+ wait_for_userdata
674
+
675
+ # \u2500\u2500 Prewarm-detach: install SIGTERM handler for graceful drain \u2500\u2500
676
+ # Orchestrator signals drain by setting the instance metadata key
677
+ # ECLOUD_DRAIN_REQUESTED=1, which a host-level agent translates into
678
+ # SIGTERM on PID 1. On SIGTERM we:
679
+ # 1. Forward to the child (wakes the user's app for graceful exit)
680
+ # 2. Wait for child exit
681
+ # 3. Sync + unmount the PD
682
+ # 4. Emit ECLOUD_DETACHED so the orchestrator can proceed to detach
683
+ CHILD_PID=""
684
+ _DRAIN_IN_PROGRESS=0
685
+
686
+ drain_handler() {
687
+ # Guard against re-entry if SIGTERM arrives twice (e.g. both the
688
+ # drain_watcher and an external signal fire in quick succession).
689
+ if [ "$_DRAIN_IN_PROGRESS" = "1" ]; then
690
+ return 0
691
+ fi
692
+ _DRAIN_IN_PROGRESS=1
693
+ echo "compute-source-env.sh: received drain signal, forwarding to child pgid=$CHILD_PID"
694
+ if [ -n "$CHILD_PID" ]; then
695
+ # Send to the process group so intermediate wrappers (su, sh -c,
696
+ # etc.) don't swallow the signal. The leading \`-\` targets the
697
+ # pgid, which equals the child's pid only when the child leads its
697
+ # own process group (job control enabled, or it called setsid).
698
+ # Otherwise the group kill fails and we signal the pid alone.
700
+ kill -TERM -"$CHILD_PID" 2>/dev/null || kill -TERM "$CHILD_PID" 2>/dev/null || true
701
+ # Give the app up to 30s to exit cleanly.
702
+ local i=0
703
+ while [ "$i" -lt 30 ] && kill -0 "$CHILD_PID" 2>/dev/null; do
704
+ i=$((i + 1))
705
+ sleep 1
706
+ done
707
+ if kill -0 "$CHILD_PID" 2>/dev/null; then
708
+ echo "compute-source-env.sh: child did not exit in 30s, sending SIGKILL"
709
+ kill -KILL -"$CHILD_PID" 2>/dev/null || kill -KILL "$CHILD_PID" 2>/dev/null || true
710
+ # Reap the process so its in-flight I/O is flushed to the
711
+ # filesystem before we sync + unmount. SIGKILL schedules
712
+ # death; wait guarantees it's complete.
713
+ wait "$CHILD_PID" 2>/dev/null || true
714
+ fi
715
+ fi
716
+ if [ "\${ECLOUD_PD_EXPECTED:-0}" = "1" ] && mountpoint -q "$USERDATA_MOUNT" 2>/dev/null; then
717
+ sync
718
+ if umount "$USERDATA_MOUNT" 2>/dev/null; then
719
+ echo "compute-source-env.sh: unmounted $USERDATA_MOUNT cleanly"
720
+ else
721
+ # Force lazy unmount as last resort \u2014 orchestrator still needs
722
+ # the DETACHED signal to proceed.
723
+ umount -l "$USERDATA_MOUNT" 2>/dev/null || true
724
+ echo "compute-source-env.sh: WARNING - used lazy unmount on $USERDATA_MOUNT"
725
+ fi
726
+ # ECLOUD_DETACHED is strictly a PD-lifecycle signal. Only emit
727
+ # it when we actually had a PD mount in play, so serial-log
728
+ # parsers and alerting for non-PD apps don't see spurious
729
+ # lifecycle markers on routine container SIGTERM.
730
+ echo "ECLOUD_DETACHED"
731
+ fi
732
+ # Always exit 0: drain is a managed shutdown and the orchestrator
733
+ # waits on ECLOUD_DETACHED, not the container exit code. Forwarding
734
+ # the child's exit status here would make a crash-during-drain look
735
+ # like a drain failure to whatever reads the container exit code.
736
+ exit 0
737
+ }
738
+ trap drain_handler TERM
739
+
740
+ # \u2500\u2500 Prewarm-detach: background drain watcher \u2500\u2500
741
+ # Container metadata delivery in Confidential Space is limited, so we
742
+ # poll the instance metadata server for ECLOUD_DRAIN_REQUESTED and
743
+ # raise SIGTERM on ourselves when it flips to "1".
744
+ #
745
+ # Try wget first (present in most Alpine bases), fall back to curl.
746
+ # If neither is present, drain watcher is disabled \u2014 the orchestrator
747
+ # will hit its drain timeout and fail the upgrade explicitly, which is
748
+ # the correct behavior (we cannot silently ignore a drain request).
749
+ _fetch_drain_flag() {
750
+ local url="http://metadata.google.internal/computeMetadata/v1/instance/attributes/ECLOUD_DRAIN_REQUESTED"
751
+ if command -v wget >/dev/null 2>&1; then
752
+ wget -q --tries=1 --timeout=2 --header='Metadata-Flavor: Google' -O - "$url" 2>/dev/null
753
+ elif command -v curl >/dev/null 2>&1; then
754
+ curl -sf --max-time 2 -H 'Metadata-Flavor: Google' "$url" 2>/dev/null
755
+ else
756
+ return 2
757
+ fi
758
+ }
759
+
760
+ drain_watcher() {
761
+ # Preflight: confirm we have an HTTP client
762
+ if ! _fetch_drain_flag >/dev/null 2>&1; then
763
+ # Either no http client available OR metadata server not
764
+ # responding yet. If no client, give up and log; otherwise the
765
+ # loop below will retry.
766
+ if ! command -v wget >/dev/null 2>&1 && ! command -v curl >/dev/null 2>&1; then
767
+ echo "compute-source-env.sh: WARNING - no wget/curl; drain_watcher disabled"
768
+ return 0
769
+ fi
770
+ fi
771
+ while true; do
772
+ local v
773
+ v=$(_fetch_drain_flag || true)
774
+ if [ "$v" = "1" ]; then
775
+ echo "compute-source-env.sh: drain_watcher saw ECLOUD_DRAIN_REQUESTED=1, signaling PID 1"
776
+ # The CS launcher runs this script directly as PID 1, so
777
+ # kill -TERM 1 delivers SIGTERM to the shell that installed
778
+ # the drain_handler trap. If the launch mechanism ever
779
+ # wraps this script in another process, this assumption
780
+ # breaks and drain will silently no-op \u2014 audit here.
781
+ kill -TERM 1 2>/dev/null || true
782
+ return 0
783
+ fi
784
+ sleep 2
785
+ done
786
+ }
787
+
788
+ if [ "\${ECLOUD_PD_EXPECTED:-0}" = "1" ]; then
789
+ # Assumption: the orchestrator only flips ECLOUD_DRAIN_REQUESTED=1
790
+ # after observing ECLOUD_AWAITING_USERDATA (old VM) or
791
+ # ECLOUD_READY (new VM), so CHILD_PID is always set by the time
792
+ # drain_handler fires. If drain somehow arrived in the tiny window
793
+ # between this watcher spawn and CHILD_PID assignment below,
794
+ # drain_handler would skip the child-kill branch and still emit
795
+ # ECLOUD_DETACHED \u2014 harmless because there's nothing to drain yet.
796
+ if [ -x /usr/local/bin/ecloud-drain-watcher ]; then
797
+ /usr/local/bin/ecloud-drain-watcher &
798
+ else
799
+ drain_watcher &
800
+ fi
801
+ fi
802
+
548
803
  echo "compute-source-env.sh: Environment sourced."
804
+ echo "ECLOUD_READY runtime_bootstrapped"
549
805
 
550
806
  # Drop privileges to original user for the application command
551
807
  if [ -n "$__ECLOUD_ORIGINAL_USER" ] && [ "$(id -u)" = "0" ]; then
552
808
  echo "compute-source-env.sh: Dropping privileges to user: $__ECLOUD_ORIGINAL_USER"
553
- exec su -s /bin/sh "$__ECLOUD_ORIGINAL_USER" -c 'exec "$@"' -- sh "$@"
809
+ # Must background the child so our trap can fire; exec replaces PID 1.
810
+ su -s /bin/sh "$__ECLOUD_ORIGINAL_USER" -c 'exec "$@"' -- sh "$@" &
811
+ CHILD_PID=$!
812
+ wait "$CHILD_PID"
813
+ exit $?
554
814
  fi
555
815
 
556
- exec "$@"
816
+ "$@" &
817
+ CHILD_PID=$!
818
+ wait "$CHILD_PID"
819
+ exit $?
557
820
  `;
558
821
 
559
822
  // src/client/common/templates/scriptTemplate.ts
@@ -745,6 +1008,8 @@ async function layerLocalImage(options, logger) {
745
1008
  logger.debug(`Found DOMAIN=${domainMatch[1]} in ${envFilePath}, including TLS components`);
746
1009
  }
747
1010
  }
1011
+ const drainWatcherSource = findBinary("ecloud-drain-watcher-linux-amd64");
1012
+ const includeDrainWatcher = fs.existsSync(drainWatcherSource);
748
1013
  const layeredDockerfileContent = processDockerfileTemplate({
749
1014
  baseImage: sourceImageRef,
750
1015
  originalCmd: JSON.stringify(originalCmd),
@@ -752,8 +1017,9 @@ async function layerLocalImage(options, logger) {
752
1017
  logRedirect,
753
1018
  resourceUsageAllow,
754
1019
  includeTLS,
755
- ecloudCLIVersion: "0.1.0"
1020
+ ecloudCLIVersion: "0.1.0",
756
1021
  // TODO: Get from package.json
1022
+ includeDrainWatcher
757
1023
  });
758
1024
  const scriptContent = processScriptTemplate({
759
1025
  kmsServerURL: environmentConfig.kmsServerURL,
@@ -763,7 +1029,8 @@ async function layerLocalImage(options, logger) {
763
1029
  environmentConfig,
764
1030
  layeredDockerfileContent,
765
1031
  scriptContent,
766
- includeTLS
1032
+ includeTLS,
1033
+ includeDrainWatcher ? drainWatcherSource : void 0
767
1034
  // logger
768
1035
  );
769
1036
  try {
@@ -778,7 +1045,7 @@ async function layerLocalImage(options, logger) {
778
1045
  fs.rmSync(tempDir, { recursive: true, force: true });
779
1046
  }
780
1047
  }
781
- async function setupLayeredBuildDirectory(environmentConfig, layeredDockerfileContent, scriptContent, includeTLS) {
1048
+ async function setupLayeredBuildDirectory(environmentConfig, layeredDockerfileContent, scriptContent, includeTLS, drainWatcherSource) {
782
1049
  const tempDir = fs.mkdtempSync(path2.join(os.tmpdir(), LAYERED_BUILD_DIR_PREFIX));
783
1050
  try {
784
1051
  const layeredDockerfilePath = path2.join(tempDir, LAYERED_DOCKERFILE_NAME);
@@ -802,6 +1069,11 @@ async function setupLayeredBuildDirectory(environmentConfig, layeredDockerfileCo
802
1069
  }
803
1070
  fs.copyFileSync(kmsClientSource, kmsClientPath);
804
1071
  fs.chmodSync(kmsClientPath, 493);
1072
+ if (drainWatcherSource && fs.existsSync(drainWatcherSource)) {
1073
+ const drainWatcherPath = path2.join(tempDir, DRAIN_WATCHER_BINARY_NAME);
1074
+ fs.copyFileSync(drainWatcherSource, drainWatcherPath);
1075
+ fs.chmodSync(drainWatcherPath, 493);
1076
+ }
805
1077
  if (includeTLS) {
806
1078
  const tlsKeygenPath = path2.join(tempDir, TLS_KEYGEN_BINARY_NAME);
807
1079
  const tlsKeygenSource = findBinary("tls-keygen-linux-amd64");
@@ -4693,7 +4965,7 @@ var CanViewAppLogsPermission = "0x2fd3f2fe";
4693
4965
  var CanViewSensitiveAppInfoPermission = "0x0e67b22f";
4694
4966
  var CanUpdateAppProfilePermission = "0x036fef61";
4695
4967
  function getDefaultClientId() {
4696
- const version = true ? "1.0.0-dev.2" : "0.0.0";
4968
+ const version = true ? "1.0.0-devep1" : "0.0.0";
4697
4969
  return `ecloud-sdk/v${version}`;
4698
4970
  }
4699
4971
  var UserApiClient = class {