@layr-labs/ecloud-sdk 1.0.0-dev.2 → 1.0.0-devep1

This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
package/dist/compute.cjs CHANGED
@@ -183,6 +183,7 @@ var ENV_SOURCE_SCRIPT_NAME = "compute-source-env.sh";
183
183
  var KMS_CLIENT_BINARY_NAME = "kms-client";
184
184
  var KMS_SIGNING_KEY_NAME = "kms-signing-public-key.pem";
185
185
  var TLS_KEYGEN_BINARY_NAME = "tls-keygen";
186
+ var DRAIN_WATCHER_BINARY_NAME = "ecloud-drain-watcher";
186
187
  var CADDYFILE_NAME = "Caddyfile";
187
188
  var LAYERED_BUILD_DIR_PREFIX = "ecloud-layered-build";
188
189
 
@@ -456,7 +457,7 @@ var PushPermissionError = class extends Error {
456
457
  var import_handlebars = __toESM(require("handlebars"), 1);
457
458
 
458
459
  // src/client/common/templates/Dockerfile.layered.tmpl
459
- var Dockerfile_layered_default = '{{#if includeTLS}}\n# Get Caddy from official image\nFROM caddy:2.10.2-alpine AS caddy\n{{/if}}\n\nFROM {{baseImage}}\n\n{{#if originalUser}}\n# Switch to root to perform setup (base image has non-root USER: {{originalUser}})\nUSER root\n{{/if}}\n\n# Copy core TEE components\nCOPY compute-source-env.sh /usr/local/bin/\nCOPY kms-client /usr/local/bin/\nCOPY kms-signing-public-key.pem /usr/local/bin/\n\n{{#if includeTLS}}\n# Copy Caddy from official image\nCOPY --from=caddy /usr/bin/caddy /usr/local/bin/caddy\n\n# Copy TLS components\nCOPY tls-keygen /usr/local/bin/\nCOPY Caddyfile /etc/caddy/\n{{/if}}\n\n{{#if originalUser}}\n# Make binaries executable (755 for executables, 644 for keys)\nRUN chmod 755 /usr/local/bin/compute-source-env.sh \\\n && chmod 755 /usr/local/bin/kms-client{{#if includeTLS}} \\\n && chmod 755 /usr/local/bin/tls-keygen \\\n && chmod 755 /usr/local/bin/caddy{{/if}} \\\n && chmod 644 /usr/local/bin/kms-signing-public-key.pem\n\n# Store original user - entrypoint will drop privileges to this user after TEE setup\nENV __ECLOUD_ORIGINAL_USER={{originalUser}}\n{{else}}\n# Make binaries executable (preserve existing permissions, just add execute)\nRUN chmod +x /usr/local/bin/compute-source-env.sh \\\n && chmod +x /usr/local/bin/kms-client{{#if includeTLS}} \\\n && chmod +x /usr/local/bin/tls-keygen{{/if}}\n{{/if}}\n\n{{#if logRedirect}}\n\nLABEL tee.launch_policy.log_redirect={{logRedirect}}\n{{/if}}\n{{#if resourceUsageAllow}}\n\nLABEL tee.launch_policy.monitoring_memory_allow={{resourceUsageAllow}}\n{{/if}}\n\nLABEL eigenx_cli_version={{ecloudCLIVersion}}\nLABEL eigenx_vm_image=eigen\n\n{{#if includeTLS}}\n# Expose both HTTP and HTTPS ports for Caddy\nEXPOSE 80 443\n{{/if}}\n\nENTRYPOINT ["/usr/local/bin/compute-source-env.sh"]\nCMD {{{originalCmd}}}\n';
460
+ var Dockerfile_layered_default = '{{#if includeTLS}}\n# Get Caddy from official image\nFROM caddy:2.10.2-alpine AS caddy\n{{/if}}\n\nFROM {{baseImage}}\n\n{{#if originalUser}}\n# Switch to root to perform setup (base image has non-root USER: {{originalUser}})\nUSER root\n{{/if}}\n\n# Copy core TEE components\nCOPY compute-source-env.sh /usr/local/bin/\nCOPY kms-client /usr/local/bin/\nCOPY kms-signing-public-key.pem /usr/local/bin/\n{{#if includeDrainWatcher}}\nCOPY ecloud-drain-watcher /usr/local/bin/\n{{/if}}\n\n{{#if includeTLS}}\n# Copy Caddy from official image\nCOPY --from=caddy /usr/bin/caddy /usr/local/bin/caddy\n\n# Copy TLS components\nCOPY tls-keygen /usr/local/bin/\nCOPY Caddyfile /etc/caddy/\n{{/if}}\n\n{{#if originalUser}}\n# Make binaries executable (755 for executables, 644 for keys)\nRUN chmod 755 /usr/local/bin/compute-source-env.sh \\\n && chmod 755 /usr/local/bin/kms-client{{#if includeDrainWatcher}} \\\n && chmod 755 /usr/local/bin/ecloud-drain-watcher{{/if}}{{#if includeTLS}} \\\n && chmod 755 /usr/local/bin/tls-keygen \\\n && chmod 755 /usr/local/bin/caddy{{/if}} \\\n && chmod 644 /usr/local/bin/kms-signing-public-key.pem\n\n# Store original user - entrypoint will drop privileges to this user after TEE setup\nENV __ECLOUD_ORIGINAL_USER={{originalUser}}\n{{else}}\n# Make binaries executable (preserve existing permissions, just add execute)\nRUN chmod +x /usr/local/bin/compute-source-env.sh \\\n && chmod +x /usr/local/bin/kms-client{{#if includeDrainWatcher}} \\\n && chmod +x /usr/local/bin/ecloud-drain-watcher{{/if}}{{#if includeTLS}} \\\n && chmod +x /usr/local/bin/tls-keygen{{/if}}\n{{/if}}\n\n{{#if logRedirect}}\n\nLABEL tee.launch_policy.log_redirect={{logRedirect}}\n{{/if}}\n{{#if resourceUsageAllow}}\n\nLABEL tee.launch_policy.monitoring_memory_allow={{resourceUsageAllow}}\n{{/if}}\n\n# Allow-list the envvars the ecloud-platform sets via GCE `tee-env-*`\n# metadata. 
Without this label, Confidential Space\'s launcher rejects\n# any `tee-env-*` override at container-start with\n# "env var {...} is not allowed to be overridden on this image" and\n# exits with code 1 \u2014 which terminates the VM before the entrypoint\n# ever runs. ECLOUD_PD_EXPECTED is set on PD-backed apps so the\n# entrypoint (compute-source-env.sh) knows to wait for the persistent\n# disk before exec\'ing the user workload. User-supplied env vars\n# flow through KMS (not tee-env-*) and don\'t need to be listed here.\nLABEL tee.launch_policy.allow_env_override=ECLOUD_PD_EXPECTED\n\nLABEL eigenx_cli_version={{ecloudCLIVersion}}\nLABEL eigenx_vm_image=eigen\nLABEL eigenx_container_contract=v1\n\n{{#if includeTLS}}\n# Expose both HTTP and HTTPS ports for Caddy\nEXPOSE 80 443\n{{/if}}\n\nENTRYPOINT ["/usr/local/bin/compute-source-env.sh"]\nCMD {{{originalCmd}}}\n';
460
461
 
461
462
  // src/client/common/templates/dockerfileTemplate.ts
462
463
  function processDockerfileTemplate(data) {
@@ -469,6 +470,49 @@ var import_handlebars2 = __toESM(require("handlebars"), 1);
469
470
 
470
471
  // src/client/common/templates/compute-source-env.sh.tmpl
471
472
  var compute_source_env_sh_default = `#!/bin/sh
473
+ # EigenCompute container entrypoint script
474
+ # This script handles KMS secret fetching, TLS setup, and privilege dropping
475
+ # before executing the user's application.
476
+ #
477
+ # Handlebars template variables (replaced at build time by the CLI):
478
+ # kmsServerURL - URL of the KMS server
479
+ # userAPIURL - URL of the user API (ecloud-platform)
480
+ # The KMS signing public key is copied into the image as
481
+ # /usr/local/bin/kms-signing-public-key.pem at layer-build time by the CLI.
482
+ #
483
+ # ecloud-platform divergence from compute-tee:
484
+ # This script emits ECLOUD_READY / ECLOUD_FAIL / ECLOUD_AWAITING_USERDATA /
485
+ # ECLOUD_DETACHED markers to stdout at key lifecycle points. The GCP
486
+ # provisioner's serial-console watcher in ecloud-platform
487
+ # (pkg/services/infraService/providers/gcp/compute.go) parses those
488
+ # markers to gate "VM ready" and to coordinate the prewarm-detach
489
+ # upgrade flow. Without the markers, the platform's waitForStartupReady
490
+ # times out at ~10 minutes per deploy, rollback fires, and the VM is
491
+ # deleted \u2014 seen in dev on 2026-05-04 with an older copy of this
492
+ # template that lacked the markers.
493
+ #
494
+ # Prewarm-detach contract:
495
+ # - If ECLOUD_PD_EXPECTED=1 and /mnt/disks/userdata is not present at boot,
496
+ # emit ECLOUD_AWAITING_USERDATA and wait until the disk is attached.
497
+ # - On SIGTERM (drain-requested), forward to child, wait for exit, sync
498
+ # + unmount /mnt/disks/userdata, emit ECLOUD_DETACHED, exit.
499
+ # - ECLOUD_READY is emitted once runtime is bootstrapped (same as before).
500
+ # - ECLOUD_FAIL is emitted on any unrecoverable setup error.
501
+ # Keep the markers on any line that resolves a lifecycle outcome.
502
+ #
503
+ # This file is kept in lockstep with
504
+ # ecloud-platform/pkg/services/buildService/assets/compute-source-env.sh.tmpl
505
+ # \u2014 if you change one, change the other. Differences vs the platform copy
506
+ # are intentionally minimal:
507
+ # - Handlebars placeholders use the CLI's naming (kmsServerURL,
508
+ # userAPIURL) rather than the platform's (KMS_SERVER_URL,
509
+ # USER_API_URL). (See top of file for real placeholder syntax \u2014
510
+ # not repeated here so Handlebars doesn't expand it in this comment.)
511
+ # - KMS signing key is read from a file the CLI copies into the image,
512
+ # not heredoc-embedded in the script, because the CLI's image
513
+ # layering writes it as a separate file (kms-signing-public-key.pem).
514
+ # - TLS binary is \`tls-keygen\` (CLI-bundled) not \`tls-client\`.
515
+
472
516
  echo "compute-source-env.sh: Running setup script..."
473
517
 
474
518
  # Fetch and source environment variables from KMS
@@ -484,92 +528,93 @@ if /usr/local/bin/kms-client \\
484
528
  else
485
529
  echo "compute-source-env.sh: ERROR - Failed to fetch environment variables from KMS"
486
530
  echo "compute-source-env.sh: Exiting - cannot start user workload without KMS secrets"
531
+ echo "ECLOUD_FAIL kms_bootstrap"
487
532
  exit 1
488
533
  fi
489
534
 
490
- # Setup TLS if tls-keygen is present (which means TLS was configured at build time)
535
+ # Setup TLS if tls-keygen is present and DOMAIN is configured
491
536
  setup_tls() {
492
537
  # If tls-keygen isn't present, TLS wasn't configured during build
493
538
  if [ ! -x /usr/local/bin/tls-keygen ]; then
494
539
  echo "compute-source-env.sh: TLS not configured (no tls-keygen binary)"
495
540
  return 0
496
541
  fi
497
-
542
+
498
543
  local domain="\${DOMAIN:-}"
499
544
  local mnemonic="\${MNEMONIC:-}"
500
-
501
- # Since tls-keygen is present, TLS is expected - validate requirements
545
+
546
+ # If DOMAIN is not set or is localhost, skip TLS setup
502
547
  if [ -z "$domain" ] || [ "$domain" = "localhost" ]; then
503
- echo "compute-source-env.sh: ERROR - TLS binary present but DOMAIN not configured or is localhost"
504
- echo "compute-source-env.sh: Set DOMAIN environment variable to a valid domain"
505
- exit 1
548
+ echo "compute-source-env.sh: TLS skipped (DOMAIN not set or is localhost)"
549
+ return 0
506
550
  fi
507
-
551
+
508
552
  if [ -z "$mnemonic" ]; then
509
- echo "compute-source-env.sh: ERROR - TLS binary present but MNEMONIC not available"
553
+ echo "compute-source-env.sh: ERROR - TLS requested but MNEMONIC not available"
510
554
  echo "compute-source-env.sh: Cannot obtain TLS certificate without mnemonic"
555
+ echo "ECLOUD_FAIL tls_mnemonic_missing"
511
556
  exit 1
512
557
  fi
513
-
558
+
514
559
  if [ ! -x /usr/local/bin/caddy ]; then
515
- echo "compute-source-env.sh: ERROR - TLS binary present but Caddy not found"
560
+ echo "compute-source-env.sh: ERROR - TLS requested but Caddy not found"
561
+ echo "ECLOUD_FAIL tls_caddy_missing"
516
562
  exit 1
517
563
  fi
518
-
564
+
519
565
  echo "compute-source-env.sh: Setting up TLS for domain: $domain"
520
-
566
+
521
567
  # Obtain TLS certificate using ACME
522
- # Default to http-01, but allow override via ACME_CHALLENGE env var
523
568
  local challenge="\${ACME_CHALLENGE:-http-01}"
524
-
569
+
525
570
  # Check if we should use staging (for testing)
526
571
  local staging_flag=""
527
572
  if [ "\${ACME_STAGING:-false}" = "true" ]; then
528
573
  staging_flag="-staging"
529
- echo "compute-source-env.sh: Using Let's Encrypt STAGING environment (certificates won't be trusted)"
574
+ echo "compute-source-env.sh: Using Let's Encrypt STAGING environment"
530
575
  fi
531
-
576
+
532
577
  echo "compute-source-env.sh: Obtaining TLS certificate using $challenge challenge..."
533
- # Pass the API URL for certificate persistence
534
578
  if ! MNEMONIC="$mnemonic" DOMAIN="$domain" API_URL="{{userAPIURL}}" /usr/local/bin/tls-keygen \\
535
579
  -challenge "$challenge" \\
536
580
  $staging_flag; then
537
581
  echo "compute-source-env.sh: ERROR - Failed to obtain TLS certificate"
538
- echo "compute-source-env.sh: Certificate issuance failed for $domain"
582
+ echo "ECLOUD_FAIL tls_setup"
539
583
  exit 1
540
584
  fi
541
-
585
+
542
586
  echo "compute-source-env.sh: TLS certificate obtained successfully"
543
-
587
+
544
588
  # Validate Caddyfile before starting
545
- if ! /usr/local/bin/caddy validate --config /etc/caddy/Caddyfile --adapter caddyfile 2>/dev/null; then
546
- echo "compute-source-env.sh: ERROR - Invalid Caddyfile"
547
- echo "compute-source-env.sh: TLS was requested (DOMAIN=$domain) but setup failed"
548
- exit 1
549
- fi
550
-
551
- # Start Caddy in background
552
- echo "compute-source-env.sh: Starting Caddy reverse proxy..."
553
-
554
- # Check if Caddy logs should be enabled
555
- if [ "\${ENABLE_CADDY_LOGS:-false}" = "true" ]; then
556
- if ! /usr/local/bin/caddy start --config /etc/caddy/Caddyfile --adapter caddyfile 2>&1; then
557
- echo "compute-source-env.sh: ERROR - Failed to start Caddy"
558
- echo "compute-source-env.sh: TLS was requested (DOMAIN=$domain) but setup failed"
589
+ if [ -f /etc/caddy/Caddyfile ]; then
590
+ if ! /usr/local/bin/caddy validate --config /etc/caddy/Caddyfile --adapter caddyfile 2>/dev/null; then
591
+ echo "compute-source-env.sh: ERROR - Invalid Caddyfile"
592
+ echo "ECLOUD_FAIL tls_invalid_caddyfile"
559
593
  exit 1
560
594
  fi
561
- else
562
- # Redirect Caddy output to /dev/null to silence logs
563
- if ! /usr/local/bin/caddy start --config /etc/caddy/Caddyfile --adapter caddyfile >/dev/null 2>&1; then
564
- echo "compute-source-env.sh: ERROR - Failed to start Caddy"
565
- echo "compute-source-env.sh: TLS was requested (DOMAIN=$domain) but setup failed"
566
- exit 1
595
+
596
+ # Start Caddy in background
597
+ echo "compute-source-env.sh: Starting Caddy reverse proxy..."
598
+ if [ "\${ENABLE_CADDY_LOGS:-false}" = "true" ]; then
599
+ if ! /usr/local/bin/caddy start --config /etc/caddy/Caddyfile --adapter caddyfile 2>&1; then
600
+ echo "compute-source-env.sh: ERROR - Failed to start Caddy"
601
+ echo "ECLOUD_FAIL tls_caddy_start"
602
+ exit 1
603
+ fi
604
+ else
605
+ if ! /usr/local/bin/caddy start --config /etc/caddy/Caddyfile --adapter caddyfile >/dev/null 2>&1; then
606
+ echo "compute-source-env.sh: ERROR - Failed to start Caddy"
607
+ echo "ECLOUD_FAIL tls_caddy_start"
608
+ exit 1
609
+ fi
567
610
  fi
611
+
612
+ sleep 2
613
+ echo "compute-source-env.sh: Caddy started successfully"
614
+ else
615
+ echo "compute-source-env.sh: No Caddyfile found, skipping Caddy"
568
616
  fi
569
-
570
- # Give Caddy a moment to fully initialize
571
- sleep 2
572
- echo "compute-source-env.sh: Caddy started successfully"
617
+
573
618
  return 0
574
619
  }
575
620
 
@@ -580,15 +625,233 @@ setup_tls
580
625
  export KMS_SERVER_URL="{{kmsServerURL}}"
581
626
  export KMS_PUBLIC_KEY="$(cat /usr/local/bin/kms-signing-public-key.pem)"
582
627
 
628
+ # \u2500\u2500 Prewarm-detach: wait for PD if expected \u2500\u2500
629
+ # Orchestrator sets ECLOUD_PD_EXPECTED=1 on apps using StorageBackend=pd.
630
+ # When the prewarm path is used, the new VM boots WITHOUT the disk; we
631
+ # signal awaiting-userdata and poll until the disk is attached.
632
+ USERDATA_MOUNT="/mnt/disks/userdata"
633
+ USERDATA_DEV="/dev/disk/by-id/google-persistent_storage_1"
634
+
635
+ wait_for_userdata() {
636
+ if [ "\${ECLOUD_PD_EXPECTED:-0}" != "1" ]; then
637
+ return 0
638
+ fi
639
+ if mountpoint -q "$USERDATA_MOUNT" 2>/dev/null; then
640
+ echo "compute-source-env.sh: userdata already mounted at $USERDATA_MOUNT"
641
+ return 0
642
+ fi
643
+ # Refuse to proceed if the tools we need for safe first-attach
644
+ # detection are missing. Without blkid we cannot tell an empty new
645
+ # disk from an already-formatted one \u2014 running mkfs.ext4 on the
646
+ # latter would destroy data.
647
+ if ! command -v blkid >/dev/null 2>&1; then
648
+ echo "ECLOUD_FAIL pd_tools_missing"
649
+ exit 1
650
+ fi
651
+ echo "ECLOUD_AWAITING_USERDATA"
652
+ echo "compute-source-env.sh: waiting for PD at $USERDATA_DEV..."
653
+ # Poll for up to 10 minutes (120 * 5s). The orchestrator's overall
654
+ # attach timeout is shorter; the ceiling here just bounds the wait
655
+ # for manual / diagnostic scenarios.
656
+ local i=0
657
+ local mount_failures=0
658
+ while [ "$i" -lt 120 ]; do
659
+ if [ -e "$USERDATA_DEV" ]; then
660
+ mkdir -p "$USERDATA_MOUNT"
661
+ if mount -o noatime "$USERDATA_DEV" "$USERDATA_MOUNT" 2>/dev/null; then
662
+ echo "compute-source-env.sh: PD mounted at $USERDATA_MOUNT"
663
+ return 0
664
+ fi
665
+ # Disk present but mount failed. Check whether it has a
666
+ # recognized filesystem. \`blkid -s TYPE -o value\` prints the
667
+ # FS type (empty if none). We only mkfs when there is
668
+ # demonstrably NO filesystem \u2014 never on the basis of blkid
669
+ # returning non-zero alone, which could mean "blkid missing"
670
+ # or "device busy".
671
+ local fstype
672
+ fstype=$(blkid -s TYPE -o value "$USERDATA_DEV" 2>/dev/null)
673
+ if [ -z "$fstype" ]; then
674
+ echo "compute-source-env.sh: formatting $USERDATA_DEV (first attach)"
675
+ mkfs.ext4 -F -L eclouddata "$USERDATA_DEV" >/dev/null 2>&1 || {
676
+ echo "ECLOUD_FAIL pd_mkfs_failed"
677
+ exit 1
678
+ }
679
+ mount -o noatime "$USERDATA_DEV" "$USERDATA_MOUNT" || {
680
+ echo "ECLOUD_FAIL pd_mount_after_format_failed"
681
+ exit 1
682
+ }
683
+ return 0
684
+ fi
685
+ # Disk has a filesystem but mount still failed. Give it a
686
+ # few retries to cover transient cases (device busy, udev
687
+ # still settling), but don't pretend this is an attach
688
+ # timeout if it persists.
689
+ mount_failures=$((mount_failures + 1))
690
+ if [ "$mount_failures" -ge 6 ]; then
691
+ echo "ECLOUD_FAIL pd_mount_failed"
692
+ exit 1
693
+ fi
694
+ else
695
+ # Device disappeared (e.g. udev re-enumeration between
696
+ # attach and mount). Reset the consecutive-failure counter
697
+ # so only true back-to-back mount failures trip
698
+ # pd_mount_failed; a device blip should not steal retries.
699
+ mount_failures=0
700
+ fi
701
+ i=$((i + 1))
702
+ sleep 5
703
+ done
704
+ echo "ECLOUD_FAIL pd_attach_timeout"
705
+ exit 1
706
+ }
707
+
708
+ wait_for_userdata
709
+
710
+ # \u2500\u2500 Prewarm-detach: install SIGTERM handler for graceful drain \u2500\u2500
711
+ # Orchestrator signals drain by setting the instance metadata key
712
+ # ECLOUD_DRAIN_REQUESTED=1, which a host-level agent translates into
713
+ # SIGTERM on PID 1. On SIGTERM we:
714
+ # 1. Forward to the child (wakes the user's app for graceful exit)
715
+ # 2. Wait for child exit
716
+ # 3. Sync + unmount the PD
717
+ # 4. Emit ECLOUD_DETACHED so the orchestrator can proceed to detach
718
+ CHILD_PID=""
719
+ _DRAIN_IN_PROGRESS=0
720
+
721
+ drain_handler() {
722
+ # Guard against re-entry if SIGTERM arrives twice (e.g. both the
723
+ # drain_watcher and an external signal fire in quick succession).
724
+ if [ "$_DRAIN_IN_PROGRESS" = "1" ]; then
725
+ return 0
726
+ fi
727
+ _DRAIN_IN_PROGRESS=1
728
+ echo "compute-source-env.sh: received drain signal, forwarding to child pgid=$CHILD_PID"
729
+ if [ -n "$CHILD_PID" ]; then
730
+ # Try the process group first so intermediate wrappers (su, sh -c,
731
+ # etc.) don't swallow the signal. The leading \`-\` targets a pgid,
732
+ # which equals the child's pid only when the child leads its own
733
+ # group (e.g. job control enabled with \`set -m\`). Without job control
734
+ # the child normally shares this shell's group, the group kill fails, and we fall back to the pid alone.
735
+ kill -TERM -"$CHILD_PID" 2>/dev/null || kill -TERM "$CHILD_PID" 2>/dev/null || true
736
+ # Give the app up to 30s to exit cleanly.
737
+ local i=0
738
+ while [ "$i" -lt 30 ] && kill -0 "$CHILD_PID" 2>/dev/null; do
739
+ i=$((i + 1))
740
+ sleep 1
741
+ done
742
+ if kill -0 "$CHILD_PID" 2>/dev/null; then
743
+ echo "compute-source-env.sh: child did not exit in 30s, sending SIGKILL"
744
+ kill -KILL -"$CHILD_PID" 2>/dev/null || kill -KILL "$CHILD_PID" 2>/dev/null || true
745
+ # Reap the process so its in-flight I/O is flushed to the
746
+ # filesystem before we sync + unmount. SIGKILL schedules
747
+ # death; wait guarantees it's complete.
748
+ wait "$CHILD_PID" 2>/dev/null || true
749
+ fi
750
+ fi
751
+ if [ "\${ECLOUD_PD_EXPECTED:-0}" = "1" ] && mountpoint -q "$USERDATA_MOUNT" 2>/dev/null; then
752
+ sync
753
+ if umount "$USERDATA_MOUNT" 2>/dev/null; then
754
+ echo "compute-source-env.sh: unmounted $USERDATA_MOUNT cleanly"
755
+ else
756
+ # Force lazy unmount as last resort \u2014 orchestrator still needs
757
+ # the DETACHED signal to proceed.
758
+ umount -l "$USERDATA_MOUNT" 2>/dev/null || true
759
+ echo "compute-source-env.sh: WARNING - used lazy unmount on $USERDATA_MOUNT"
760
+ fi
761
+ # ECLOUD_DETACHED is strictly a PD-lifecycle signal. Only emit
762
+ # it when we actually had a PD mount in play, so serial-log
763
+ # parsers and alerting for non-PD apps don't see spurious
764
+ # lifecycle markers on routine container SIGTERM.
765
+ echo "ECLOUD_DETACHED"
766
+ fi
767
+ # Always exit 0: drain is a managed shutdown and the orchestrator
768
+ # waits on ECLOUD_DETACHED, not the container exit code. Forwarding
769
+ # the child's exit status here would make a crash-during-drain look
770
+ # like a drain failure to whatever reads the container exit code.
771
+ exit 0
772
+ }
773
+ trap drain_handler TERM
774
+
775
+ # \u2500\u2500 Prewarm-detach: background drain watcher \u2500\u2500
776
+ # Container metadata delivery in Confidential Space is limited, so we
777
+ # poll the instance metadata server for ECLOUD_DRAIN_REQUESTED and
778
+ # raise SIGTERM on ourselves when it flips to "1".
779
+ #
780
+ # Try wget first (present in most Alpine bases), fall back to curl.
781
+ # If neither is present, drain watcher is disabled \u2014 the orchestrator
782
+ # will hit its drain timeout and fail the upgrade explicitly, which is
783
+ # the correct behavior (we cannot silently ignore a drain request).
784
+ _fetch_drain_flag() {
785
+ local url="http://metadata.google.internal/computeMetadata/v1/instance/attributes/ECLOUD_DRAIN_REQUESTED"
786
+ if command -v wget >/dev/null 2>&1; then
787
+ wget -q --tries=1 --timeout=2 --header='Metadata-Flavor: Google' -O - "$url" 2>/dev/null
788
+ elif command -v curl >/dev/null 2>&1; then
789
+ curl -sf --max-time 2 -H 'Metadata-Flavor: Google' "$url" 2>/dev/null
790
+ else
791
+ return 2
792
+ fi
793
+ }
794
+
795
+ drain_watcher() {
796
+ # Preflight: confirm we have an HTTP client
797
+ if ! _fetch_drain_flag >/dev/null 2>&1; then
798
+ # Either no http client available OR metadata server not
799
+ # responding yet. If no client, give up and log; otherwise the
800
+ # loop below will retry.
801
+ if ! command -v wget >/dev/null 2>&1 && ! command -v curl >/dev/null 2>&1; then
802
+ echo "compute-source-env.sh: WARNING - no wget/curl; drain_watcher disabled"
803
+ return 0
804
+ fi
805
+ fi
806
+ while true; do
807
+ local v
808
+ v=$(_fetch_drain_flag || true)
809
+ if [ "$v" = "1" ]; then
810
+ echo "compute-source-env.sh: drain_watcher saw ECLOUD_DRAIN_REQUESTED=1, signaling PID 1"
811
+ # The CS launcher runs this script directly as PID 1, so
812
+ # kill -TERM 1 delivers SIGTERM to the shell that installed
813
+ # the drain_handler trap. If the launch mechanism ever
814
+ # wraps this script in another process, this assumption
815
+ # breaks and drain will silently no-op \u2014 audit here.
816
+ kill -TERM 1 2>/dev/null || true
817
+ return 0
818
+ fi
819
+ sleep 2
820
+ done
821
+ }
822
+
823
+ if [ "\${ECLOUD_PD_EXPECTED:-0}" = "1" ]; then
824
+ # Assumption: the orchestrator only flips ECLOUD_DRAIN_REQUESTED=1
825
+ # after observing ECLOUD_AWAITING_USERDATA (old VM) or
826
+ # ECLOUD_READY (new VM), so CHILD_PID is always set by the time
827
+ # drain_handler fires. If drain somehow arrived in the tiny window
828
+ # between this watcher spawn and CHILD_PID assignment below,
829
+ # drain_handler would skip the child-kill branch and still emit
830
+ # ECLOUD_DETACHED \u2014 harmless because there's nothing to drain yet.
831
+ if [ -x /usr/local/bin/ecloud-drain-watcher ]; then
832
+ /usr/local/bin/ecloud-drain-watcher &
833
+ else
834
+ drain_watcher &
835
+ fi
836
+ fi
837
+
583
838
  echo "compute-source-env.sh: Environment sourced."
839
+ echo "ECLOUD_READY runtime_bootstrapped"
584
840
 
585
841
  # Drop privileges to original user for the application command
586
842
  if [ -n "$__ECLOUD_ORIGINAL_USER" ] && [ "$(id -u)" = "0" ]; then
587
843
  echo "compute-source-env.sh: Dropping privileges to user: $__ECLOUD_ORIGINAL_USER"
588
- exec su -s /bin/sh "$__ECLOUD_ORIGINAL_USER" -c 'exec "$@"' -- sh "$@"
844
+ # Must background the child so our trap can fire; exec replaces PID 1.
845
+ su -s /bin/sh "$__ECLOUD_ORIGINAL_USER" -c 'exec "$@"' -- sh "$@" &
846
+ CHILD_PID=$!
847
+ wait "$CHILD_PID"
848
+ exit $?
589
849
  fi
590
850
 
591
- exec "$@"
851
+ "$@" &
852
+ CHILD_PID=$!
853
+ wait "$CHILD_PID"
854
+ exit $?
592
855
  `;
593
856
 
594
857
  // src/client/common/templates/scriptTemplate.ts
@@ -781,6 +1044,8 @@ async function layerLocalImage(options, logger) {
781
1044
  logger.debug(`Found DOMAIN=${domainMatch[1]} in ${envFilePath}, including TLS components`);
782
1045
  }
783
1046
  }
1047
+ const drainWatcherSource = findBinary("ecloud-drain-watcher-linux-amd64");
1048
+ const includeDrainWatcher = fs.existsSync(drainWatcherSource);
784
1049
  const layeredDockerfileContent = processDockerfileTemplate({
785
1050
  baseImage: sourceImageRef,
786
1051
  originalCmd: JSON.stringify(originalCmd),
@@ -788,8 +1053,9 @@ async function layerLocalImage(options, logger) {
788
1053
  logRedirect,
789
1054
  resourceUsageAllow,
790
1055
  includeTLS,
791
- ecloudCLIVersion: "0.1.0"
1056
+ ecloudCLIVersion: "0.1.0",
792
1057
  // TODO: Get from package.json
1058
+ includeDrainWatcher
793
1059
  });
794
1060
  const scriptContent = processScriptTemplate({
795
1061
  kmsServerURL: environmentConfig.kmsServerURL,
@@ -799,7 +1065,8 @@ async function layerLocalImage(options, logger) {
799
1065
  environmentConfig,
800
1066
  layeredDockerfileContent,
801
1067
  scriptContent,
802
- includeTLS
1068
+ includeTLS,
1069
+ includeDrainWatcher ? drainWatcherSource : void 0
803
1070
  // logger
804
1071
  );
805
1072
  try {
@@ -814,7 +1081,7 @@ async function layerLocalImage(options, logger) {
814
1081
  fs.rmSync(tempDir, { recursive: true, force: true });
815
1082
  }
816
1083
  }
817
- async function setupLayeredBuildDirectory(environmentConfig, layeredDockerfileContent, scriptContent, includeTLS) {
1084
+ async function setupLayeredBuildDirectory(environmentConfig, layeredDockerfileContent, scriptContent, includeTLS, drainWatcherSource) {
818
1085
  const tempDir = fs.mkdtempSync(path2.join(os.tmpdir(), LAYERED_BUILD_DIR_PREFIX));
819
1086
  try {
820
1087
  const layeredDockerfilePath = path2.join(tempDir, LAYERED_DOCKERFILE_NAME);
@@ -838,6 +1105,11 @@ async function setupLayeredBuildDirectory(environmentConfig, layeredDockerfileCo
838
1105
  }
839
1106
  fs.copyFileSync(kmsClientSource, kmsClientPath);
840
1107
  fs.chmodSync(kmsClientPath, 493);
1108
+ if (drainWatcherSource && fs.existsSync(drainWatcherSource)) {
1109
+ const drainWatcherPath = path2.join(tempDir, DRAIN_WATCHER_BINARY_NAME);
1110
+ fs.copyFileSync(drainWatcherSource, drainWatcherPath);
1111
+ fs.chmodSync(drainWatcherPath, 493);
1112
+ }
841
1113
  if (includeTLS) {
842
1114
  const tlsKeygenPath = path2.join(tempDir, TLS_KEYGEN_BINARY_NAME);
843
1115
  const tlsKeygenSource = findBinary("tls-keygen-linux-amd64");
@@ -4729,7 +5001,7 @@ var CanViewAppLogsPermission = "0x2fd3f2fe";
4729
5001
  var CanViewSensitiveAppInfoPermission = "0x0e67b22f";
4730
5002
  var CanUpdateAppProfilePermission = "0x036fef61";
4731
5003
  function getDefaultClientId() {
4732
- const version = true ? "1.0.0-dev.2" : "0.0.0";
5004
+ const version = true ? "1.0.0-devep1" : "0.0.0";
4733
5005
  return `ecloud-sdk/v${version}`;
4734
5006
  }
4735
5007
  var UserApiClient = class {