@layr-labs/ecloud-sdk 1.0.0-dev.2 → 1.0.0-devep1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -152,6 +152,7 @@ var ENV_SOURCE_SCRIPT_NAME = "compute-source-env.sh";
152
152
  var KMS_CLIENT_BINARY_NAME = "kms-client";
153
153
  var KMS_SIGNING_KEY_NAME = "kms-signing-public-key.pem";
154
154
  var TLS_KEYGEN_BINARY_NAME = "tls-keygen";
155
+ var DRAIN_WATCHER_BINARY_NAME = "ecloud-drain-watcher";
155
156
  var CADDYFILE_NAME = "Caddyfile";
156
157
  var LAYERED_BUILD_DIR_PREFIX = "ecloud-layered-build";
157
158
 
@@ -425,7 +426,7 @@ var PushPermissionError = class extends Error {
425
426
  import Handlebars from "handlebars";
426
427
 
427
428
  // src/client/common/templates/Dockerfile.layered.tmpl
428
- var Dockerfile_layered_default = '{{#if includeTLS}}\n# Get Caddy from official image\nFROM caddy:2.10.2-alpine AS caddy\n{{/if}}\n\nFROM {{baseImage}}\n\n{{#if originalUser}}\n# Switch to root to perform setup (base image has non-root USER: {{originalUser}})\nUSER root\n{{/if}}\n\n# Copy core TEE components\nCOPY compute-source-env.sh /usr/local/bin/\nCOPY kms-client /usr/local/bin/\nCOPY kms-signing-public-key.pem /usr/local/bin/\n\n{{#if includeTLS}}\n# Copy Caddy from official image\nCOPY --from=caddy /usr/bin/caddy /usr/local/bin/caddy\n\n# Copy TLS components\nCOPY tls-keygen /usr/local/bin/\nCOPY Caddyfile /etc/caddy/\n{{/if}}\n\n{{#if originalUser}}\n# Make binaries executable (755 for executables, 644 for keys)\nRUN chmod 755 /usr/local/bin/compute-source-env.sh \\\n && chmod 755 /usr/local/bin/kms-client{{#if includeTLS}} \\\n && chmod 755 /usr/local/bin/tls-keygen \\\n && chmod 755 /usr/local/bin/caddy{{/if}} \\\n && chmod 644 /usr/local/bin/kms-signing-public-key.pem\n\n# Store original user - entrypoint will drop privileges to this user after TEE setup\nENV __ECLOUD_ORIGINAL_USER={{originalUser}}\n{{else}}\n# Make binaries executable (preserve existing permissions, just add execute)\nRUN chmod +x /usr/local/bin/compute-source-env.sh \\\n && chmod +x /usr/local/bin/kms-client{{#if includeTLS}} \\\n && chmod +x /usr/local/bin/tls-keygen{{/if}}\n{{/if}}\n\n{{#if logRedirect}}\n\nLABEL tee.launch_policy.log_redirect={{logRedirect}}\n{{/if}}\n{{#if resourceUsageAllow}}\n\nLABEL tee.launch_policy.monitoring_memory_allow={{resourceUsageAllow}}\n{{/if}}\n\nLABEL eigenx_cli_version={{ecloudCLIVersion}}\nLABEL eigenx_vm_image=eigen\n\n{{#if includeTLS}}\n# Expose both HTTP and HTTPS ports for Caddy\nEXPOSE 80 443\n{{/if}}\n\nENTRYPOINT ["/usr/local/bin/compute-source-env.sh"]\nCMD {{{originalCmd}}}\n';
429
+ var Dockerfile_layered_default = '{{#if includeTLS}}\n# Get Caddy from official image\nFROM caddy:2.10.2-alpine AS caddy\n{{/if}}\n\nFROM {{baseImage}}\n\n{{#if originalUser}}\n# Switch to root to perform setup (base image has non-root USER: {{originalUser}})\nUSER root\n{{/if}}\n\n# Copy core TEE components\nCOPY compute-source-env.sh /usr/local/bin/\nCOPY kms-client /usr/local/bin/\nCOPY kms-signing-public-key.pem /usr/local/bin/\n{{#if includeDrainWatcher}}\nCOPY ecloud-drain-watcher /usr/local/bin/\n{{/if}}\n\n{{#if includeTLS}}\n# Copy Caddy from official image\nCOPY --from=caddy /usr/bin/caddy /usr/local/bin/caddy\n\n# Copy TLS components\nCOPY tls-keygen /usr/local/bin/\nCOPY Caddyfile /etc/caddy/\n{{/if}}\n\n{{#if originalUser}}\n# Make binaries executable (755 for executables, 644 for keys)\nRUN chmod 755 /usr/local/bin/compute-source-env.sh \\\n && chmod 755 /usr/local/bin/kms-client{{#if includeDrainWatcher}} \\\n && chmod 755 /usr/local/bin/ecloud-drain-watcher{{/if}}{{#if includeTLS}} \\\n && chmod 755 /usr/local/bin/tls-keygen \\\n && chmod 755 /usr/local/bin/caddy{{/if}} \\\n && chmod 644 /usr/local/bin/kms-signing-public-key.pem\n\n# Store original user - entrypoint will drop privileges to this user after TEE setup\nENV __ECLOUD_ORIGINAL_USER={{originalUser}}\n{{else}}\n# Make binaries executable (preserve existing permissions, just add execute)\nRUN chmod +x /usr/local/bin/compute-source-env.sh \\\n && chmod +x /usr/local/bin/kms-client{{#if includeDrainWatcher}} \\\n && chmod +x /usr/local/bin/ecloud-drain-watcher{{/if}}{{#if includeTLS}} \\\n && chmod +x /usr/local/bin/tls-keygen{{/if}}\n{{/if}}\n\n{{#if logRedirect}}\n\nLABEL tee.launch_policy.log_redirect={{logRedirect}}\n{{/if}}\n{{#if resourceUsageAllow}}\n\nLABEL tee.launch_policy.monitoring_memory_allow={{resourceUsageAllow}}\n{{/if}}\n\n# Allow-list the envvars the ecloud-platform sets via GCE `tee-env-*`\n# metadata. 
Without this label, Confidential Space\'s launcher rejects\n# any `tee-env-*` override at container-start with\n# "env var {...} is not allowed to be overridden on this image" and\n# exits with code 1 \u2014 which terminates the VM before the entrypoint\n# ever runs. ECLOUD_PD_EXPECTED is set on PD-backed apps so the\n# entrypoint (compute-source-env.sh) knows to wait for the persistent\n# disk before exec\'ing the user workload. User-supplied env vars\n# flow through KMS (not tee-env-*) and don\'t need to be listed here.\nLABEL tee.launch_policy.allow_env_override=ECLOUD_PD_EXPECTED\n\nLABEL eigenx_cli_version={{ecloudCLIVersion}}\nLABEL eigenx_vm_image=eigen\nLABEL eigenx_container_contract=v1\n\n{{#if includeTLS}}\n# Expose both HTTP and HTTPS ports for Caddy\nEXPOSE 80 443\n{{/if}}\n\nENTRYPOINT ["/usr/local/bin/compute-source-env.sh"]\nCMD {{{originalCmd}}}\n';
429
430
 
430
431
  // src/client/common/templates/dockerfileTemplate.ts
431
432
  function processDockerfileTemplate(data) {
@@ -438,6 +439,49 @@ import Handlebars2 from "handlebars";
438
439
 
439
440
  // src/client/common/templates/compute-source-env.sh.tmpl
440
441
  var compute_source_env_sh_default = `#!/bin/sh
442
+ # EigenCompute container entrypoint script
443
+ # This script handles KMS secret fetching, TLS setup, and privilege dropping
444
+ # before executing the user's application.
445
+ #
446
+ # Handlebars template variables (replaced at build time by the CLI):
447
+ # kmsServerURL - URL of the KMS server
448
+ # userAPIURL - URL of the user API (ecloud-platform)
449
+ # The KMS signing public key is copied into the image as
450
+ # /usr/local/bin/kms-signing-public-key.pem at layer-build time by the CLI.
451
+ #
452
+ # ecloud-platform divergence from compute-tee:
453
+ # This script emits ECLOUD_READY / ECLOUD_FAIL / ECLOUD_AWAITING_USERDATA /
454
+ # ECLOUD_DETACHED markers to stdout at key lifecycle points. The GCP
455
+ # provisioner's serial-console watcher in ecloud-platform
456
+ # (pkg/services/infraService/providers/gcp/compute.go) parses those
457
+ # markers to gate "VM ready" and to coordinate the prewarm-detach
458
+ # upgrade flow. Without the markers, the platform's waitForStartupReady
459
+ # times out at ~10 minutes per deploy, rollback fires, and the VM is
460
+ # deleted \u2014 seen in dev on 2026-05-04 with an older copy of this
461
+ # template that lacked the markers.
462
+ #
463
+ # Prewarm-detach contract:
464
+ # - If ECLOUD_PD_EXPECTED=1 and /mnt/disks/userdata is not present at boot,
465
+ # emit ECLOUD_AWAITING_USERDATA and wait until the disk is attached.
466
+ # - On SIGTERM (drain-requested), forward to child, wait for exit, sync
467
+ # + unmount /mnt/disks/userdata, emit ECLOUD_DETACHED, exit.
468
+ # - ECLOUD_READY is emitted once runtime is bootstrapped (same as before).
469
+ # - ECLOUD_FAIL is emitted on any unrecoverable setup error.
470
+ # Keep the markers on any line that resolves a lifecycle outcome.
471
+ #
472
+ # This file is kept in lockstep with
473
+ # ecloud-platform/pkg/services/buildService/assets/compute-source-env.sh.tmpl
474
+ # \u2014 if you change one, change the other. Differences vs the platform copy
475
+ # are intentionally minimal:
476
+ # - Handlebars placeholders use the CLI's naming (kmsServerURL,
477
+ # userAPIURL) rather than the platform's (KMS_SERVER_URL,
478
+ # USER_API_URL). (See top of file for real placeholder syntax \u2014
479
+ # not repeated here so Handlebars doesn't expand it in this comment.)
480
+ # - KMS signing key is read from a file the CLI copies into the image,
481
+ # not heredoc-embedded in the script, because the CLI's image
482
+ # layering writes it as a separate file (kms-signing-public-key.pem).
483
+ # - TLS binary is \`tls-keygen\` (CLI-bundled) not \`tls-client\`.
484
+
441
485
  echo "compute-source-env.sh: Running setup script..."
442
486
 
443
487
  # Fetch and source environment variables from KMS
@@ -453,92 +497,93 @@ if /usr/local/bin/kms-client \\
453
497
  else
454
498
  echo "compute-source-env.sh: ERROR - Failed to fetch environment variables from KMS"
455
499
  echo "compute-source-env.sh: Exiting - cannot start user workload without KMS secrets"
500
+ echo "ECLOUD_FAIL kms_bootstrap"
456
501
  exit 1
457
502
  fi
458
503
 
459
- # Setup TLS if tls-keygen is present (which means TLS was configured at build time)
504
+ # Setup TLS if tls-keygen is present and DOMAIN is configured
460
505
  setup_tls() {
461
506
  # If tls-keygen isn't present, TLS wasn't configured during build
462
507
  if [ ! -x /usr/local/bin/tls-keygen ]; then
463
508
  echo "compute-source-env.sh: TLS not configured (no tls-keygen binary)"
464
509
  return 0
465
510
  fi
466
-
511
+
467
512
  local domain="\${DOMAIN:-}"
468
513
  local mnemonic="\${MNEMONIC:-}"
469
-
470
- # Since tls-keygen is present, TLS is expected - validate requirements
514
+
515
+ # If DOMAIN is not set or is localhost, skip TLS setup
471
516
  if [ -z "$domain" ] || [ "$domain" = "localhost" ]; then
472
- echo "compute-source-env.sh: ERROR - TLS binary present but DOMAIN not configured or is localhost"
473
- echo "compute-source-env.sh: Set DOMAIN environment variable to a valid domain"
474
- exit 1
517
+ echo "compute-source-env.sh: TLS skipped (DOMAIN not set or is localhost)"
518
+ return 0
475
519
  fi
476
-
520
+
477
521
  if [ -z "$mnemonic" ]; then
478
- echo "compute-source-env.sh: ERROR - TLS binary present but MNEMONIC not available"
522
+ echo "compute-source-env.sh: ERROR - TLS requested but MNEMONIC not available"
479
523
  echo "compute-source-env.sh: Cannot obtain TLS certificate without mnemonic"
524
+ echo "ECLOUD_FAIL tls_mnemonic_missing"
480
525
  exit 1
481
526
  fi
482
-
527
+
483
528
  if [ ! -x /usr/local/bin/caddy ]; then
484
- echo "compute-source-env.sh: ERROR - TLS binary present but Caddy not found"
529
+ echo "compute-source-env.sh: ERROR - TLS requested but Caddy not found"
530
+ echo "ECLOUD_FAIL tls_caddy_missing"
485
531
  exit 1
486
532
  fi
487
-
533
+
488
534
  echo "compute-source-env.sh: Setting up TLS for domain: $domain"
489
-
535
+
490
536
  # Obtain TLS certificate using ACME
491
- # Default to http-01, but allow override via ACME_CHALLENGE env var
492
537
  local challenge="\${ACME_CHALLENGE:-http-01}"
493
-
538
+
494
539
  # Check if we should use staging (for testing)
495
540
  local staging_flag=""
496
541
  if [ "\${ACME_STAGING:-false}" = "true" ]; then
497
542
  staging_flag="-staging"
498
- echo "compute-source-env.sh: Using Let's Encrypt STAGING environment (certificates won't be trusted)"
543
+ echo "compute-source-env.sh: Using Let's Encrypt STAGING environment"
499
544
  fi
500
-
545
+
501
546
  echo "compute-source-env.sh: Obtaining TLS certificate using $challenge challenge..."
502
- # Pass the API URL for certificate persistence
503
547
  if ! MNEMONIC="$mnemonic" DOMAIN="$domain" API_URL="{{userAPIURL}}" /usr/local/bin/tls-keygen \\
504
548
  -challenge "$challenge" \\
505
549
  $staging_flag; then
506
550
  echo "compute-source-env.sh: ERROR - Failed to obtain TLS certificate"
507
- echo "compute-source-env.sh: Certificate issuance failed for $domain"
551
+ echo "ECLOUD_FAIL tls_setup"
508
552
  exit 1
509
553
  fi
510
-
554
+
511
555
  echo "compute-source-env.sh: TLS certificate obtained successfully"
512
-
556
+
513
557
  # Validate Caddyfile before starting
514
- if ! /usr/local/bin/caddy validate --config /etc/caddy/Caddyfile --adapter caddyfile 2>/dev/null; then
515
- echo "compute-source-env.sh: ERROR - Invalid Caddyfile"
516
- echo "compute-source-env.sh: TLS was requested (DOMAIN=$domain) but setup failed"
517
- exit 1
518
- fi
519
-
520
- # Start Caddy in background
521
- echo "compute-source-env.sh: Starting Caddy reverse proxy..."
522
-
523
- # Check if Caddy logs should be enabled
524
- if [ "\${ENABLE_CADDY_LOGS:-false}" = "true" ]; then
525
- if ! /usr/local/bin/caddy start --config /etc/caddy/Caddyfile --adapter caddyfile 2>&1; then
526
- echo "compute-source-env.sh: ERROR - Failed to start Caddy"
527
- echo "compute-source-env.sh: TLS was requested (DOMAIN=$domain) but setup failed"
558
+ if [ -f /etc/caddy/Caddyfile ]; then
559
+ if ! /usr/local/bin/caddy validate --config /etc/caddy/Caddyfile --adapter caddyfile 2>/dev/null; then
560
+ echo "compute-source-env.sh: ERROR - Invalid Caddyfile"
561
+ echo "ECLOUD_FAIL tls_invalid_caddyfile"
528
562
  exit 1
529
563
  fi
530
- else
531
- # Redirect Caddy output to /dev/null to silence logs
532
- if ! /usr/local/bin/caddy start --config /etc/caddy/Caddyfile --adapter caddyfile >/dev/null 2>&1; then
533
- echo "compute-source-env.sh: ERROR - Failed to start Caddy"
534
- echo "compute-source-env.sh: TLS was requested (DOMAIN=$domain) but setup failed"
535
- exit 1
564
+
565
+ # Start Caddy in background
566
+ echo "compute-source-env.sh: Starting Caddy reverse proxy..."
567
+ if [ "\${ENABLE_CADDY_LOGS:-false}" = "true" ]; then
568
+ if ! /usr/local/bin/caddy start --config /etc/caddy/Caddyfile --adapter caddyfile 2>&1; then
569
+ echo "compute-source-env.sh: ERROR - Failed to start Caddy"
570
+ echo "ECLOUD_FAIL tls_caddy_start"
571
+ exit 1
572
+ fi
573
+ else
574
+ if ! /usr/local/bin/caddy start --config /etc/caddy/Caddyfile --adapter caddyfile >/dev/null 2>&1; then
575
+ echo "compute-source-env.sh: ERROR - Failed to start Caddy"
576
+ echo "ECLOUD_FAIL tls_caddy_start"
577
+ exit 1
578
+ fi
536
579
  fi
580
+
581
+ sleep 2
582
+ echo "compute-source-env.sh: Caddy started successfully"
583
+ else
584
+ echo "compute-source-env.sh: No Caddyfile found, skipping Caddy"
537
585
  fi
538
-
539
- # Give Caddy a moment to fully initialize
540
- sleep 2
541
- echo "compute-source-env.sh: Caddy started successfully"
586
+
542
587
  return 0
543
588
  }
544
589
 
@@ -549,15 +594,233 @@ setup_tls
549
594
  export KMS_SERVER_URL="{{kmsServerURL}}"
550
595
  export KMS_PUBLIC_KEY="$(cat /usr/local/bin/kms-signing-public-key.pem)"
551
596
 
597
+ # \u2500\u2500 Prewarm-detach: wait for PD if expected \u2500\u2500
598
+ # Orchestrator sets ECLOUD_PD_EXPECTED=1 on apps using StorageBackend=pd.
599
+ # When the prewarm path is used, the new VM boots WITHOUT the disk; we
600
+ # signal awaiting-userdata and poll until the disk is attached.
601
+ USERDATA_MOUNT="/mnt/disks/userdata"
602
+ USERDATA_DEV="/dev/disk/by-id/google-persistent_storage_1"
603
+
604
+ wait_for_userdata() {
605
+ if [ "\${ECLOUD_PD_EXPECTED:-0}" != "1" ]; then
606
+ return 0
607
+ fi
608
+ if mountpoint -q "$USERDATA_MOUNT" 2>/dev/null; then
609
+ echo "compute-source-env.sh: userdata already mounted at $USERDATA_MOUNT"
610
+ return 0
611
+ fi
612
+ # Refuse to proceed if the tools we need for safe first-attach
613
+ # detection are missing. Without blkid we cannot tell an empty new
614
+ # disk from an already-formatted one \u2014 running mkfs.ext4 on the
615
+ # latter would destroy data.
616
+ if ! command -v blkid >/dev/null 2>&1; then
617
+ echo "ECLOUD_FAIL pd_tools_missing"
618
+ exit 1
619
+ fi
620
+ echo "ECLOUD_AWAITING_USERDATA"
621
+ echo "compute-source-env.sh: waiting for PD at $USERDATA_DEV..."
622
+ # Poll for up to 10 minutes (120 * 5s). The orchestrator's overall
623
+ # attach timeout is shorter; the ceiling here just bounds the wait
624
+ # for manual / diagnostic scenarios.
625
+ local i=0
626
+ local mount_failures=0
627
+ while [ "$i" -lt 120 ]; do
628
+ if [ -e "$USERDATA_DEV" ]; then
629
+ mkdir -p "$USERDATA_MOUNT"
630
+ if mount -o noatime "$USERDATA_DEV" "$USERDATA_MOUNT" 2>/dev/null; then
631
+ echo "compute-source-env.sh: PD mounted at $USERDATA_MOUNT"
632
+ return 0
633
+ fi
634
+ # Disk present but mount failed. Check whether it has a
635
+ # recognized filesystem. \`blkid -s TYPE -o value\` prints the
636
+ # FS type (empty if none). We mkfs only when that output is
637
+ # empty. NOTE(review): blkid's exit status is not checked, and
638
+ # a failed probe (e.g. device busy) also prints nothing, so an
639
+ # empty result does not by itself prove the disk is unformatted.
640
+ local fstype
641
+ fstype=$(blkid -s TYPE -o value "$USERDATA_DEV" 2>/dev/null)
642
+ if [ -z "$fstype" ]; then
643
+ echo "compute-source-env.sh: formatting $USERDATA_DEV (first attach)"
644
+ mkfs.ext4 -F -L eclouddata "$USERDATA_DEV" >/dev/null 2>&1 || {
645
+ echo "ECLOUD_FAIL pd_mkfs_failed"
646
+ exit 1
647
+ }
648
+ mount -o noatime "$USERDATA_DEV" "$USERDATA_MOUNT" || {
649
+ echo "ECLOUD_FAIL pd_mount_after_format_failed"
650
+ exit 1
651
+ }
652
+ return 0
653
+ fi
654
+ # Disk has a filesystem but mount still failed. Give it a
655
+ # few retries to cover transient cases (device busy, udev
656
+ # still settling), but don't pretend this is an attach
657
+ # timeout if it persists.
658
+ mount_failures=$((mount_failures + 1))
659
+ if [ "$mount_failures" -ge 6 ]; then
660
+ echo "ECLOUD_FAIL pd_mount_failed"
661
+ exit 1
662
+ fi
663
+ else
664
+ # Device disappeared (e.g. udev re-enumeration between
665
+ # attach and mount). Reset the consecutive-failure counter
666
+ # so only true back-to-back mount failures trip
667
+ # pd_mount_failed; a device blip should not steal retries.
668
+ mount_failures=0
669
+ fi
670
+ i=$((i + 1))
671
+ sleep 5
672
+ done
673
+ echo "ECLOUD_FAIL pd_attach_timeout"
674
+ exit 1
675
+ }
676
+
677
+ wait_for_userdata
678
+
679
+ # \u2500\u2500 Prewarm-detach: install SIGTERM handler for graceful drain \u2500\u2500
680
+ # Orchestrator signals drain by setting the instance metadata key
681
+ # ECLOUD_DRAIN_REQUESTED=1, which the in-container drain watcher
682
+ # (started below) turns into SIGTERM on PID 1. On SIGTERM we:
683
+ # 1. Forward to the child (wakes the user's app for graceful exit)
684
+ # 2. Wait for child exit
685
+ # 3. Sync + unmount the PD
686
+ # 4. Emit ECLOUD_DETACHED so the orchestrator can proceed to detach
687
+ CHILD_PID=""
688
+ _DRAIN_IN_PROGRESS=0
689
+
690
+ drain_handler() {
691
+ # Guard against re-entry if SIGTERM arrives twice (e.g. both the
692
+ # drain_watcher and an external signal fire in quick succession).
693
+ if [ "$_DRAIN_IN_PROGRESS" = "1" ]; then
694
+ return 0
695
+ fi
696
+ _DRAIN_IN_PROGRESS=1
697
+ echo "compute-source-env.sh: received drain signal, forwarding to child pgid=$CHILD_PID"
698
+ if [ -n "$CHILD_PID" ]; then
699
+ # Try the process group first so intermediate wrappers (su, sh -c,
700
+ # etc.) don't swallow the signal. The leading \`-\` targets a pgid;
701
+ # that pgid equals the child's pid only when the shell has job
702
+ # control enabled (set -m). Without it the group kill fails and we
703
+ # fall back to signaling the direct child pid alone.
704
+ kill -TERM -"$CHILD_PID" 2>/dev/null || kill -TERM "$CHILD_PID" 2>/dev/null || true
705
+ # Give the app up to 30s to exit cleanly.
706
+ local i=0
707
+ while [ "$i" -lt 30 ] && kill -0 "$CHILD_PID" 2>/dev/null; do
708
+ i=$((i + 1))
709
+ sleep 1
710
+ done
711
+ if kill -0 "$CHILD_PID" 2>/dev/null; then
712
+ echo "compute-source-env.sh: child did not exit in 30s, sending SIGKILL"
713
+ kill -KILL -"$CHILD_PID" 2>/dev/null || kill -KILL "$CHILD_PID" 2>/dev/null || true
714
+ # Reap the process so its in-flight I/O is flushed to the
715
+ # filesystem before we sync + unmount. SIGKILL schedules
716
+ # death; wait guarantees it's complete.
717
+ wait "$CHILD_PID" 2>/dev/null || true
718
+ fi
719
+ fi
720
+ if [ "\${ECLOUD_PD_EXPECTED:-0}" = "1" ] && mountpoint -q "$USERDATA_MOUNT" 2>/dev/null; then
721
+ sync
722
+ if umount "$USERDATA_MOUNT" 2>/dev/null; then
723
+ echo "compute-source-env.sh: unmounted $USERDATA_MOUNT cleanly"
724
+ else
725
+ # Force lazy unmount as last resort \u2014 orchestrator still needs
726
+ # the DETACHED signal to proceed.
727
+ umount -l "$USERDATA_MOUNT" 2>/dev/null || true
728
+ echo "compute-source-env.sh: WARNING - used lazy unmount on $USERDATA_MOUNT"
729
+ fi
730
+ # ECLOUD_DETACHED is strictly a PD-lifecycle signal. Only emit
731
+ # it when we actually had a PD mount in play, so serial-log
732
+ # parsers and alerting for non-PD apps don't see spurious
733
+ # lifecycle markers on routine container SIGTERM.
734
+ echo "ECLOUD_DETACHED"
735
+ fi
736
+ # Always exit 0: drain is a managed shutdown and the orchestrator
737
+ # waits on ECLOUD_DETACHED, not the container exit code. Forwarding
738
+ # the child's exit status here would make a crash-during-drain look
739
+ # like a drain failure to whatever reads the container exit code.
740
+ exit 0
741
+ }
742
+ trap drain_handler TERM
743
+
744
+ # \u2500\u2500 Prewarm-detach: background drain watcher \u2500\u2500
745
+ # Container metadata delivery in Confidential Space is limited, so we
746
+ # poll the instance metadata server for ECLOUD_DRAIN_REQUESTED and
747
+ # raise SIGTERM on ourselves when it flips to "1".
748
+ #
749
+ # Try wget first (present in most Alpine bases), fall back to curl.
750
+ # If neither is present, drain watcher is disabled \u2014 the orchestrator
751
+ # will hit its drain timeout and fail the upgrade explicitly, which is
752
+ # the correct behavior (we cannot silently ignore a drain request).
753
+ _fetch_drain_flag() {
754
+ local url="http://metadata.google.internal/computeMetadata/v1/instance/attributes/ECLOUD_DRAIN_REQUESTED"
755
+ if command -v wget >/dev/null 2>&1; then
756
+ wget -q --tries=1 --timeout=2 --header='Metadata-Flavor: Google' -O - "$url" 2>/dev/null
757
+ elif command -v curl >/dev/null 2>&1; then
758
+ curl -sf --max-time 2 -H 'Metadata-Flavor: Google' "$url" 2>/dev/null
759
+ else
760
+ return 2
761
+ fi
762
+ }
763
+
764
+ drain_watcher() {
765
+ # Preflight: confirm we have an HTTP client
766
+ if ! _fetch_drain_flag >/dev/null 2>&1; then
767
+ # Either no http client available OR metadata server not
768
+ # responding yet. If no client, give up and log; otherwise the
769
+ # loop below will retry.
770
+ if ! command -v wget >/dev/null 2>&1 && ! command -v curl >/dev/null 2>&1; then
771
+ echo "compute-source-env.sh: WARNING - no wget/curl; drain_watcher disabled"
772
+ return 0
773
+ fi
774
+ fi
775
+ while true; do
776
+ local v
777
+ v=$(_fetch_drain_flag || true)
778
+ if [ "$v" = "1" ]; then
779
+ echo "compute-source-env.sh: drain_watcher saw ECLOUD_DRAIN_REQUESTED=1, signaling PID 1"
780
+ # The CS launcher runs this script directly as PID 1, so
781
+ # kill -TERM 1 delivers SIGTERM to the shell that installed
782
+ # the drain_handler trap. If the launch mechanism ever
783
+ # wraps this script in another process, this assumption
784
+ # breaks and drain will silently no-op \u2014 audit here.
785
+ kill -TERM 1 2>/dev/null || true
786
+ return 0
787
+ fi
788
+ sleep 2
789
+ done
790
+ }
791
+
792
+ if [ "\${ECLOUD_PD_EXPECTED:-0}" = "1" ]; then
793
+ # Assumption: the orchestrator only flips ECLOUD_DRAIN_REQUESTED=1
794
+ # after observing ECLOUD_AWAITING_USERDATA (old VM) or
795
+ # ECLOUD_READY (new VM), so CHILD_PID is always set by the time
796
+ # drain_handler fires. If drain somehow arrived in the tiny window
797
+ # between this watcher spawn and CHILD_PID assignment below,
798
+ # drain_handler would skip the child-kill branch and still emit
799
+ # ECLOUD_DETACHED \u2014 harmless because there's nothing to drain yet.
800
+ if [ -x /usr/local/bin/ecloud-drain-watcher ]; then
801
+ /usr/local/bin/ecloud-drain-watcher &
802
+ else
803
+ drain_watcher &
804
+ fi
805
+ fi
806
+
552
807
  echo "compute-source-env.sh: Environment sourced."
808
+ echo "ECLOUD_READY runtime_bootstrapped"
553
809
 
554
810
  # Drop privileges to original user for the application command
555
811
  if [ -n "$__ECLOUD_ORIGINAL_USER" ] && [ "$(id -u)" = "0" ]; then
556
812
  echo "compute-source-env.sh: Dropping privileges to user: $__ECLOUD_ORIGINAL_USER"
557
- exec su -s /bin/sh "$__ECLOUD_ORIGINAL_USER" -c 'exec "$@"' -- sh "$@"
813
+ # Must background the child so our trap can fire; exec replaces PID 1.
814
+ su -s /bin/sh "$__ECLOUD_ORIGINAL_USER" -c 'exec "$@"' -- sh "$@" &
815
+ CHILD_PID=$!
816
+ wait "$CHILD_PID"
817
+ exit $?
558
818
  fi
559
819
 
560
- exec "$@"
820
+ "$@" &
821
+ CHILD_PID=$!
822
+ wait "$CHILD_PID"
823
+ exit $?
561
824
  `;
562
825
 
563
826
  // src/client/common/templates/scriptTemplate.ts
@@ -749,6 +1012,8 @@ async function layerLocalImage(options, logger) {
749
1012
  logger.debug(`Found DOMAIN=${domainMatch[1]} in ${envFilePath}, including TLS components`);
750
1013
  }
751
1014
  }
1015
+ const drainWatcherSource = findBinary("ecloud-drain-watcher-linux-amd64");
1016
+ const includeDrainWatcher = fs.existsSync(drainWatcherSource);
752
1017
  const layeredDockerfileContent = processDockerfileTemplate({
753
1018
  baseImage: sourceImageRef,
754
1019
  originalCmd: JSON.stringify(originalCmd),
@@ -756,8 +1021,9 @@ async function layerLocalImage(options, logger) {
756
1021
  logRedirect,
757
1022
  resourceUsageAllow,
758
1023
  includeTLS,
759
- ecloudCLIVersion: "0.1.0"
1024
+ ecloudCLIVersion: "0.1.0",
760
1025
  // TODO: Get from package.json
1026
+ includeDrainWatcher
761
1027
  });
762
1028
  const scriptContent = processScriptTemplate({
763
1029
  kmsServerURL: environmentConfig.kmsServerURL,
@@ -767,7 +1033,8 @@ async function layerLocalImage(options, logger) {
767
1033
  environmentConfig,
768
1034
  layeredDockerfileContent,
769
1035
  scriptContent,
770
- includeTLS
1036
+ includeTLS,
1037
+ includeDrainWatcher ? drainWatcherSource : void 0
771
1038
  // logger
772
1039
  );
773
1040
  try {
@@ -782,7 +1049,7 @@ async function layerLocalImage(options, logger) {
782
1049
  fs.rmSync(tempDir, { recursive: true, force: true });
783
1050
  }
784
1051
  }
785
- async function setupLayeredBuildDirectory(environmentConfig, layeredDockerfileContent, scriptContent, includeTLS) {
1052
+ async function setupLayeredBuildDirectory(environmentConfig, layeredDockerfileContent, scriptContent, includeTLS, drainWatcherSource) {
786
1053
  const tempDir = fs.mkdtempSync(path2.join(os.tmpdir(), LAYERED_BUILD_DIR_PREFIX));
787
1054
  try {
788
1055
  const layeredDockerfilePath = path2.join(tempDir, LAYERED_DOCKERFILE_NAME);
@@ -806,6 +1073,11 @@ async function setupLayeredBuildDirectory(environmentConfig, layeredDockerfileCo
806
1073
  }
807
1074
  fs.copyFileSync(kmsClientSource, kmsClientPath);
808
1075
  fs.chmodSync(kmsClientPath, 493);
1076
+ if (drainWatcherSource && fs.existsSync(drainWatcherSource)) {
1077
+ const drainWatcherPath = path2.join(tempDir, DRAIN_WATCHER_BINARY_NAME);
1078
+ fs.copyFileSync(drainWatcherSource, drainWatcherPath);
1079
+ fs.chmodSync(drainWatcherPath, 493);
1080
+ }
809
1081
  if (includeTLS) {
810
1082
  const tlsKeygenPath = path2.join(tempDir, TLS_KEYGEN_BINARY_NAME);
811
1083
  const tlsKeygenSource = findBinary("tls-keygen-linux-amd64");
@@ -4837,7 +5109,7 @@ var CanViewAppLogsPermission = "0x2fd3f2fe";
4837
5109
  var CanViewSensitiveAppInfoPermission = "0x0e67b22f";
4838
5110
  var CanUpdateAppProfilePermission = "0x036fef61";
4839
5111
  function getDefaultClientId() {
4840
- const version = true ? "1.0.0-dev.2" : "0.0.0";
5112
+ const version = true ? "1.0.0-devep1" : "0.0.0";
4841
5113
  return `ecloud-sdk/v${version}`;
4842
5114
  }
4843
5115
  var UserApiClient = class {