direxio-deployer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/AGENTS.md +92 -0
  2. package/LICENSE +21 -0
  3. package/README.md +221 -0
  4. package/README_zh.md +218 -0
  5. package/SKILL.md +722 -0
  6. package/agents/README.md +25 -0
  7. package/agents/openai.yaml +12 -0
  8. package/bin/direxio-deployer.mjs +375 -0
  9. package/package.json +28 -0
  10. package/references/agent-targets.md +128 -0
  11. package/references/architecture.md +44 -0
  12. package/references/bug-history.md +78 -0
  13. package/references/deployment-lessons.md +218 -0
  14. package/references/deployment-optimization-audit.md +317 -0
  15. package/references/deployment-workflow.md +341 -0
  16. package/references/iam-policy.json +52 -0
  17. package/references/runtime-wiring.md +209 -0
  18. package/references/state-machine.md +46 -0
  19. package/references/token-refresh.md +81 -0
  20. package/references/tooling.md +106 -0
  21. package/references/troubleshooting.md +26 -0
  22. package/references/user-journey.md +75 -0
  23. package/references/verification-recovery.md +84 -0
  24. package/references/voip-turn-runbook.md +154 -0
  25. package/references/windows-deployment-notes.md +119 -0
  26. package/scripts/aws-credentials.sh +195 -0
  27. package/scripts/cloud-init/Caddyfile +48 -0
  28. package/scripts/cloud-init/docker-compose.yml +125 -0
  29. package/scripts/cloud-init/init-tokens.sh +238 -0
  30. package/scripts/cloud-init/user-data.yaml +40 -0
  31. package/scripts/destroy.ps1 +77 -0
  32. package/scripts/destroy.sh +589 -0
  33. package/scripts/lib/aws.sh +73 -0
  34. package/scripts/lib/domain.sh +175 -0
  35. package/scripts/lib/operation_report.sh +240 -0
  36. package/scripts/lib/ops.sh +230 -0
  37. package/scripts/lib/paths.sh +35 -0
  38. package/scripts/lib/state.sh +137 -0
  39. package/scripts/mcp-tools-list.mjs +95 -0
  40. package/scripts/orchestrate.ps1 +112 -0
  41. package/scripts/orchestrate.sh +1126 -0
  42. package/scripts/phases/s0_prereq_aws.sh +39 -0
  43. package/scripts/phases/s1_preflight.sh +72 -0
  44. package/scripts/phases/s2_domain.sh +103 -0
  45. package/scripts/phases/s3_provision.sh +421 -0
  46. package/scripts/phases/s4_bootstrap_stack.sh +38 -0
  47. package/scripts/phases/s5_init_tokens.sh +118 -0
  48. package/scripts/phases/s6_wire_local.sh +1435 -0
  49. package/scripts/phases/s7_verify_e2e.sh +136 -0
  50. package/scripts/pricing-estimate.sh +256 -0
  51. package/scripts/render/render-userdata.sh +86 -0
  52. package/scripts/reset-app-data.sh +40 -0
  53. package/scripts/update.sh +30 -0
  54. package/tests/aws_credentials_test.sh +139 -0
  55. package/tests/connect_daemon_runtime_check_test.sh +120 -0
  56. package/tests/default_paths_test.sh +58 -0
  57. package/tests/destroy_local_bridge_test.sh +154 -0
  58. package/tests/destroy_root_identity_test.sh +91 -0
  59. package/tests/destroy_route53_zone_test.sh +80 -0
  60. package/tests/domain_authoritative_dns_test.sh +49 -0
  61. package/tests/mcp_doctor_runtime_check_test.sh +86 -0
  62. package/tests/mcp_smoke_runtime_check_test.sh +121 -0
  63. package/tests/mcp_tools_runtime_check_test.sh +123 -0
  64. package/tests/npm_skill_distribution_test.sh +95 -0
  65. package/tests/operation_report_test.sh +258 -0
  66. package/tests/orchestrate_status_recovery_test.sh +91 -0
  67. package/tests/phase_timeout_test.sh +88 -0
  68. package/tests/pricing_estimate_test.sh +159 -0
  69. package/tests/render_userdata_remote_nodes_test.sh +40 -0
  70. package/tests/root_volume_tracking_test.sh +41 -0
  71. package/tests/route53_overwrite_guard_test.sh +86 -0
  72. package/tests/route53_zone_auto_create_test.sh +66 -0
  73. package/tests/runtime_summary_check_test.sh +203 -0
  74. package/tests/s6_wire_local_test.sh +405 -0
  75. package/tests/skill_structure_test.sh +298 -0
  76. package/tests/update_reset_ops_test.sh +230 -0
  77. package/tests/user_confirmation_gates_test.sh +152 -0
@@ -0,0 +1,39 @@
1
+ #!/usr/bin/env bash
2
+ # S0 PREREQ_AWS - validate that AWS credentials are usable.
3
+ #
4
+ # User-side AWS account/IAM/key setup is documented in references/user-journey.md.
5
+ # This phase only validates credentials and reports actionable blockers.
6
+
7
run_phase() {
  # S0 entry point: confirm that the ambient AWS credentials can be resolved
  # to a caller identity, then record the region and a redacted summary.
  #
  # Returns:
  #   0  identity resolved; phase marked done
  #   2  no credentials, or credentials rejected; phase marked waiting_user
  aws_env_prep

  # The identity lookup (aws_identity_arn) is the single source of truth here;
  # it is expected to work for env keys, AWS_PROFILE, instance roles, SSO, etc.
  phase_set S0_PREREQ_AWS in_progress "validating AWS credentials"

  local caller_arn account_id
  caller_arn=$(aws_identity_arn)

  if [ -z "$caller_arn" ] || [ "$caller_arn" = "None" ]; then
    # Tell "nothing configured" apart from "configured but rejected" so the
    # guidance printed to the user matches their situation.
    if [ -z "${AWS_ACCESS_KEY_ID:-}" ] && [ -z "${AWS_PROFILE:-}" ]; then
      phase_set S0_PREREQ_AWS waiting_user "no usable AWS credentials (no env keys or AWS_PROFILE)"
      warn "No usable AWS credentials found. Choose one:"
      warn " 1. Configure a deployment access key, then export AWS_ACCESS_KEY_ID=... AWS_SECRET_ACCESS_KEY=..."
      warn " 2. Or use an existing profile: export AWS_PROFILE=<your-profile>"
      return 2
    fi

    phase_set S0_PREREQ_AWS waiting_user "sts failed (invalid credentials, not active yet, or proxy/TLS issue)"
    warn "AWS credential validation failed (profile=${AWS_PROFILE:-<env/ak>}). Possible causes:"
    warn " 1. AK/SK or profile is incorrect."
    warn " 2. Newly created credentials are not active yet; wait a few minutes."
    warn " 3. Local proxy/network is breaking TLS; AWS proxy bypass is already attempted."
    return 2
  fi

  # Credentials are usable: note whether this is the account root identity.
  local is_root=false
  if aws_arn_is_root "$caller_arn"; then
    is_root=true
  fi
  account_id=$(aws_identity_account)

  # Persist the effective region; an exported AWS_DEFAULT_REGION wins over
  # whatever is already stored in state.
  state_set region "${AWS_DEFAULT_REGION:-$(state_get region)}"
  phase_set S0_PREREQ_AWS done "sts ok account=$account_id profile=${AWS_PROFILE:-<env/ak>} root=$is_root arn=$(aws_redact_arn "$caller_arn") region=${AWS_DEFAULT_REGION:-$(state_get region)}"
  ok "AWS credentials are valid (account=$account_id${AWS_PROFILE:+, profile=$AWS_PROFILE}, root=$is_root, arn=$(aws_redact_arn "$caller_arn"))."
  return 0
}
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env bash
2
+ # S1 PREFLIGHT - default VPC, EC2 vCPU quota, and AMI checks.
3
+ #
4
+ # New AWS accounts often start with low or zero EC2 quota. Report the blocker
5
+ # and keep polling instead of losing deployment state.
6
+
7
run_phase() {
  # S1 entry point: check the three things an EC2 launch needs in this region
  # - a default VPC, enough On-Demand Standard vCPU quota, and an Ubuntu AMI -
  # and record the VPC and AMI ids as resources.
  #
  # Returns:
  #   0  all checks passed; phase marked done
  #   1  quota polling was interrupted; phase marked failed
  # A missing VPC or AMI is reported through fail (defined elsewhere, expected
  # to abort the run).
  aws_env_prep
  phase_set S1_PREFLIGHT in_progress "running preflight checks"

  # 1) Default VPC. A failed lookup is folded into "None", so it is reported
  #    the same way as a region that has no default VPC.
  local vpc
  vpc=$(aws ec2 describe-vpcs --filters Name=isDefault,Values=true \
    --query 'Vpcs[0].VpcId' --output text 2>/dev/null || echo "None")
  if [ "$vpc" = "None" ] || [ -z "$vpc" ]; then
    phase_set S1_PREFLIGHT failed "no default VPC in this region"
    fail "This region has no default VPC. In the AWS console, go to VPC -> Create default VPC, or choose another region."
  fi
  res_set vpc_id "$vpc"
  log "Default VPC = $vpc"

  # 2) vCPU quota. t3.small requires 2 vCPU. Unknown quota is warned but not blocked.
  local quota
  quota=$(aws service-quotas get-service-quota --service-code ec2 --quota-code "$EC2_STD_QUOTA_CODE" \
    --query 'Quota.Value' --output text 2>/dev/null || echo "unknown")
  quota=${quota:-unknown}
  log "On-Demand Standard instance vCPU quota = $quota (need 2)"
  if _is_unknown_quota "$quota"; then
    warn "Could not read quota; continuing. If run-instances returns VcpuLimitExceeded, quota is insufficient."
  elif ! _num_ge "$quota" 2; then
    phase_set S1_PREFLIGHT waiting_user "vCPU quota=$quota (<2), waiting for quota increase"
    warn "EC2 vCPU quota is $quota (<2), which is common on new AWS accounts."
    warn "Open Service Quotas -> Amazon EC2 ->"
    warn " 'Running On-Demand Standard (A,C,D,H,I,M,R,T,Z) instances' and request quota >= 2."
    warn "After submitting the request, you can leave this running; it checks every ${QUOTA_POLL_INTERVAL:-300}s."
    poll_until "vCPU quota >= 2" "${QUOTA_POLL_INTERVAL:-300}" 0 _quota_ge_2 \
      || { phase_set S1_PREFLIGHT failed "quota polling interrupted"; return 1; }

    # poll_until only reports success or failure, so $quota still holds the
    # insufficient value read before the wait. Re-read it so the summary
    # recorded below reflects the quota that actually unblocked the phase.
    quota=$(aws service-quotas get-service-quota --service-code ec2 --quota-code "$EC2_STD_QUOTA_CODE" \
      --query 'Quota.Value' --output text 2>/dev/null || echo "unknown")
    quota=${quota:-unknown}
    log "On-Demand Standard instance vCPU quota is now $quota"
  fi

  # 3) AMI (amd64/x86).
  local ami
  ami=$(aws_lookup_ubuntu_ami)
  if [ "$ami" = "None" ] || [ -z "$ami" ]; then
    phase_set S1_PREFLIGHT failed "failed to resolve Ubuntu AMI"
    fail "Could not resolve Ubuntu 22.04 amd64 AMI (SSM parameter unavailable)."
  fi
  res_set ami_id "$ami"
  log "AMI = $ami (Ubuntu 22.04 amd64/x86, user=ubuntu)"

  phase_set S1_PREFLIGHT done "vpc=$vpc quota=$quota ami=$ami"
  return 0
}
53
+
54
+ # Values used when quota cannot be read. These warn but do not block.
55
+ _is_unknown_quota() {
56
+ case "$1" in ""|unknown|None|null) return 0;; *) return 1;; esac
57
+ }
58
+
59
+ # Numeric comparison $1 >= $2. Use -v to avoid awk syntax errors on empty/non-numeric input.
60
+ _num_ge() {
61
+ awk -v a="$1" -v b="$2" 'BEGIN{ if (a+0 >= b+0) exit 0; else exit 1 }'
62
+ }
63
+
64
+ # Quota >= 2 check for poll_until. Empty/None counts as not ready.
65
+ _quota_ge_2() {
66
+ local q
67
+ q=$(aws service-quotas get-service-quota --service-code ec2 --quota-code "$EC2_STD_QUOTA_CODE" \
68
+ --query 'Quota.Value' --output text 2>/dev/null || echo "0")
69
+ q=${q:-0}
70
+ _is_unknown_quota "$q" && return 1
71
+ _num_ge "$q" 2
72
+ }
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env bash
2
+ # S2 DOMAIN_DECISION — validate the production Matrix server_name.
3
+ #
4
+ # P2P-IM production deployments require a real, long-lived domain. Temporary
5
+ # sslip.io/public-IP domains are intentionally not part of this interface.
6
+ #
7
+ # Supported modes:
8
+ # DOMAIN_MODE=user user owns DNS; S3 waits until A record points at the EIP
9
+ # DOMAIN_MODE=route53 Route53 hosted zone; ops manages the A record
10
+ #
11
+ # If DOMAIN_MODE is omitted but DOMAIN is present, user mode is assumed.
12
+ # P2P_ASSUME_DEFAULTS never chooses a domain.
13
+
14
# Resolve the parent directory of this script's own directory (works whether
# the file is sourced or executed), then load the domain helpers from lib/.
S2_PHASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")/.." && pwd)"
. "$S2_PHASE_DIR/lib/domain.sh"
16
+
17
run_phase() {
  # S2 entry point: decide and validate the production domain that becomes the
  # Matrix server_name, require explicit confirmation that the binding is
  # irreversible, and persist domain_mode/domain in state.
  #
  # Inputs (environment): DOMAIN, DOMAIN_MODE (user|route53),
  #   CONFIRM_DOMAIN_BINDING=1, DOMAIN_VERIFIED=1 (skips the DNS record hint).
  # Returns:
  #   0  domain accepted; phase marked done
  #   2  blocked on user input/confirmation; phase marked waiting_user
  # An unknown DOMAIN_MODE is reported through fail (defined elsewhere).
  phase_set S2_DOMAIN in_progress "validating production domain"

  local mode="${DOMAIN_MODE:-}"
  local domain="${DOMAIN:-}"

  # No explicit mode: a provided DOMAIN implies user mode; otherwise the
  # domain is asked for interactively, or the phase blocks when stdin is not
  # a terminal.
  if [[ -z "$mode" ]]; then
    if [[ -z "$domain" ]]; then
      if [ ! -t 0 ]; then
        phase_set S2_DOMAIN waiting_user "waiting for production domain"
        warn "Deployment blocked: DOMAIN is missing. P2P-IM no longer supports temporary sslip.io defaults."
        warn "Prepare a production domain such as __DOMAIN__. Matrix server_name binds to that domain; changing it later is effectively a new homeserver identity."
        warn "Example:"
        warn " DOMAIN=__DOMAIN__ DOMAIN_MODE=user CONFIRM_DOMAIN_BINDING=1 bash scripts/orchestrate.sh"
        return 2
      fi

      warn "P2P-IM requires a production domain as the Matrix server_name."
      warn "Changing the domain is effectively a new homeserver identity; temporary sslip.io defaults are not supported."
      printf "Enter the final domain (for example __DOMAIN__): " >&2
      read -r domain
      if [[ -z "$domain" ]]; then
        phase_set S2_DOMAIN waiting_user "waiting for production domain"
        warn "DOMAIN was not provided. Prepare a production domain and DNS control first."
        return 2
      fi
    fi
    mode=user
  fi

  # An explicit mode without a domain cannot proceed.
  if [[ -z "$domain" ]]; then
    phase_set S2_DOMAIN waiting_user "$mode mode requires DOMAIN"
    warn "Deployment blocked: DOMAIN_MODE=$mode requires explicit DOMAIN."
    warn "Example: DOMAIN=__DOMAIN__ DOMAIN_MODE=$mode CONFIRM_DOMAIN_BINDING=1 bash scripts/orchestrate.sh"
    return 2
  fi

  domain=$(domain_normalize "$domain")
  if ! domain_is_formal_name "$domain"; then
    phase_set S2_DOMAIN waiting_user "DOMAIN is not a valid production domain"
    warn "Deployment blocked: DOMAIN=$domain is not a valid production domain."
    warn "Use a long-lived domain you own and can manage in DNS, such as __DOMAIN__. IPs, localhost, wildcards, and temporary resolver domains are not accepted."
    return 2
  fi

  # The operator must acknowledge that the domain cannot be changed later.
  if [[ "${CONFIRM_DOMAIN_BINDING:-0}" != "1" ]]; then
    phase_set S2_DOMAIN waiting_user "domain binding irreversibility not confirmed"
    warn "Deployment blocked: Matrix server_name domain binding must be confirmed."
    warn "After $domain becomes server_name, changing the domain is effectively a new homeserver identity."
    warn "Rerun after confirmation:"
    warn " DOMAIN=$domain DOMAIN_MODE=$mode CONFIRM_DOMAIN_BINDING=1 bash scripts/orchestrate.sh"
    return 2
  fi

  case "$mode" in
    user|route53)
      # Both supported modes persist the same three state entries; only the
      # operator guidance differs.
      state_set domain_mode "$mode"
      state_set domain "$domain"
      state_set_raw domain_confirmed_irreversible 'true'
      if [[ "$mode" == "user" ]]; then
        warn "Domain mode = user ($domain). S3 will wait for the DNS A record to point at the new EIP."
        warn "If DNS is hosted on Cloudflare, set the record to DNS only; do not enable proxying."
      else
        log "Domain mode = route53 ($domain). The agent will create the A record automatically; IAM needs Route53 permissions."
      fi
      ;;
    buy)
      phase_set S2_DOMAIN waiting_user "automatic domain purchase disabled"
      warn "buy mode is disabled: ops will not purchase domains automatically."
      warn "Domain purchase involves billing, identity/compliance steps, and irreversible ownership decisions."
      warn "Prepare the domain manually, then use DOMAIN=$domain with DOMAIN_MODE=user or DOMAIN_MODE=route53."
      return 2
      ;;
    *)
      phase_set S2_DOMAIN failed "unknown DOMAIN_MODE=$mode"
      fail "Unknown DOMAIN_MODE=$mode (expected user|route53; ec2 temporary-domain mode was removed, buy mode is disabled)."
      ;;
  esac

  # Advisory only: a domain with no records yet is allowed, because S3 waits
  # for the A record itself.
  if [[ "${DOMAIN_VERIFIED:-0}" != "1" ]] && ! domain_has_dns_record "$domain"; then
    warn "No A/AAAA/CNAME record was found for $domain."
    warn "If this is a new domain, confirm DNS hosting is active. S3 will still wait for the A record to point at the new EIP."
  fi

  phase_set S2_DOMAIN done "mode=$mode domain=$domain"
  return 0
}
@@ -0,0 +1,421 @@
1
+ #!/usr/bin/env bash
2
+ # S3 PROVISION_EC2 - key pair, security group, cloud-init, EC2, EIP, DNS.
3
+ #
4
+ # Default instance type is x86/amd64 t3.small (2 vCPU / 2GB). Every resource is
5
+ # persisted immediately so deployment can resume and destroy.sh can clean up.
6
+
7
# Resolve the parent directory of this script's own directory (works whether
# the file is sourced or executed), then load the domain helpers from lib/.
S3_PHASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")/.." && pwd)"
. "$S3_PHASE_DIR/lib/domain.sh"
9
+
10
run_phase() {
  # S3 entry point: provision the key pair, security group, cloud-init
  # user-data, EC2 instance and Elastic IP, then (route53 mode) upsert the A
  # record and wait until DNS points at the new address. Every created
  # resource id is stored via res_set as soon as it exists so a rerun can
  # resume and cleanup can find it.
  #
  # Returns:
  #   0  everything provisioned and DNS ready; phase marked done
  #   1  an AWS or rendering step failed; phase marked failed
  #   2  blocked on the user (instance type, domain, DNS, overwrite
  #      confirmation); phase marked waiting_user
  aws_env_prep
  phase_set S3_PROVISION in_progress "provisioning EC2"

  local name region instance_type ami sg vpc
  name=$(state_get run_id)
  # region is not read again in this function; it is kept because helpers
  # called from here can see this function's locals (bash dynamic scoping).
  region=$(state_get region)
  instance_type=$(state_get instance_type)
  if [ -z "$instance_type" ]; then
    instance_type=${INSTANCE_TYPE:-}
    if [ -z "$instance_type" ]; then
      if [ "${P2P_ASSUME_DEFAULTS:-0}" = "1" ]; then
        instance_type=t3.small
      elif [ -t 0 ]; then
        warn "Default EC2 instance type is t3.small (2 vCPU / 2GB). Do you need a larger instance?"
        printf "Use a larger instance? [y/N] " >&2
        local ans chosen
        read -r ans
        if is_yes "$ans"; then
          printf "Enter EC2 instance type [t3.medium]: " >&2
          read -r chosen
          instance_type=${chosen:-t3.medium}
        else
          instance_type=t3.small
        fi
      else
        phase_set S3_PROVISION waiting_user "waiting for EC2 instance type confirmation"
        warn "EC2 instance type must be confirmed. Default t3.small = 2 vCPU / 2GB."
        warn " Use default: INSTANCE_TYPE=t3.small bash scripts/orchestrate.sh"
        warn " Use larger: INSTANCE_TYPE=t3.medium bash scripts/orchestrate.sh"
        warn " Larger instances require matching vCPU quota. If run-instances returns VcpuLimitExceeded, return to S1 and request quota."
        return 2
      fi
    fi
    state_set instance_type "$instance_type"
  fi
  # Optional hook: only invoked when the caller has defined it.
  if declare -F ensure_cost_estimate >/dev/null 2>&1; then
    ensure_cost_estimate
  fi
  ami=$(res_get ami_id)
  vpc=$(res_get vpc_id)
  local message_server_image
  message_server_image=${MESSAGE_SERVER_IMAGE:-direxio/message-server:latest}
  local scripts_dir=${P2P_INSTALL_SCRIPTS_DIR:-${HERE:-$S3_PHASE_DIR}}

  # 1) Key pair (idempotent).
  local keyfile="$P2P_WORKDIR/${name}.pem"
  if [ -z "$(res_get key_name)" ]; then
    log "Creating key pair $name ..."
    # The private key is written inside a subshell with a restrictive umask so
    # a newly created file is never group/world readable, not even briefly.
    # A failed call or an empty file must not be recorded as a usable key,
    # otherwise a rerun would skip key creation.
    if ! ( umask 077; aws ec2 create-key-pair --key-name "$name" --query KeyMaterial --output text > "$keyfile" ) \
      || [ ! -s "$keyfile" ]; then
      rm -f "$keyfile"
      phase_set S3_PROVISION failed "failed to create key pair"
      warn "Failed to create key pair $name. Check AWS permissions and whether a key pair with this name already exists in the region."
      return 1
    fi
    chmod 600 "$keyfile"
    res_set key_name "$name"; res_set key_file "$keyfile"
  else
    log "Key pair already exists; skipping."; keyfile=$(res_get key_file)
  fi

  # 2) Security group (idempotent): 22/80/443 + TURN relay ports.
  if [ -z "$(res_get sg_id)" ]; then
    log "Creating security group (22/80/443 + TURN 3478/49160-49200)..."
    warn "Security group opens 22/80/443, TURN 3478 tcp/udp, and 49160-49200/udp to 0.0.0.0/0."
    warn "Keep the SSH private key, AWS credentials, and password secure."
    sg=$(aws ec2 create-security-group --group-name "$name" \
      --description "p2p-matrix $name" --vpc-id "$vpc" --query GroupId --output text) || sg=""
    if [ -z "$sg" ] || [ "$sg" = "None" ]; then
      # Do not record an empty id: it would be passed on to run-instances.
      phase_set S3_PROVISION failed "failed to create security group"
      warn "Failed to create security group $name. Check the default VPC ($vpc), AWS permissions, and whether a group with this name already exists."
      return 1
    fi
    res_set sg_id "$sg"
    local p ingress_rc=0
    for p in 22 80 443; do
      aws ec2 authorize-security-group-ingress --group-id "$sg" \
        --protocol tcp --port "$p" --cidr 0.0.0.0/0 >/dev/null || ingress_rc=1
    done
    # TURN main port 3478 (udp+tcp); first version does not expose turns:5349.
    aws ec2 authorize-security-group-ingress --group-id "$sg" --protocol tcp --port 3478 --cidr 0.0.0.0/0 >/dev/null || ingress_rc=1
    aws ec2 authorize-security-group-ingress --group-id "$sg" --protocol udp --port 3478 --cidr 0.0.0.0/0 >/dev/null || ingress_rc=1
    # Narrow TURN UDP relay range to 49160-49200.
    aws ec2 authorize-security-group-ingress --group-id "$sg" --protocol udp --port 49160-49200 --cidr 0.0.0.0/0 >/dev/null || ingress_rc=1
    if [ "$ingress_rc" -ne 0 ]; then
      # Stop before launching an instance behind an incomplete rule set.
      phase_set S3_PROVISION failed "failed to authorize security group ingress"
      warn "One or more ingress rules could not be added to security group $sg."
      warn "Add the missing rules manually (tcp 22/80/443/3478, udp 3478, udp 49160-49200) or run destroy.sh before rerunning; a rerun reuses the recorded group without re-adding rules."
      return 1
    fi
  else
    log "Security group already exists; skipping."; sg=$(res_get sg_id)
  fi

  # 3) Render cloud-init with compose/Caddyfile/init-tokens embedded.
  local domain_mode domain
  domain_mode=$(state_get domain_mode)
  domain=$(state_get domain)
  domain=$(domain_normalize "$domain")
  if [ -z "$domain" ]; then
    phase_set S3_PROVISION waiting_user "production domain missing"
    warn "S3 requires a production DOMAIN. Complete S2_DOMAIN first."
    return 2
  fi
  local userdata="$P2P_WORKDIR/user-data.yaml"
  log "Rendering cloud-init (domain_mode=$domain_mode)..."
  # A failed or empty render must stop here: launching with missing user-data
  # would produce an instance that never bootstraps the stack.
  if ! bash "$scripts_dir/render/render-userdata.sh" \
    --domain "$domain" \
    --acme "${ACME_EMAIL:-}" \
    --message-server-image "$message_server_image" \
    > "$userdata" || [ ! -s "$userdata" ]; then
    phase_set S3_PROVISION failed "failed to render cloud-init user-data"
    warn "render-userdata.sh failed or produced no output ($scripts_dir/render/render-userdata.sh)."
    return 1
  fi
  # The file:// argument needs a Windows-style path when cygpath is available.
  local userdata_aws="$userdata"
  if command -v cygpath >/dev/null 2>&1; then
    userdata_aws=$(cygpath -w "$userdata")
  fi

  # 4) Launch EC2 (idempotent: reuse running/pending instance).
  local iid
  iid=$(res_get instance_id)
  if [ -n "$iid" ] && aws ec2 describe-instances --instance-ids "$iid" \
    --query 'Reservations[0].Instances[0].State.Name' --output text 2>/dev/null \
    | grep -qE 'running|pending'; then
    log "Instance $iid already exists; skipping creation."
  else
    log "Launching EC2 instance (x86 $instance_type, $ami)..."
    iid=$(aws ec2 run-instances --image-id "$ami" --instance-type "$instance_type" \
      --key-name "$name" --security-group-ids "$sg" \
      --user-data "file://$userdata_aws" \
      --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=$name}]" \
      --query 'Instances[0].InstanceId' --output text) || {
      phase_set S3_PROVISION failed "run-instances failed (possibly VcpuLimitExceeded)"
      warn "run-instances failed. If the error is VcpuLimitExceeded, return to S1 and request quota."
      return 1
    }
    res_set instance_id "$iid"
    log "Waiting for instance to become running ..."
    aws ec2 wait instance-running --instance-ids "$iid" || {
      phase_set S3_PROVISION failed "instance did not become running before timeout"
      warn "Timed out waiting for instance running. Check status with aws ec2 describe-instances --instance-ids $iid, then rerun to resume."
      return 1
    }
  fi
  _record_root_volume_id "$iid"

  # 5) Public address. Production-domain deployments require EIP for stable DNS.
  local pubip
  if [ -z "$(res_get eip_id)" ]; then
    log "Allocating and associating Elastic IP ..."
    local eip
    eip=$(aws ec2 allocate-address --domain vpc --query AllocationId --output text) || {
      phase_set S3_PROVISION failed "failed to allocate EIP"
      warn "Failed to allocate Elastic IP. Check EIP quota, region, and AWS permissions."
      return 1
    }
    [ -n "$eip" ] && [ "$eip" != "None" ] || {
      phase_set S3_PROVISION failed "EIP allocation returned no AllocationId"
      warn "Elastic IP allocation returned no AllocationId. Check AWS response and rerun."
      return 1
    }
    res_set eip_id "$eip"
    aws ec2 associate-address --instance-id "$iid" --allocation-id "$eip" >/dev/null || {
      phase_set S3_PROVISION failed "failed to associate EIP"
      warn "Failed to associate Elastic IP with the instance. Check instance status, EIP quota, and AWS permissions."
      return 1
    }
  fi
  pubip=$(aws ec2 describe-addresses --allocation-ids "$(res_get eip_id)" \
    --query 'Addresses[0].PublicIp' --output text) || {
    phase_set S3_PROVISION failed "failed to read EIP public IP"
    warn "Failed to read Elastic IP address. Check AllocationId=$(res_get eip_id)."
    return 1
  }
  [ -n "$pubip" ] && [ "$pubip" != "None" ] || {
    phase_set S3_PROVISION failed "EIP returned no public IP"
    warn "Elastic IP returned no public IP. Check AWS address allocation status."
    return 1
  }
  res_set public_ip "$pubip"
  log "Public IP = $pubip; domain = $(state_get domain)"

  # 6) DNS. In route53 mode the A record is managed here; both modes then
  #    wait until the domain actually resolves to the new address.
  if [ "$domain_mode" = "route53" ]; then
    local route53_rc=0
    _upsert_route53_record "$domain" "$pubip" || route53_rc=$?
    [ "$route53_rc" -eq 0 ] || return "$route53_rc"
  fi

  if [ "$domain_mode" = "user" ] || [ "$domain_mode" = "route53" ]; then
    _require_user_dns_ready "$domain_mode" "$domain" "$pubip" "$instance_type" || return 2
  fi

  phase_set S3_PROVISION done "instance=$iid ip=$pubip domain=$(state_get domain)"
  return 0
}
187
+
188
_record_root_volume_id() {
  # Best-effort bookkeeping: look up the first EBS volume attached to instance
  # $1 and store it as resource root_volume_id. A missing instance id, a
  # failed lookup, or an empty/"None" answer is not an error (returns 0).
  local iid=$1 volume_id
  if [ -z "$iid" ]; then
    return 0
  fi
  if ! volume_id=$(aws ec2 describe-instances --instance-ids "$iid" \
    --query 'Reservations[0].Instances[0].BlockDeviceMappings[?Ebs.VolumeId!=`null`].Ebs.VolumeId | [0]' \
    --output text 2>/dev/null); then
    return 0
  fi
  case "$volume_id" in
    ""|None) return 0 ;;
  esac
  res_set root_volume_id "$volume_id"
}
197
+
198
_upsert_route53_record() {
  # Point the A record for $1 (domain) at $2 (public IP) in Route53 and wait
  # for the change to complete.
  #
  # Returns:
  #   0          record submitted and the change finished
  #   1          zone unavailable, upsert failed, or the wait failed
  #   otherwise  whatever _guard_route53_a_overwrite returned when it blocked
  local domain=$1 pubip=$2 zone zone_id zone_name change_file change_id

  if ! zone=$(_find_or_create_route53_zone "$domain"); then
    phase_set S3_PROVISION failed "Route53 hosted zone unavailable"
    warn "DOMAIN_MODE=route53 requires Route53 permission to list or create the hosted zone for $domain."
    return 1
  fi

  # The helper answers with "<zone id><TAB><zone name>".
  zone_id=$(cut -f1 <<<"$zone")
  zone_name=$(cut -f2 <<<"$zone")
  if [ -z "$zone_id" ]; then
    phase_set S3_PROVISION failed "Route53 hosted zone not found"
    warn "DOMAIN_MODE=route53 requires the parent domain of $domain to exist in a Route53 hosted zone."
    return 1
  fi

  # Never silently replace an A record that points somewhere else.
  local guard_rc=0
  _guard_route53_a_overwrite "$zone_id" "$domain" "$pubip" || guard_rc=$?
  if [ "$guard_rc" -ne 0 ]; then
    return "$guard_rc"
  fi

  log "Route53 upsert: $domain A $pubip (zone=$zone_name)"

  # The change batch is handed to the CLI as a file; the path is converted
  # with cygpath when that tool is available.
  change_file=$(mktemp)
  cat > "$change_file" <<EOF
{
  "Comment": "p2p-matrix deployment",
  "Changes": [
    {
      "Action": "UPSERT",
      "ResourceRecordSet": {
        "Name": "$domain.",
        "Type": "A",
        "TTL": 60,
        "ResourceRecords": [{ "Value": "$pubip" }]
      }
    }
  ]
}
EOF
  local change_file_aws="$change_file"
  if command -v cygpath >/dev/null 2>&1; then
    change_file_aws=$(cygpath -w "$change_file")
  fi

  if ! change_id=$(aws route53 change-resource-record-sets \
    --hosted-zone-id "$zone_id" \
    --change-batch "file://$change_file_aws" \
    --query 'ChangeInfo.Id' --output text); then
    rm -f "$change_file"
    phase_set S3_PROVISION failed "Route53 upsert failed"
    return 1
  fi
  rm -f "$change_file"

  if ! aws route53 wait resource-record-sets-changed --id "$change_id"; then
    phase_set S3_PROVISION failed "Route53 change did not complete"
    return 1
  fi
  return 0
}
251
+
252
_route53_existing_a_value() {
  # Print the value(s) of the existing A record named "$2." in hosted zone $1,
  # comma-joined, or nothing when no such record is found. A failed listing is
  # treated as "no record": nothing is printed and the function returns 0.
  local zone_id=$1 domain=$2 listing fqdn
  fqdn="${domain}."
  if ! listing=$(aws route53 list-resource-record-sets --hosted-zone-id "$zone_id" --output json 2>/dev/null); then
    return 0
  fi
  # Only the first matching record set is reported.
  jq -r --arg name "$fqdn" '
    .ResourceRecordSets[]?
    | select(.Name == $name and .Type == "A")
    | [.ResourceRecords[]?.Value]
    | join(",")
  ' <<<"$listing" | sed -n '1p'
}
263
+
264
_guard_route53_a_overwrite() {
  # Refuse to replace an existing A record that points somewhere else unless
  # the operator opted in via DIREXIO_CONFIRM_DNS_OVERWRITE=1 (or the older
  # CONFIRM_DNS_OVERWRITE=1).
  #
  # Args: $1 hosted zone id, $2 domain, $3 new public IP.
  # Returns:
  #   0  no existing record, record already matches, or overwrite confirmed
  #   2  overwrite needs confirmation; phase marked waiting_user
  local zone_id=$1 domain=$2 pubip=$3 current_value overwrite_ok
  current_value=$(_route53_existing_a_value "$zone_id" "$domain")
  if [ -z "$current_value" ] || [ "$current_value" = "$pubip" ]; then
    return 0
  fi

  # Record both sides of the conflict.
  res_set route53_existing_a_value "$current_value"
  res_set route53_pending_a_value "$pubip"

  overwrite_ok=${DIREXIO_CONFIRM_DNS_OVERWRITE:-${CONFIRM_DNS_OVERWRITE:-0}}
  if [ "$overwrite_ok" != "1" ]; then
    phase_set S3_PROVISION waiting_user "Route53 A record overwrite requires confirmation"
    warn "Route53 A record overwrite requires confirmation for $domain."
    warn "Current A record: $current_value"
    warn "New deployment IP: $pubip"
    warn "If this is intentional, rerun with DIREXIO_CONFIRM_DNS_OVERWRITE=1."
    return 2
  fi

  res_set route53_overwrite_confirmed "true"
  warn "Route53 A record overwrite confirmed: $domain $current_value -> $pubip."
  return 0
}
286
+
287
_route53_zone_from_state() {
  # Print "<zone id><TAB><zone name>" from the stored resources
  # route53_zone_id / route53_zone_name. Returns 1 (printing nothing) unless
  # both are present.
  local stored_id stored_name
  stored_id=$(res_get route53_zone_id)
  stored_name=$(res_get route53_zone_name)
  if [ -z "$stored_id" ] || [ -z "$stored_name" ]; then
    return 1
  fi
  printf '%s\t%s\n' "$stored_id" "$stored_name"
}
294
+
295
_record_route53_zone() {
  # Persist the hosted zone chosen for this deployment.
  #
  # Args:
  #   $1 zone id
  #   $2 zone name (a trailing dot is stripped before storing)
  #   $3 "true" when this deployer created the zone (default: false)
  #   $4 comma-separated name servers (optional)
  # Returns: 0 when there are no name servers to store; otherwise the status
  #   of the final res_set.
  local zone_id=$1 zone_name=$2 created=${3:-false} name_servers=${4:-}
  res_set route53_zone_id "$zone_id"
  res_set route53_zone_name "${zone_name%.}"
  # The "created by deployer" flag is written when it is still unset, or when
  # the zone was just created; a later created=false call never replaces an
  # existing value.
  if [ -z "$(res_get route53_zone_created_by_deployer)" ] || [ "$created" = "true" ]; then
    res_set route53_zone_created_by_deployer "$created"
  fi
  # An if-statement (not `[ ... ] && cmd`) so that the common case of no name
  # servers does not make the function report failure.
  if [ -n "$name_servers" ]; then
    res_set route53_name_servers "$name_servers"
  fi
}
304
+
305
_find_or_create_route53_zone() {
  # Resolve the hosted zone for domain $1 and print "<id><TAB><name>".
  # Order of preference: zone already stored in state, then an existing zone
  # found by _find_route53_zone (which is then recorded as not created by the
  # deployer), then a newly created zone.
  #
  # Returns 0 on success; 1 when the lookup failed for a reason other than
  # "no matching zone"; otherwise the status of _create_route53_zone.
  local domain=$1 zone lookup_rc=0

  if zone=$(_route53_zone_from_state); then
    printf '%s\n' "$zone"
    return 0
  fi

  zone=$(_find_route53_zone "$domain") || lookup_rc=$?
  if [ "$lookup_rc" -eq 0 ]; then
    _record_route53_zone "$(cut -f1 <<<"$zone")" "$(cut -f2 <<<"$zone")" false
    printf '%s\n' "$zone"
    return 0
  fi

  # Only a clean "not found" (status 1) may lead to creating a zone; any other
  # lookup status is passed back to the caller as failure.
  if [ "$lookup_rc" -ne 1 ]; then
    return 1
  fi
  _create_route53_zone "$domain"
}
327
+
328
_find_route53_zone() {
  # Find the most specific public Route53 hosted zone that contains domain $1
  # (the zone name equals the domain or is a dot-separated parent of it) and
  # print "<zone id><TAB><zone name>" with the "/hostedzone/" prefix and the
  # trailing dot removed.
  #
  # Returns:
  #   0  a matching zone was printed
  #   1  the listing worked but no zone matches
  #   2  the zones could not be listed or the response could not be parsed
  # The caller creates a new (billable) zone on status 1, so every lookup or
  # parse failure must surface as 2, never as "not found".
  local domain=$1 best_id="" best_name="" best_len=0 id name clean len zones_json zone_rows
  zones_json=$(aws route53 list-hosted-zones --output json) || return 2

  # Private hosted zones are skipped: a record placed there would not be
  # publicly resolvable. Zones without a Config block are kept.
  zone_rows=$(jq -r '.HostedZones[]
    | select(.Config.PrivateZone != true)
    | [.Id, .Name]
    | @tsv' <<<"$zones_json") || return 2

  while IFS=$'\t' read -r id name; do
    # Strip carriage returns in case the tools emit CRLF line endings.
    id=${id%$'\r'}
    name=${name%$'\r'}
    [ -n "$id" ] || continue
    clean=${name%.}
    case "$domain" in
      "$clean"|*."$clean")
        # Longest zone name wins, i.e. the closest parent zone.
        len=${#clean}
        if [ "$len" -gt "$best_len" ]; then
          best_id=${id#/hostedzone/}
          best_name=$clean
          best_len=$len
        fi
        ;;
    esac
  done <<<"$zone_rows"

  [ -n "$best_id" ] || return 1
  printf '%s\t%s\n' "$best_id" "$best_name"
}
349
+
350
_create_route53_zone() {
  # Create a Route53 hosted zone for domain $1 (or for
  # DIREXIO_ROUTE53_ZONE_NAME when set), record it as created by the deployer,
  # and print "<zone id><TAB><zone name>".
  # Returns 1 when the create call fails or the response lacks an id or name.
  local domain=$1 zone_name caller created zone_id returned_name name_servers
  zone_name=${DIREXIO_ROUTE53_ZONE_NAME:-$domain}
  # The caller reference is built from the run id plus a UTC timestamp.
  caller="direxio-$(state_get run_id)-$(date -u +%Y%m%d%H%M%S)"

  if ! created=$(aws route53 create-hosted-zone \
    --name "$zone_name" \
    --caller-reference "$caller" \
    --output json); then
    return 1
  fi

  zone_id=$(jq -r '.HostedZone.Id // empty' <<<"$created")
  zone_id=${zone_id#/hostedzone/}
  returned_name=$(jq -r '.HostedZone.Name // empty' <<<"$created")
  name_servers=$(jq -r '(.DelegationSet.NameServers // []) | join(",")' <<<"$created")
  if [ -z "$zone_id" ] || [ -z "$returned_name" ]; then
    return 1
  fi

  _record_route53_zone "$zone_id" "${returned_name%.}" true "$name_servers"
  warn "Created Route53 hosted zone ${returned_name%.} (id=$zone_id). This hosted zone is billable until deleted."
  if [ -n "$name_servers" ]; then
    warn "Route53 nameservers: $name_servers"
    warn "If the domain is registered outside Route53, delegate NS at the registrar before DNS can resolve."
  fi
  printf '%s\t%s\n' "$zone_id" "${returned_name%.}"
}
371
+
372
_require_user_dns_ready() {
  # Gate before S4: succeed only when the domain really resolves to the new
  # public IP. An environment flag or an interactive "yes" never replaces the
  # actual DNS check.
  #
  # Args: $1 domain mode (user|route53), $2 domain, $3 public IP,
  #       $4 instance type (only used in the rerun hint).
  # Returns:
  #   0  DNS resolves to the IP; state dns_ready set to true
  #   2  still waiting; phase marked waiting_user
  local domain_mode=$1 domain=$2 pubip=$3 instance_type=$4

  # A stored dns_ready=true is re-verified and reset when it no longer holds.
  if [[ "$(state_get dns_ready)" == "true" ]]; then
    if domain_resolves_to_ip "$domain" "$pubip"; then
      return 0
    fi
    warn "state has dns_ready=true, but current DNS does not point to $pubip. Continuing to wait to avoid early certificate issuance."
    state_set_raw dns_ready 'false'
  fi

  if domain_resolves_to_ip "$domain" "$pubip"; then
    ok "DNS resolves to $pubip: $domain"
    state_set_raw dns_ready 'true'
    return 0
  fi

  # The operator claims DNS is ready: check once more, but do not trust the flag.
  if [[ "${DNS_READY:-0}" == "1" || "${CONFIRM_DNS_READY:-0}" == "1" ]]; then
    if domain_resolves_to_ip "$domain" "$pubip"; then
      state_set_raw dns_ready 'true'
      return 0
    fi
    warn "DNS_READY is set, but $domain does not resolve to ${pubip} yet. Waiting to avoid Caddy/Let's Encrypt racing DNS."
  fi

  case "$domain_mode" in
    route53)
      warn "Route53 A record was submitted, but $domain does not resolve to ${pubip} yet."
      warn "This is usually DNS propagation delay; rerun later to continue."
      ;;
    *)
      warn "Update DNS so $domain has an A record pointing to this EC2 public IP:"
      warn " $domain A $pubip"
      warn "Use a subdomain such as __DOMAIN__. If DNS is on Cloudflare, set it to DNS only; do not enable proxying."
      ;;
  esac
  warn "Use this command to confirm DNS now points at the new IP:"
  warn " dig +short $domain"
  warn "Continue to S4 only after DNS is active, otherwise Caddy cannot issue the Let's Encrypt certificate."

  # Interactive user mode: offer one more check after the operator says the
  # record was updated.
  if [[ "$domain_mode" == "user" ]] && [ -t 0 ]; then
    printf "Have you updated the DNS A record and waited for propagation? [y/N] " >&2
    local ans
    read -r ans
    if is_yes "$ans"; then
      if domain_resolves_to_ip "$domain" "$pubip"; then
        state_set_raw dns_ready 'true'
        return 0
      fi
      warn "$domain still does not resolve to $pubip; confirmation alone is not enough to continue."
    fi
  fi

  phase_set S3_PROVISION waiting_user "waiting for DNS A record $domain -> $pubip"
  warn "After DNS is ready, rerun:"
  warn " DOMAIN=$domain DOMAIN_MODE=$domain_mode CONFIRM_DOMAIN_BINDING=1 INSTANCE_TYPE=$instance_type bash scripts/orchestrate.sh"
  return 2
}
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env bash
2
+ # S4 BOOTSTRAP_STACK - cloud-init installs Docker, starts the stack, and gets TLS.
3
+ # The local agent polls https://<domain>/healthz until it returns 200.
4
+
5
run_phase() {
  # S4 entry point: wait until https://<domain>/healthz answers successfully,
  # polling through poll_until with _healthz_ok as the probe.
  #
  # Tunables (environment): HEALTH_POLL_INTERVAL (default 10),
  #   HEALTH_POLL_MAX (default 90), HEALTH_CURL_CONNECT_TIMEOUT (default 10),
  #   HEALTH_CURL_MAX_TIME (default 20).
  # Returns:
  #   0  health check passed; phase marked done
  #   1  polling gave up; phase marked failed and SSH hints are printed
  phase_set S4_BOOTSTRAP_STACK polling "waiting for instance bootstrap and services"

  local site host_ip ssh_key connect_timeout max_time
  site=$(state_get domain)
  host_ip=$(res_get public_ip)
  ssh_key=$(res_get key_file)
  connect_timeout=${HEALTH_CURL_CONNECT_TIMEOUT:-10}
  max_time=${HEALTH_CURL_MAX_TIME:-20}

  log "Waiting for bootstrap (install Docker -> start postgres/message-server/caddy/coturn -> issue Let's Encrypt certificate)..."
  log "First image pull and certificate issuance usually take 5-10 minutes. Checking https://$site/healthz every ${HEALTH_POLL_INTERVAL:-10}s (curl connect timeout ${connect_timeout}s, max ${max_time}s) ..."

  if ! poll_until "health check https://$site/healthz == 200" \
    "${HEALTH_POLL_INTERVAL:-10}" "${HEALTH_POLL_MAX:-90}" _healthz_ok "$site"; then
    phase_set S4_BOOTSTRAP_STACK failed "healthz did not return 200 before timeout"
    warn "Health check timed out. Inspect cloud-init logs over SSH:"
    warn " ssh -i $ssh_key ubuntu@$host_ip 'sudo tail -n 80 /var/log/cloud-init-output.log; cd /opt/p2p && sudo docker compose ps && sudo docker compose logs message-server --tail=80'"
    warn "See references/troubleshooting.md for targeted troubleshooting."
    return 1
  fi

  phase_set S4_BOOTSTRAP_STACK done "healthz 200 @ https://$site"
  return 0
}
29
+
30
_healthz_ok() {
  # Probe https://$1/healthz with curl; exit status 0 when a probe succeeds.
  # When a public IP is stored, the first attempt pins the domain to that IP
  # via --resolve; if that attempt fails (or no IP is stored) a plain request
  # using normal name resolution is made. curl runs with -s -k -f, so TLS
  # certificate verification is skipped and HTTP errors count as failure.
  local domain=$1 pinned_ip url
  local -a probe_opts
  pinned_ip=$(res_get public_ip)
  url="https://$domain/healthz"
  probe_opts=(-skf --connect-timeout "${HEALTH_CURL_CONNECT_TIMEOUT:-10}" --max-time "${HEALTH_CURL_MAX_TIME:-20}")

  if [ -n "$pinned_ip" ] \
    && curl "${probe_opts[@]}" --resolve "$domain:443:$pinned_ip" "$url" >/dev/null 2>&1; then
    return 0
  fi
  curl "${probe_opts[@]}" "$url" >/dev/null 2>&1
}