direxio-deployer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/AGENTS.md +92 -0
  2. package/LICENSE +21 -0
  3. package/README.md +221 -0
  4. package/README_zh.md +218 -0
  5. package/SKILL.md +722 -0
  6. package/agents/README.md +25 -0
  7. package/agents/openai.yaml +12 -0
  8. package/bin/direxio-deployer.mjs +375 -0
  9. package/package.json +28 -0
  10. package/references/agent-targets.md +128 -0
  11. package/references/architecture.md +44 -0
  12. package/references/bug-history.md +78 -0
  13. package/references/deployment-lessons.md +218 -0
  14. package/references/deployment-optimization-audit.md +317 -0
  15. package/references/deployment-workflow.md +341 -0
  16. package/references/iam-policy.json +52 -0
  17. package/references/runtime-wiring.md +209 -0
  18. package/references/state-machine.md +46 -0
  19. package/references/token-refresh.md +81 -0
  20. package/references/tooling.md +106 -0
  21. package/references/troubleshooting.md +26 -0
  22. package/references/user-journey.md +75 -0
  23. package/references/verification-recovery.md +84 -0
  24. package/references/voip-turn-runbook.md +154 -0
  25. package/references/windows-deployment-notes.md +119 -0
  26. package/scripts/aws-credentials.sh +195 -0
  27. package/scripts/cloud-init/Caddyfile +48 -0
  28. package/scripts/cloud-init/docker-compose.yml +125 -0
  29. package/scripts/cloud-init/init-tokens.sh +238 -0
  30. package/scripts/cloud-init/user-data.yaml +40 -0
  31. package/scripts/destroy.ps1 +77 -0
  32. package/scripts/destroy.sh +589 -0
  33. package/scripts/lib/aws.sh +73 -0
  34. package/scripts/lib/domain.sh +175 -0
  35. package/scripts/lib/operation_report.sh +240 -0
  36. package/scripts/lib/ops.sh +230 -0
  37. package/scripts/lib/paths.sh +35 -0
  38. package/scripts/lib/state.sh +137 -0
  39. package/scripts/mcp-tools-list.mjs +95 -0
  40. package/scripts/orchestrate.ps1 +112 -0
  41. package/scripts/orchestrate.sh +1126 -0
  42. package/scripts/phases/s0_prereq_aws.sh +39 -0
  43. package/scripts/phases/s1_preflight.sh +72 -0
  44. package/scripts/phases/s2_domain.sh +103 -0
  45. package/scripts/phases/s3_provision.sh +421 -0
  46. package/scripts/phases/s4_bootstrap_stack.sh +38 -0
  47. package/scripts/phases/s5_init_tokens.sh +118 -0
  48. package/scripts/phases/s6_wire_local.sh +1435 -0
  49. package/scripts/phases/s7_verify_e2e.sh +136 -0
  50. package/scripts/pricing-estimate.sh +256 -0
  51. package/scripts/render/render-userdata.sh +86 -0
  52. package/scripts/reset-app-data.sh +40 -0
  53. package/scripts/update.sh +30 -0
  54. package/tests/aws_credentials_test.sh +139 -0
  55. package/tests/connect_daemon_runtime_check_test.sh +120 -0
  56. package/tests/default_paths_test.sh +58 -0
  57. package/tests/destroy_local_bridge_test.sh +154 -0
  58. package/tests/destroy_root_identity_test.sh +91 -0
  59. package/tests/destroy_route53_zone_test.sh +80 -0
  60. package/tests/domain_authoritative_dns_test.sh +49 -0
  61. package/tests/mcp_doctor_runtime_check_test.sh +86 -0
  62. package/tests/mcp_smoke_runtime_check_test.sh +121 -0
  63. package/tests/mcp_tools_runtime_check_test.sh +123 -0
  64. package/tests/npm_skill_distribution_test.sh +95 -0
  65. package/tests/operation_report_test.sh +258 -0
  66. package/tests/orchestrate_status_recovery_test.sh +91 -0
  67. package/tests/phase_timeout_test.sh +88 -0
  68. package/tests/pricing_estimate_test.sh +159 -0
  69. package/tests/render_userdata_remote_nodes_test.sh +40 -0
  70. package/tests/root_volume_tracking_test.sh +41 -0
  71. package/tests/route53_overwrite_guard_test.sh +86 -0
  72. package/tests/route53_zone_auto_create_test.sh +66 -0
  73. package/tests/runtime_summary_check_test.sh +203 -0
  74. package/tests/s6_wire_local_test.sh +405 -0
  75. package/tests/skill_structure_test.sh +298 -0
  76. package/tests/update_reset_ops_test.sh +230 -0
  77. package/tests/user_confirmation_gates_test.sh +152 -0
@@ -0,0 +1,1126 @@
1
+ #!/usr/bin/env bash
2
+ # orchestrate.sh - p2p-matrix deployment state-machine engine.
3
+ #
4
+ # Turns "one AWS credential -> working IM server -> local direxio-connect bridge" into 8 phases
5
+ # (S0..S7). State is persisted to $P2P_WORKDIR/state.json and supports:
6
+ # - resume: continue from the first unfinished phase
7
+ # - checkpoints: wait for user/AWS actions without losing progress
8
+ # - destroy: every AWS resource is recorded for destroy.sh
9
+ #
10
+ # Usage:
11
+ # export AWS_ACCESS_KEY_ID=... AWS_SECRET_ACCESS_KEY=... AWS_DEFAULT_REGION=us-east-1
12
+ # export MESSAGE_SERVER_IMAGE=direxio/message-server:latest
13
+ # # First run asks for region, production domain, instance size, and existing-state handling.
14
+ # # Non-interactive:
15
+ # # DOMAIN=__DOMAIN__ DOMAIN_MODE=user CONFIRM_DOMAIN_BINDING=1 INSTANCE_TYPE=t3.small
16
+ # bash orchestrate.sh # run or resume until completion
17
+ # DOMAIN=__DOMAIN__ bash orchestrate.sh status # show current service state only
18
+ # bash orchestrate.sh reset # archive state.json; destroy will no longer know the resources
19
+ #
20
+ # Exit codes: 0=DONE / 1=phase failed / 2=waiting for user action.
21
# Unset variables are errors and a pipeline fails when any stage fails.
# There is no -e: the functions below inspect return codes explicitly.
set -uo pipefail

# Absolute path of the directory that contains this script.
HERE=$(cd "$(dirname "$0")" && pwd)
P2P_INSTALL_SCRIPTS_DIR="$HERE"

# Workspace-local tools win over system ones when the directory exists.
# On Windows, jq may have been downloaded to .tools/bin/jq.exe; Git Bash/MSYS
# only finds it when that directory is placed in front of PATH.
REPO_ROOT=$(cd "$HERE/.." && pwd)
if [ -d "$REPO_ROOT/.tools/bin" ]; then
  export PATH="$REPO_ROOT/.tools/bin:$PATH"
fi

# "x" when the caller provided the variable (even if empty), "" otherwise.
# NOTE(review): presumably captured here because the sourced libs may assign
# defaults to these variables afterwards - confirm against lib/state.sh.
P2P_WORKDIR_WAS_SET=${P2P_WORKDIR+x}
DIREXIO_WORKDIR_WAS_SET=${DIREXIO_WORKDIR+x}
38
+ source "$HERE/lib/state.sh"
39
+ source "$HERE/lib/aws.sh"
40
+ source "$HERE/lib/domain.sh"
41
+ source "$HERE/lib/operation_report.sh"
42
+
43
# Map a phase name to the script that implements it.
# A plain case statement is used instead of an associative array so the
# script keeps working on macOS bash 3.2.
# Arguments: $1 - phase name (S0_PREREQ_AWS .. S7_VERIFY_E2E)
# Outputs:   the script path under $HERE/phases, or an empty line for an
#            unknown phase name
phase_file() {
  local script_name
  case "$1" in
    S0_PREREQ_AWS)      script_name=s0_prereq_aws.sh ;;
    S1_PREFLIGHT)       script_name=s1_preflight.sh ;;
    S2_DOMAIN)          script_name=s2_domain.sh ;;
    S3_PROVISION)       script_name=s3_provision.sh ;;
    S4_BOOTSTRAP_STACK) script_name=s4_bootstrap_stack.sh ;;
    S5_INIT_TOKENS)     script_name=s5_init_tokens.sh ;;
    S6_WIRE_LOCAL)      script_name=s6_wire_local.sh ;;
    S7_VERIFY_E2E)      script_name=s7_verify_e2e.sh ;;
    *)
      printf '\n'
      return 0
      ;;
  esac
  printf '%s\n' "$HERE/phases/$script_name"
}
57
+
58
# Verify that the external commands this script relies on are on PATH.
# Returns 0 when aws, jq, ssh, scp and curl are all found. Otherwise it
# prints installation hints through warn and finishes by calling fail.
check_deps() {
  local tool missing=""
  for tool in aws jq ssh scp curl; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      missing="$missing $tool"
    fi
  done
  if [ -z "$missing" ]; then
    return 0
  fi

  # $missing keeps its leading space, so the message reads "...: aws jq".
  warn "Missing dependencies:$missing"
  if [[ " $missing " == *" jq "* ]]; then
    warn "jq is required for state.json. If this workspace has .tools/bin/jq.exe, run from a POSIX shell that can see that path."
  fi
  if [[ " $missing " == *" aws "* ]]; then
    warn "Install AWS CLI v2 and configure credentials first:"
    warn " macOS: curl 'https://awscli.amazonaws.com/AWSCLIV2.pkg' -o AWSCLIV2.pkg && sudo installer -pkg ./AWSCLIV2.pkg -target /"
    warn " Linux x86_64: curl 'https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip' -o awscliv2.zip && unzip awscliv2.zip && sudo ./aws/install"
    warn " Configure: aws configure --profile p2p-matrix"
    warn " Use: export AWS_PROFILE=p2p-matrix AWS_DEFAULT_REGION=<region>"
    warn "See references/user-journey.md for the AWS CLI setup guide."
  fi
  warn "On Windows, use a working POSIX Bash environment such as Git Bash, MSYS2, Cygwin, or WSL. Do not assume C:\\Windows\\System32\\bash.exe is usable; verify with: bash -lc 'echo ok'."
  fail "Install the missing dependencies and rerun."
}
85
+
86
# Execute a single phase.
# Any run_phase left over from a previously sourced phase is removed first,
# then the phase script is sourced and the run_phase it defines is called.
# Arguments: $1 - phase name understood by phase_file
# Returns:   whatever run_phase returns
run_one_phase() {
  local phase_name=$1 script_path
  script_path=$(phase_file "$phase_name")
  if [ -z "$script_path" ] || [ ! -f "$script_path" ]; then
    fail "Phase script not found: $phase_name ($script_path)"
  fi
  # Drop a stale definition so an earlier phase's run_phase cannot run again.
  unset -f run_phase 2>/dev/null || true
  # shellcheck disable=SC1090
  source "$script_path"
  run_phase
}
95
+
96
# List every local service that has a state.json under
# ${DIREXIO_HOME:-$HOME/.direxio}/nodes, one summary entry per service.
# Used by `status` when no specific deployment was selected.
# Returns 0 when the nodes directory is missing (after warning about it).
cmd_status_inventory() {
  local nodes_dir state_file svc_dir domain phase instance current found=0
  nodes_dir="${DIREXIO_HOME:-$HOME/.direxio}/nodes"
  if [ ! -d "$nodes_dir" ]; then
    warn "No local service directory found: $nodes_dir"
    warn "Set DOMAIN=<service domain> when running or inspecting a specific deployment."
    return 0
  fi

  echo "local services: $nodes_dir"
  for state_file in "$nodes_dir"/*/state.json; do
    # An unmatched glob stays literal; skip anything that is not a real file.
    [ -f "$state_file" ] || continue
    found=1
    svc_dir=${state_file%/state.json}
    domain=$(jq -r '.domain // empty' "$state_file")
    phase=$(jq -r '.phase // empty' "$state_file")
    instance=$(jq -r '.resources.instance_id // empty' "$state_file")
    # Fall back to the stored phase when the state machine cannot be evaluated.
    current=${phase:-unknown}
    if STATE_JSON="$state_file" first_unfinished_phase >/dev/null 2>&1; then
      current=$(STATE_JSON="$state_file" first_unfinished_phase)
    fi
    printf " %-32s current=%-18s instance=%s\n" "${domain:-$(basename "$svc_dir")}" "${current:-unknown}" "${instance:-none}"
    printf " service_dir=%s\n" "$svc_dir"
    printf " state_json=%s\n" "$state_file"
  done

  if [ "$found" -eq 0 ]; then
    warn "No service state files found under $nodes_dir"
  fi
}
128
+
129
# Describe, in user-facing words, what is not ready while a phase is the
# current one.
# Arguments: $1 - phase name, DONE, or anything else
# Outputs:   one sentence on stdout
phase_user_meaning() {
  local meaning
  case "$1" in
    S0_PREREQ_AWS)
      meaning="AWS credentials, CLI tooling, or account identity are not ready." ;;
    S1_PREFLIGHT)
      meaning="AWS region, default VPC, quota, or Ubuntu AMI checks are not ready." ;;
    S2_DOMAIN)
      meaning="The long-lived domain, DNS authority, or irreversible Matrix server_name binding is not confirmed." ;;
    S3_PROVISION)
      meaning="AWS infrastructure provisioning, fixed public IP, security group, or DNS record setup is not complete." ;;
    S4_BOOTSTRAP_STACK)
      meaning="The EC2 instance exists, but cloud-init, Docker, Caddy/TLS, or message-server has not reached healthy state." ;;
    S5_INIT_TOKENS)
      meaning="The server is not yet returning fresh bootstrap credentials from /opt/p2p/bootstrap.json." ;;
    S6_WIRE_LOCAL)
      meaning="The cloud service is likely up, but local direxio-connect, service credentials, or MCP snippets are not wired." ;;
    S7_VERIFY_E2E)
      meaning="The deployed service failed one or more final automated health, Matrix, CORS, TURN, or API checks." ;;
    DONE)
      meaning="Automated S0-S7 checks are complete." ;;
    *)
      meaning="The deployment state is incomplete or unknown." ;;
  esac
  printf '%s\n' "$meaning"
}
143
+
144
# Succeed when the given phase is S3_PROVISION or any later phase
# (including DONE).
# Arguments: $1 - phase name
# Returns:   0 for S3..S7 and DONE, 1 for everything else
phase_at_or_after_s3() {
  local candidate
  for candidate in \
    S3_PROVISION S4_BOOTSTRAP_STACK S5_INIT_TOKENS \
    S6_WIRE_LOCAL S7_VERIFY_E2E DONE; do
    if [ "$1" = "$candidate" ]; then
      return 0
    fi
  done
  return 1
}
150
+
151
# Build a comma-separated, human-readable list of the resources recorded in
# state (via res_get) that are reported as billable by the status output.
# Outputs: e.g. "EC2 i-123, Elastic IP eipalloc-1"; an empty line when
#          nothing is recorded.
recorded_billable_resources() {
  local spec label value summary=""
  # Each entry is "<label>:<resource key>"; order defines the output order.
  for spec in \
    "EC2:instance_id" \
    "EBS root volume:root_volume_id" \
    "public IPv4:public_ip" \
    "Elastic IP:eip_id" \
    "Route53 hosted zone:route53_zone_id"; do
    label=${spec%%:*}
    value=$(res_get "${spec#*:}")
    [ -n "$value" ] || continue
    summary="${summary:+$summary, }$label $value"
  done
  printf '%s\n' "$summary"
}
177
+
178
# One-line statement about possible AWS charges for the status summary.
# Arguments: $1 - current phase
# Outputs:   the recorded resources when there are any; otherwise a caution
#            for phases from S3 onward, or a "nothing recorded" note.
status_billing_impact() {
  local phase=$1 recorded
  recorded=$(recorded_billable_resources)
  if [ -n "$recorded" ]; then
    printf '%s\n' "recorded AWS resources may keep billing: $recorded"
    return 0
  fi
  if phase_at_or_after_s3 "$phase"; then
    # Nothing recorded, but provisioning may have started before failing.
    printf '%s\n' "S3 or later may have created billable AWS resources; inspect AWS if state is incomplete"
  else
    printf '%s\n' "no EC2, public IPv4, or EBS resource is recorded yet"
  fi
}
189
+
190
# One-line advice on whether the same command can simply be rerun.
# Arguments: $1 - current phase
# Outputs:   "do not reset state ..." once resources are recorded or the
#            phase is S3 or later; otherwise "safe to rerun ...".
status_resume_safety() {
  local phase=$1 recorded
  recorded=$(recorded_billable_resources)
  if [ -z "$recorded" ] && ! phase_at_or_after_s3 "$phase"; then
    printf '%s\n' "safe to rerun the same command after the next action is complete"
    return 0
  fi
  printf '%s\n' "do not reset state; fix the issue and rerun with P2P_EXISTING_STATE_ACTION=continue"
}
199
+
200
# Succeed when the state field agent_install_status equals "refresh_pending".
local_refresh_pending() {
  local install_status
  install_status=$(state_get agent_install_status)
  case "$install_status" in
    refresh_pending) return 0 ;;
    *) return 1 ;;
  esac
}
203
+
204
# Print the explanation shown as "Local refresh" in the recovery summary.
# Outputs: one line when a local refresh is pending, nothing otherwise.
# Returns: 0 in both cases.
status_local_refresh() {
  local_refresh_pending || return 0
  printf '%s\n' "update/reset cleared old credentials, user confirmations, runtime checks, and bridge install proof"
}
209
+
210
# Suggest the operator's next step for the current phase.
# When a local refresh is pending and the phase is S4 or later, the advice
# is always to rerun the workflow; otherwise it is phase specific.
# Arguments: $1 - current phase
# Outputs:   one line of advice on stdout
status_next_action() {
  local phase=$1 advice
  if local_refresh_pending; then
    case "$phase" in
      S4_BOOTSTRAP_STACK | S5_INIT_TOKENS | S6_WIRE_LOCAL | S7_VERIFY_E2E | DONE)
        printf '%s\n' "rerun the deployment workflow to refresh S4-S7, local credentials, MCP snippets, and runtime checks"
        return 0
        ;;
    esac
  fi

  case "$phase" in
    S0_PREREQ_AWS)
      advice="configure AWS CLI credentials for the selected deployment identity and rerun status" ;;
    S1_PREFLIGHT)
      advice="fix AWS region, default VPC, EC2 quota, or AMI availability before creating resources" ;;
    S2_DOMAIN)
      advice="confirm the long-lived domain, DNS authority, and irreversible Matrix server_name binding" ;;
    S3_PROVISION)
      advice="inspect EC2 provisioning, Elastic IP allocation, security group creation, and DNS record setup" ;;
    S4_BOOTSTRAP_STACK)
      advice="inspect cloud-init, Docker, Caddy/TLS, and message-server logs over SSH" ;;
    S5_INIT_TOKENS)
      advice="inspect /opt/p2p/bootstrap.json, init-tokens.sh, and message-server bootstrap logs" ;;
    S6_WIRE_LOCAL)
      advice="refresh local credentials, direxio-connect config, MCP snippets, and agent runtime settings without destroying cloud resources" ;;
    S7_VERIFY_E2E)
      advice="inspect the failed health, Matrix, well-known, owner.json/CORS, TURN, MCP, or runtime gate before declaring delivery" ;;
    DONE)
      advice="give the user the App domain and eight-digit initialization code, then record App initialization and agent/MCP confirmation separately" ;;
    *)
      advice="inspect state.json and the current phase evidence before taking action" ;;
  esac
  printf '%s\n' "$advice"
}
233
+
234
# Print how to stop spending: either a note that nothing recorded needs
# destroying, or the destroy command for this deployment.
# A PowerShell command is shown when DIREXIO_LOCAL_PATH_STYLE=windows or
# DIREXIO_WINDOWS_HOME is non-empty; otherwise the bash destroy.sh command.
status_stop_loss() {
  local domain recorded
  domain=$(state_get domain)
  recorded=$(recorded_billable_resources)
  if [ -z "$recorded" ]; then
    printf '%s\n' "no recorded cloud resources need destroy from this state"
    return 0
  fi

  printf '%s\n' "ask the agent to run destroy, or run:"
  if [ "${DIREXIO_LOCAL_PATH_STYLE:-}" = "windows" ] || [ -n "${DIREXIO_WINDOWS_HOME:-}" ]; then
    printf '%s\n' " \$env:DOMAIN = \"${domain:-__DOMAIN__}\"; .\\scripts\\destroy.ps1"
  else
    printf '%s\n' " DOMAIN=${domain:-__DOMAIN__} bash $HERE/destroy.sh"
  fi
  printf '%s\n' " Purchased domains, third-party DNS records, and retained hosted zones are not automatically removed."
}
250
+
251
# Print the "Recovery summary" section of the status output: where the run
# is blocked, billing impact, resume safety, optional local-refresh note,
# next action and stop-loss instructions.
# Arguments: $1 - current phase
# Returns:   the status of status_stop_loss (the last command run)
print_recovery_summary() {
  local phase=$1 phase_state refresh_note
  phase_state=$(phase_status "$phase")
  refresh_note=$(status_local_refresh)

  printf '%s\n' "-- Recovery summary --"
  printf '%s\n' "Where it is blocked: $phase (${phase_state:-unknown}) - $(phase_user_meaning "$phase")"
  printf '%s\n' "Billing impact: $(status_billing_impact "$phase")"
  printf '%s\n' "Resume safety: $(status_resume_safety "$phase")"
  if [ -n "$refresh_note" ]; then
    printf '%s\n' "Local refresh: $refresh_note"
  fi
  printf '%s\n' "Next action: $(status_next_action "$phase")"
  # No newline here: status_stop_loss continues on the same line.
  printf "Stop-loss: "
  status_stop_loss
}
264
+
265
# `status` command: print the recorded deployment state.
# Without a state.json it either lists all local services (when neither
# DOMAIN nor an explicit work directory was given) or warns that the
# selected state file is missing; both cases return 0.
# With a state.json it prints the header fields, per-phase status, recorded
# resources and the recovery summary.
# Globals: STATE_JSON, DOMAIN, P2P_WORKDIR_WAS_SET, DIREXIO_WORKDIR_WAS_SET,
#          PHASES (all read)
cmd_status() {
  local current p

  if [ ! -f "$STATE_JSON" ]; then
    if [ -z "${DOMAIN:-}" ] && [ -z "$P2P_WORKDIR_WAS_SET" ] && [ -z "$DIREXIO_WORKDIR_WAS_SET" ]; then
      cmd_status_inventory
      return 0
    fi
    warn "state.json not found: $STATE_JSON"
    warn "Set DOMAIN=<service domain> or explicit P2P_WORKDIR=<service dir> to inspect a specific deployment."
    return 0
  fi

  echo "run_id : $(state_get run_id)"
  echo "region : $(state_get region)"
  echo "domain_mode: $(state_get domain_mode)"
  echo "domain : $(state_get domain)"
  echo "instance : $(state_get instance_type)"
  echo "dns_ready : $(state_get dns_ready)"
  # Evaluate the state machine once so the "current" line and the recovery
  # summary below are guaranteed to describe the same phase.
  current=$(first_unfinished_phase)
  echo "current : $current"

  echo "-- phases --"
  for p in "${PHASES[@]}"; do
    printf " %-20s %s\n" "$p" "$(phase_status "$p")"
  done

  echo "-- resources --"
  jq -r '.resources | to_entries[]? | " \(.key)=\(.value)"' "$STATE_JSON"
  print_recovery_summary "$current"
}
293
+
294
# Delivery summary printed once all automated phases are complete.
# Reads the deployment facts from state.json (state_get / res_get / jq),
# prints them, and attempts to write the new_deploy operation report.
# Globals: STATE_JSON (read)
# Returns: 1 without printing the summary when the stored password field is
#          not exactly eight digits; 0 otherwise.
print_delivery() {
  local domain password keyfile pubip iid region envfile agent_room_id runtime
  local install_policy install_mode install_status install_command
  local agent_node_id agent_service_id agent_service_dir agent_cred
  local cc_config cc_binary cc_agent cc_user cc_pkg
  local report_path runtime_summary app_gate real_chat_gate agent_runtime_gate

  domain=$(state_get domain)
  password=$(state_get password)
  # Match the whole value, not individual lines: a multi-line value that
  # merely contains an eight-digit line must not be reported as the code.
  case "$password" in
    [0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]) ;;
    *)
      warn "state password field is not an exact eight-digit initialization code; rerun S5_INIT_TOKENS before reporting it."
      return 1
      ;;
  esac

  keyfile=$(res_get key_file)
  pubip=$(res_get public_ip)
  iid=$(res_get instance_id)
  region=$(state_get region)
  envfile=$(state_get agent_env_file)
  agent_node_id=$(state_get agent_node_id)
  agent_service_id=$(state_get agent_service_id)
  agent_service_dir=$(state_get agent_service_dir)
  agent_cred=$(state_get agent_credentials_file)
  agent_room_id=$(state_get agent_room_id)
  runtime=$(state_get agent_runtime)
  cc_config=$(state_get cc_connect_config)
  cc_binary=$(state_get cc_connect_binary)
  cc_agent=$(state_get cc_connect_agent)
  cc_user=$(state_get cc_connect_matrix_user)
  cc_pkg=$(state_get cc_connect_npm_package)
  install_policy=$(state_get agent_install_policy)
  install_mode=$(state_get agent_install_mode)
  install_status=$(state_get agent_install_status)
  install_command=$(state_get agent_install_command)
  runtime_summary=$(jq -r '.runtime_checks.summary.status // "not_run"' "$STATE_JSON")
  app_gate=$(jq -r '.user_confirmations.app_initialization.status // "pending_user_confirmation"' "$STATE_JSON")
  real_chat_gate=$(jq -r '.user_confirmations.real_chat.status // "pending_user_confirmation"' "$STATE_JSON")
  agent_runtime_gate=$(jq -r '.user_confirmations.agent_mcp_runtime.status // "pending_runtime_confirmation"' "$STATE_JSON")

  printf '\n'
  # Green banner.
  printf '\033[32m%s\033[0m\n' "========== Automated Deployment Gates Passed =========="
  echo " App domain : $domain"
  echo " init code : $password <- enter in the App initialization flow"
  echo " status : server automation is green; product completion waits for user/runtime confirmation"
  echo " user gates : app_initialization=$app_gate real_chat=$real_chat_gate agent_mcp_runtime=$agent_runtime_gate"
  echo " runtime check: ${runtime_summary:-not_run}"
  echo " agent node : ${agent_node_id:-default}"
  echo " service id : ${agent_service_id:-not recorded}"
  echo " service dir : ${agent_service_dir:-not recorded}"
  echo " credentials : init code/password field, access_token, and agent_token written to ${agent_cred:-~/.direxio/nodes/<service_id>/credentials.json}"
  echo " agent room : ${agent_room_id:-written to credentials.json}"
  # NOTE(review): the default package name is spelled "direxio-connent" while
  # the command is "direxio-connect" - confirm the real npm package name.
  echo " cc-connect : package=${cc_pkg:-direxio-connent@latest} config=${cc_config:-not recorded} command=${cc_binary:-direxio-connect}"
  echo " matrix user : ${cc_user:-created during S6}"
  echo " agent runtime: ${runtime:-unknown}"
  echo " install mode : policy=${install_policy:-recommend} mode=${install_mode:-cc-connect} agent=${cc_agent:-codex} status=${install_status:-recommend}"
  if [ -n "$install_command" ]; then
    echo " install cmd : $install_command"
  fi
  echo " daemon : ${cc_binary:-direxio-connect} daemon status --service-name ${agent_service_id:-cc-connect}"
  echo " env vars : DIREXIO_DOMAIN, DIREXIO_AGENT_TOKEN, DIREXIO_AGENT_ROOM_ID persisted${envfile:+ via $envfile}"
  echo " AWS region : $region"
  echo " EC2 : $iid ($pubip)"
  echo " SSH : ssh -i $keyfile ubuntu@$pubip"
  echo " state.json : $STATE_JSON"
  echo " stop billing : ask the agent to destroy this node when finished"
  echo " Note : EC2/public IPv4/EBS resources keep billing until destroy is run."
  echo " security : delete/disable temporary IAM keys after deployment; rotate/remove root keys if used."
  echo " Product gate : S7 is green; final product completion still needs App initialization and agent/MCP runtime confirmation."

  # The report is best-effort: a failure only changes the final line.
  if report_path=$(operation_report_write new_deploy automated_gates_complete_user_confirmation_pending "$STATE_JSON" 2>/dev/null); then
    echo " report : $report_path"
  else
    echo " report : not written; run bash $0 report new_deploy"
  fi
}
360
+
361
# Make sure an AWS region is chosen and exported as AWS_DEFAULT_REGION.
# Lookup order: region stored in state, AWS_DEFAULT_REGION, AWS_REGION,
# aws_configured_region, then an interactive prompt when stdin is a terminal
# (pressing Enter at the prompt selects us-east-1).
# A newly determined region is persisted with state_set.
# Returns: 0 on success, 2 when no region could be determined.
ensure_region_selected() {
  local chosen
  chosen=$(state_get region)
  if [ -n "$chosen" ]; then
    # Already recorded for this deployment; just export it.
    export AWS_DEFAULT_REGION="$chosen"
    return 0
  fi

  chosen=${AWS_DEFAULT_REGION:-${AWS_REGION:-}}
  if [ -z "$chosen" ]; then
    chosen=$(aws_configured_region)
  fi
  if [ -z "$chosen" ] && [ -t 0 ]; then
    warn "Choose an AWS region. Region affects latency, price, default VPC, and EC2 quota."
    printf "AWS region [us-east-1]: " >&2
    read -r chosen
    chosen=${chosen:-us-east-1}
  fi
  if [ -z "$chosen" ]; then
    warn "Confirm AWS region first. This script will not silently default to us-east-1."
    warn "Set it with aws configure or AWS_DEFAULT_REGION."
    warn "Example: AWS_DEFAULT_REGION=ap-southeast-1 bash $0"
    warn "Or: AWS_DEFAULT_REGION=us-east-1 bash $0"
    return 2
  fi

  state_set region "$chosen"
  export AWS_DEFAULT_REGION="$chosen"
  return 0
}
385
+
386
# Run pricing-estimate.sh with --write-state and log a one-line summary of
# its JSON output. A failure is reported through warn only; nothing here
# aborts the caller.
# Globals: STATE_JSON, HERE, INSTANCE_TYPE (optional override) - all read
ensure_cost_estimate() {
  local output status total region instance_type args
  args=(--state "$STATE_JSON" --write-state)
  if [ -n "${INSTANCE_TYPE:-}" ]; then
    args+=(--instance-type "$INSTANCE_TYPE")
  fi

  if ! output=$(bash "$HERE/pricing-estimate.sh" "${args[@]}" 2>/dev/null); then
    warn "Could not write AWS cost estimate. Continue only after giving the user a manual billing estimate."
    return
  fi

  # Each field falls back to "unknown" when absent from the estimator output.
  status=$(printf '%s\n' "$output" | jq -r '.pricing_status // "unknown"' 2>/dev/null)
  total=$(printf '%s\n' "$output" | jq -r '.total_monthly_usd // "unknown"' 2>/dev/null)
  region=$(printf '%s\n' "$output" | jq -r '.region // "unknown"' 2>/dev/null)
  instance_type=$(printf '%s\n' "$output" | jq -r '.components.ec2_instance.instance_type // "unknown"' 2>/dev/null)
  log "Cost estimate recorded (status=${status:-unknown}, region=${region:-unknown}, instance=${instance_type:-unknown}, monthly_usd≈${total:-unknown})."
  if [ "$status" = "fallback" ]; then
    warn "AWS Pricing API was unavailable or incomplete; cost_estimate uses conservative fallback values."
  fi
}
406
+
407
# Early gate for a brand-new deployment (no state.json yet): refuse to start
# unless DOMAIN is a valid production domain and CONFIRM_DOMAIN_BINDING=1.
# When state.json already exists this check is skipped.
# Globals: DOMAIN, DOMAIN_MODE, CONFIRM_DOMAIN_BINDING, STATE_JSON (read)
# Returns: 0 to proceed, 2 when the user has to supply or confirm something.
precheck_new_deploy_domain_env() {
  local candidate
  candidate=$(domain_normalize "${DOMAIN:-}")
  if [ -f "$STATE_JSON" ]; then
    return 0
  fi

  if [ "${DOMAIN_MODE:-}" = "ec2" ]; then
    warn "Deployment blocked: DOMAIN_MODE=ec2 temporary-domain mode has been removed."
    warn "Prepare a production domain and use DOMAIN=__DOMAIN__ DOMAIN_MODE=user CONFIRM_DOMAIN_BINDING=1."
    return 2
  fi

  if [ -z "$candidate" ]; then
    warn "Deployment blocked: DOMAIN is missing. P2P-IM requires a confirmed production Matrix server_name."
    warn "Use this skill to prepare domain/DNS, then rerun:"
    warn " DOMAIN=__DOMAIN__ DOMAIN_MODE=user CONFIRM_DOMAIN_BINDING=1 bash $0"
    return 2
  fi

  if ! domain_is_formal_name "$candidate"; then
    warn "Deployment blocked: DOMAIN=$candidate is not a valid production domain."
    warn "Use a long-lived domain you own and can manage in DNS, such as __DOMAIN__. IPs, localhost, wildcards, and temporary resolver domains are not accepted."
    return 2
  fi

  if [ "${CONFIRM_DOMAIN_BINDING:-0}" = "1" ]; then
    return 0
  fi
  warn "Deployment blocked: Matrix server_name domain binding has not been confirmed."
  warn "Rerun after confirmation:"
  warn " DOMAIN=$candidate DOMAIN_MODE=${DOMAIN_MODE:-user} CONFIRM_DOMAIN_BINDING=1 bash $0"
  return 2
}
435
+
436
# Validate the domain for this run against what state.json already records.
# Blocks when DOMAIN or DOMAIN_MODE from the environment disagree with the
# recorded values, when the mode is the removed "ec2" mode, when no valid
# production domain is available, or when the server_name binding is neither
# recorded as confirmed nor confirmed via CONFIRM_DOMAIN_BINDING=1.
# Globals: DOMAIN, DOMAIN_MODE, CONFIRM_DOMAIN_BINDING, STATE_JSON (read)
# Returns: 0 to proceed, 2 when blocked.
ensure_production_domain_selected() {
  local recorded_domain recorded_mode requested_domain
  local effective_domain effective_mode binding_confirmed
  recorded_domain=$(state_get domain)
  recorded_domain=$(domain_normalize "$recorded_domain")
  recorded_mode=$(state_get domain_mode)
  requested_domain=$(domain_normalize "${DOMAIN:-}")
  binding_confirmed=$(jq -r '.domain_confirmed_irreversible // false' "$STATE_JSON")

  # A state directory is tied to one domain and one mode.
  if [ -n "$requested_domain" ] && [ -n "$recorded_domain" ] \
    && [ "$requested_domain" != "$recorded_domain" ]; then
    warn "Deployment blocked: current state is bound to DOMAIN=$recorded_domain, but this run passed DOMAIN=${requested_domain}."
    warn "Do not switch Matrix server_name inside the same service state. Continue with the old domain, destroy and rebuild, or use a different DOMAIN/service directory."
    return 2
  fi
  if [ -n "${DOMAIN_MODE:-}" ] && [ -n "$recorded_mode" ] \
    && [ "$DOMAIN_MODE" != "$recorded_mode" ]; then
    warn "Deployment blocked: current state is bound to DOMAIN_MODE=$recorded_mode, but this run passed DOMAIN_MODE=${DOMAIN_MODE}."
    warn "Continue with the old mode, destroy and rebuild, or use a different DOMAIN/service directory."
    return 2
  fi

  # Environment values take precedence over the recorded ones.
  effective_domain=${requested_domain:-$recorded_domain}
  effective_mode=${DOMAIN_MODE:-$recorded_mode}

  if [ "$effective_mode" = "ec2" ]; then
    warn "Deployment blocked: DOMAIN_MODE=ec2 temporary-domain mode has been removed."
    warn "Prepare a production domain and use DOMAIN=__DOMAIN__ DOMAIN_MODE=user CONFIRM_DOMAIN_BINDING=1."
    return 2
  fi
  if [ -z "$effective_domain" ]; then
    warn "Deployment blocked: DOMAIN is missing. P2P-IM requires a confirmed production Matrix server_name."
    warn "Use this skill to prepare domain/DNS, then rerun:"
    warn " DOMAIN=__DOMAIN__ DOMAIN_MODE=user CONFIRM_DOMAIN_BINDING=1 bash $0"
    return 2
  fi
  if ! domain_is_formal_name "$effective_domain"; then
    warn "Deployment blocked: DOMAIN=$effective_domain is not a valid production domain."
    warn "Use a long-lived domain you own and can manage in DNS, such as __DOMAIN__. IPs, localhost, wildcards, and temporary resolver domains are not accepted."
    return 2
  fi

  if [ "$binding_confirmed" = "true" ] || [ "${CONFIRM_DOMAIN_BINDING:-0}" = "1" ]; then
    return 0
  fi
  warn "Deployment blocked: Matrix server_name domain binding has not been confirmed."
  warn "After $effective_domain becomes server_name, changing the domain is effectively a new homeserver identity."
  warn "Rerun after confirmation:"
  warn " DOMAIN=$effective_domain DOMAIN_MODE=${effective_mode:-user} CONFIRM_DOMAIN_BINDING=1 bash $0"
  return 2
}
483
+
484
# Require an explicit decision before reusing a state.json that already
# records AWS resources.
# The decision comes from P2P_EXISTING_STATE_ACTION, or from a prompt when
# stdin is a terminal (default at the prompt: abort).
#   continue - mark the state as confirmed and resume
#   destroy  - run destroy.sh on the state file, then start over
#   abort    - stop with instructions
# Returns: 0 to proceed, 1 when destroy.sh fails, 2 when blocked.
guard_existing_state() {
  if [ ! -f "$STATE_JSON" ]; then
    return 0
  fi

  local resource_total already_confirmed choice
  resource_total=$(jq -r '.resources | length' "$STATE_JSON")
  if [ "$resource_total" -eq 0 ]; then
    return 0
  fi

  if [ "$(jq -r '.domain_mode // empty' "$STATE_JSON")" = "ec2" ]; then
    warn "Found legacy temporary-domain deployment state (domain_mode=ec2). Production deployment no longer supports resuming this mode."
    warn "Destroy and rebuild, or use a new service directory:"
    warn " P2P_EXISTING_STATE_ACTION=destroy bash $0"
    warn " DOMAIN=__DOMAIN__ DOMAIN_MODE=user CONFIRM_DOMAIN_BINDING=1 bash $0"
    return 2
  fi

  # A previous run already accepted this state; do not ask again.
  already_confirmed=$(jq -r '.existing_state_confirmed // false' "$STATE_JSON")
  if [ "$already_confirmed" = "true" ]; then
    return 0
  fi

  choice=${P2P_EXISTING_STATE_ACTION:-}
  if [ -z "$choice" ] && [ -t 0 ]; then
    warn "Found existing deployment state with recorded AWS resources:"
    jq -r '.resources | to_entries[]? | " \(.key)=\(.value)"' "$STATE_JSON" >&2
    warn "Choose: continue=resume / destroy=destroy and rebuild / abort=stop now"
    printf "Action [abort]: " >&2
    read -r choice
    choice=${choice:-abort}
  fi

  case "$choice" in
    continue)
      state_set_raw existing_state_confirmed 'true'
      warn "Continuing with existing state and resources."
      return 0
      ;;
    destroy)
      warn "Destroying AWS resources recorded in state.json, then starting over."
      if ! bash "$HERE/destroy.sh" "$STATE_JSON"; then
        return 1
      fi
      return 0
      ;;
    "" | abort)
      warn "Existing service state must be handled explicitly to avoid accidental reuse or duplicate EC2 creation."
      warn "Resume: P2P_EXISTING_STATE_ACTION=continue bash $0"
      warn "Rebuild: P2P_EXISTING_STATE_ACTION=destroy bash $0"
      warn "New service: DOMAIN=__DOMAIN__ DOMAIN_MODE=user CONFIRM_DOMAIN_BINDING=1 bash $0"
      return 2
      ;;
    *)
      warn "Unknown P2P_EXISTING_STATE_ACTION=$choice (expected continue|destroy|abort)."
      return 2
      ;;
  esac
}
529
+
530
# Main entry point: run the pre-flight gates, then execute phases starting
# at the first unfinished one until everything is done or a phase stops.
# Returns: 0 when all phases are complete, 2 when a gate or phase is waiting
#          for user action, 1 when a phase failed. Gate failures propagate
#          the gate's own return code.
cmd_run() {
  precheck_new_deploy_domain_env || return $?
  check_deps
  guard_existing_state || return $?
  state_ensure
  ensure_production_domain_selected || return $?
  ensure_region_selected || return $?
  ensure_cost_estimate
  log "State machine started. state.json = $STATE_JSON"

  local cur rc
  while :; do
    cur=$(first_unfinished_phase)
    if [ "$cur" = "DONE" ]; then
      ok "All phases completed."
      # NOTE(review): print_delivery's status is not propagated, so this
      # returns 0 even when the delivery summary refuses to print.
      print_delivery
      return 0
    fi
    log "Entering phase $cur (current status=$(phase_status "$cur"))"

    rc=0
    run_one_phase "$cur" || rc=$?

    if [ "$rc" -eq 0 ]; then
      ok "Phase $cur completed."
    elif [ "$rc" -eq 2 ]; then
      warn "Phase $cur is waiting for user action (credentials/quota/confirmation). Resolve it and rerun this script to resume."
      return 2
    else
      warn "Phase $cur failed (rc=$rc). Fix it and rerun to resume, or ask the agent to destroy this node to remove resources."
      return 1
    fi
  done
}
560
+
561
# `report` command: write an operation report for the current state.json.
# Arguments: $1 - operation (new_deploy | repair_or_verify | update |
#                 reset_app_data | destroy); defaults to new_deploy
# Outputs:   "operation report: <path>" on success
# Returns:   0 on success; 1 when state.json is missing, the operation is
#            unknown, or the report could not be written.
cmd_report() {
  local operation=${1:-new_deploy} status report_path
  if [ ! -f "$STATE_JSON" ]; then
    warn "state.json not found: $STATE_JSON"
    return 1
  fi

  case "$operation" in
    new_deploy) status=automated_gates_complete_user_confirmation_pending ;;
    repair_or_verify) status=verification_report ;;
    update) status=update_report ;;
    reset_app_data) status=reset_app_data_report ;;
    destroy) status=destroy_processed ;;
    *)
      echo "Usage: $0 report [new_deploy|repair_or_verify|update|reset_app_data|destroy]" >&2
      return 1
      ;;
  esac

  # Do not claim a report exists when the writer failed or printed no path.
  # print_delivery relies on the same contract (non-zero status on failure,
  # report path on stdout).
  if ! report_path=$(operation_report_write "$operation" "$status" "$STATE_JSON") \
    || [ -z "$report_path" ]; then
    warn "operation report could not be written for operation: $operation"
    return 1
  fi
  echo "operation report: $report_path"
}
581
+
582
# `confirm` command: record a user/runtime confirmation gate in state.json.
# Arguments: $1 - gate: app_initialization | real_chat | agent_mcp_runtime
# Globals:   DIREXIO_CONFIRM_EVIDENCE      - required note, at least 12 chars
#            DIREXIO_CONFIRM_RUNTIME_PROBE - must be 1 for agent_mcp_runtime
#            STATE_JSON                    - state file that is updated
# The agent_mcp_runtime gate additionally requires
# runtime_checks.summary.status to be "passed".
# Outputs:   "confirmed gate: <gate>" once the state write succeeded
# Returns:   0 on success, 1 on any validation or write failure.
cmd_confirm() {
  local gate=${1:-} evidence=${DIREXIO_CONFIRM_EVIDENCE:-}
  local runtime_summary_status runtime_probe_confirmed
  if [ ! -f "$STATE_JSON" ]; then
    warn "state.json not found: $STATE_JSON"
    return 1
  fi

  case "$gate" in
    app_initialization | real_chat | agent_mcp_runtime) ;;
    *)
      echo "Usage: $0 confirm [app_initialization|real_chat|agent_mcp_runtime]" >&2
      return 1
      ;;
  esac

  if [ -z "$evidence" ]; then
    warn "confirm $gate requires DIREXIO_CONFIRM_EVIDENCE with a concrete user/runtime evidence note."
    return 1
  fi
  if [ "${#evidence}" -lt 12 ]; then
    warn "DIREXIO_CONFIRM_EVIDENCE is too short; provide a concrete user/runtime evidence note."
    return 1
  fi

  runtime_summary_status=$(jq -r '.runtime_checks.summary.status // "not_run"' "$STATE_JSON")
  runtime_probe_confirmed=false
  if [ "$gate" = "agent_mcp_runtime" ]; then
    if [ "$runtime_summary_status" != "passed" ]; then
      warn "agent_mcp_runtime confirmation requires runtime_checks.summary.status=passed. Run: DOMAIN=<DOMAIN> bash $0 verify runtime"
      return 1
    fi
    if [ "${DIREXIO_CONFIRM_RUNTIME_PROBE:-0}" != "1" ]; then
      warn "agent_mcp_runtime confirmation requires DIREXIO_CONFIRM_RUNTIME_PROBE=1 after the selected runtime/channel probe is actually confirmed."
      return 1
    fi
    runtime_probe_confirmed=true
  fi

  # The runtime fields are only attached to the agent_mcp_runtime gate.
  # Report success only when the state update actually went through.
  # NOTE(review): assumes _state_write returns non-zero on failure - confirm
  # against lib/state.sh.
  if ! _state_write '
      .user_confirmations[$gate] = {
        status: "confirmed",
        ts: $ts,
        evidence: $evidence
      }
      + (if $gate == "agent_mcp_runtime" then {
        runtime_summary_status: $runtime_summary_status,
        runtime_probe_confirmed: ($runtime_probe_confirmed == "true")
      } else {} end)
    ' --arg gate "$gate" \
    --arg ts "$(_now)" \
    --arg evidence "$evidence" \
    --arg runtime_summary_status "$runtime_summary_status" \
    --arg runtime_probe_confirmed "$runtime_probe_confirmed"; then
    warn "could not record confirmation for gate $gate in $STATE_JSON"
    return 1
  fi
  echo "confirmed gate: $gate"
}
634
+
635
# Run "<mcp_command> doctor --json" and record the outcome under
# .runtime_checks.mcp_doctor in state.json.
#
# Reads from state.json: agent_credentials_file (or mcp_credentials_file),
# mcp_command (default "direxio-mcp"), agent_node_id.
# Returns:
#   0 when the doctor command succeeds and prints a single JSON object;
#   1 when state.json or the credentials path is missing, the command fails,
#   or its output is not a JSON object.
cmd_verify_mcp_doctor() {
  [ -f "$STATE_JSON" ] || {
    warn "state.json not found: $STATE_JSON"
    return 1
  }

  local credentials mcp_cmd node_id out err report token_status
  credentials=$(jq -r '.agent_credentials_file // .mcp_credentials_file // empty' "$STATE_JSON")
  mcp_cmd=$(jq -r '.mcp_command // "direxio-mcp"' "$STATE_JSON")
  node_id=$(jq -r '.agent_node_id // empty' "$STATE_JSON")
  [ -n "$credentials" ] || {
    warn "mcp doctor check requires agent_credentials_file or mcp_credentials_file in state.json"
    return 1
  }
  [ -n "$mcp_cmd" ] || mcp_cmd=direxio-mcp

  out=$(mktemp)
  err=$(mktemp)
  # NOTE(review): mcp_command is spliced into a shell command line, so it may
  # carry arguments; state.json must therefore be trusted input.
  if ! DIREXIO_CREDENTIALS_FILE="$credentials" DIREXIO_AGENT_NODE_ID="$node_id" bash -c "$mcp_cmd doctor --json" > "$out" 2> "$err"; then
    _state_write '
      .runtime_checks.mcp_doctor = {
        status: "failed",
        ts: $ts,
        evidence: "direxio-mcp doctor failed"
      }
    ' --arg ts "$(_now)"
    cat "$err" >&2
    rm -f "$out" "$err"
    return 1
  fi
  # Require exactly one JSON object. "jq empty" also succeeds on empty output,
  # which would then break --argjson below; slurping makes empty input an
  # empty array, so it is rejected here instead.
  if ! jq -e -s 'length == 1 and (.[0] | type == "object")' "$out" >/dev/null 2>&1; then
    _state_write '
      .runtime_checks.mcp_doctor = {
        status: "failed",
        ts: $ts,
        evidence: "direxio-mcp doctor returned non-json output"
      }
    ' --arg ts "$(_now)"
    rm -f "$out" "$err"
    return 1
  fi
  report=$(cat "$out")
  # Only a classification of the token field is stored, never its value.
  token_status=$(printf '%s\n' "$report" | jq -r '
    if (.token // "") == "redacted" then "redacted"
    elif ((.token // "") | tostring | length) > 0 then "present_redacted"
    else "missing"
    end
  ')
  _state_write '
    .runtime_checks.mcp_doctor = {
      status: "passed",
      ts: $ts,
      evidence: "direxio-mcp doctor --json succeeded",
      domain: ($report.domain // ""),
      agent_room_id: ($report.agent_room_id // ""),
      token: $token_status
    }
  ' --arg ts "$(_now)" --argjson report "$report" --arg token_status "$token_status"
  rm -f "$out" "$err"
  echo "verified runtime check: mcp_doctor"
}
696
+
697
# Send one read-only "mcp.messages.list" query to <service>/_p2p/query and
# record the outcome under .runtime_checks.mcp_smoke in state.json.
#
# Reads from state.json: as_url (falling back to https://<domain>),
# agent_token, agent_room_id.
# Environment:
#   DIREXIO_SMOKE_CONNECT_TIMEOUT - curl connect timeout in seconds (default 10).
#   DIREXIO_SMOKE_MAX_TIME        - curl total time limit in seconds (default 30).
# Returns:
#   0 when the endpoint answers HTTP 200 with a messages array and a room_id
#   string; 1 otherwise.
cmd_verify_mcp_smoke() {
  [ -f "$STATE_JSON" ] || {
    warn "state.json not found: $STATE_JSON"
    return 1
  }

  local service_url token room_id body code payload tmp url
  service_url=$(jq -r '.as_url // empty' "$STATE_JSON")
  if [ -z "$service_url" ]; then
    local domain
    domain=$(jq -r '.domain // empty' "$STATE_JSON")
    [ -n "$domain" ] && service_url="https://$domain"
  fi
  token=$(jq -r '.agent_token // empty' "$STATE_JSON")
  room_id=$(jq -r '.agent_room_id // empty' "$STATE_JSON")
  if [ -z "$service_url" ] || [ -z "$token" ] || [ -z "$room_id" ]; then
    warn "mcp smoke check requires as_url/domain, agent_token, and agent_room_id in state.json"
    return 1
  fi

  body=$(mktemp)
  payload=$(jq -cn --arg room_id "$room_id" '{action:"mcp.messages.list", params:{room_id:$room_id, limit:1}}')
  url="${service_url%/}/_p2p/query"
  # Timeouts keep an unresponsive endpoint from hanging the check forever.
  # "|| true" lets a transport failure fall through to the failed-state
  # branch below instead of aborting when errexit is enabled.
  # NOTE(review): -k disables TLS verification while a bearer token is sent;
  # confirm this is still required for the deployed certificate setup.
  code=$(curl -sk -o "$body" -w '%{http_code}' \
    --connect-timeout "${DIREXIO_SMOKE_CONNECT_TIMEOUT:-10}" \
    --max-time "${DIREXIO_SMOKE_MAX_TIME:-30}" \
    -X POST "$url" \
    -H 'Content-Type: application/json' \
    -H "Authorization: Bearer $token" \
    -d "$payload" 2>/dev/null) || true
  [ -n "$code" ] || code=000
  if [ "$code" != "200" ] || ! jq -e '(.messages | type == "array") and (.room_id | type == "string")' "$body" >/dev/null 2>&1; then
    _state_write '
      .runtime_checks.mcp_smoke = {
        status: "failed",
        ts: $ts,
        action: "mcp.messages.list",
        evidence: $evidence
      }
    ' --arg ts "$(_now)" --arg evidence "mcp.messages.list returned HTTP $code or invalid response"
    rm -f "$body"
    return 1
  fi

  # Build the check record in a temp file so only a summary of the response
  # (room id and the type of .messages) is stored in state.json.
  tmp=$(mktemp)
  jq -n --slurpfile response "$body" \
    --arg ts "$(_now)" \
    --arg room_id "$room_id" \
    '{
      status: "passed",
      ts: $ts,
      action: "mcp.messages.list",
      room_id: $room_id,
      response_room_id: ($response[0].room_id // ""),
      response_messages_type: (($response[0].messages // null) | type),
      evidence: "read-only backend smoke check succeeded"
    }' > "$tmp"
  _state_write '.runtime_checks.mcp_smoke = $check[0]' --slurpfile check "$tmp"
  rm -f "$body" "$tmp"
  echo "verified runtime check: mcp_smoke"
}
755
+
756
# Run the mcp-tools-list.mjs helper through node and record the outcome under
# .runtime_checks.mcp_tools in state.json.
#
# Reads from state.json: agent_credentials_file (or mcp_credentials_file),
# mcp_command (default "direxio-mcp"), agent_node_id.
# Returns:
#   0 when the helper prints JSON with a tools array and a numeric tool_count;
#   1 when state.json, the credentials path, or a node runtime is missing,
#   the helper fails, or its output has the wrong shape.
cmd_verify_mcp_tools() {
  [ -f "$STATE_JSON" ] || {
    warn "state.json not found: $STATE_JSON"
    return 1
  }

  local credentials mcp_cmd node_id node_cmd node_script out err report
  credentials=$(jq -r '.agent_credentials_file // .mcp_credentials_file // empty' "$STATE_JSON")
  mcp_cmd=$(jq -r '.mcp_command // "direxio-mcp"' "$STATE_JSON")
  node_id=$(jq -r '.agent_node_id // empty' "$STATE_JSON")
  [ -n "$credentials" ] || {
    warn "mcp tools check requires agent_credentials_file or mcp_credentials_file in state.json"
    return 1
  }
  [ -n "$mcp_cmd" ] || mcp_cmd=direxio-mcp
  # Guard the lookup so a missing runtime is reported below instead of
  # aborting silently when errexit is enabled.
  node_cmd=$(_node_command) || node_cmd=""
  [ -n "$node_cmd" ] || {
    # Record the failure so an earlier "passed" result cannot linger.
    _state_write '
      .runtime_checks.mcp_tools = {
        status: "failed",
        ts: $ts,
        evidence: "node runtime not found"
      }
    ' --arg ts "$(_now)"
    warn "mcp tools check requires node or node.exe to run scripts/mcp-tools-list.mjs"
    return 1
  }
  node_script=$(_node_script_path "$node_cmd" "$HERE/mcp-tools-list.mjs")

  out=$(mktemp)
  err=$(mktemp)
  if ! DIREXIO_CREDENTIALS_FILE="$credentials" DIREXIO_AGENT_NODE_ID="$node_id" "$node_cmd" "$node_script" "$mcp_cmd" > "$out" 2> "$err"; then
    _state_write '
      .runtime_checks.mcp_tools = {
        status: "failed",
        ts: $ts,
        evidence: "MCP tools/list failed"
      }
    ' --arg ts "$(_now)"
    cat "$err" >&2
    rm -f "$out" "$err"
    return 1
  fi
  if ! jq -e '(.tools | type == "array") and (.tool_count | type == "number")' "$out" >/dev/null 2>&1; then
    _state_write '
      .runtime_checks.mcp_tools = {
        status: "failed",
        ts: $ts,
        evidence: "MCP tools/list returned invalid output"
      }
    ' --arg ts "$(_now)"
    rm -f "$out" "$err"
    return 1
  fi
  report=$(cat "$out")
  _state_write '
    .runtime_checks.mcp_tools = {
      status: "passed",
      ts: $ts,
      evidence: "MCP tools/list succeeded",
      tool_count: ($report.tool_count // 0),
      tools: ($report.tools // [])
    }
  ' --arg ts "$(_now)" --argjson report "$report"
  rm -f "$out" "$err"
  echo "verified runtime check: mcp_tools"
}
816
+
817
# Print the resolved location of the first available Node.js launcher,
# preferring "node" over "node.exe".
# Returns 0 when one is found, 1 when neither is on PATH.
_node_command() {
  local launcher
  for launcher in node node.exe; do
    if command -v "$launcher" >/dev/null 2>&1; then
      command -v "$launcher"
      return 0
    fi
  done
  return 1
}
828
+
829
# Print the script path in the form the chosen node launcher expects.
#
# Arguments:
#   $1 - node launcher; a name ending in .exe/.EXE triggers conversion.
#   $2 - script path.
# For an .exe launcher the path is converted with "cygpath -w" when cygpath
# exists; otherwise /mnt/<d>/... and /<d>/... become <D>:\... by hand.
# Every other combination prints the path unchanged.
_node_script_path() {
  local node_cmd=$1 script=$2
  local letter tail

  # A non-.exe launcher takes the path as given.
  case "$node_cmd" in
    *.exe|*.EXE) ;;
    *)
      printf '%s\n' "$script"
      return 0
      ;;
  esac

  if command -v cygpath >/dev/null 2>&1; then
    cygpath -w "$script"
    return 0
  fi

  # Strip the mount prefix so "tail" begins with the drive letter.
  case "$script" in
    /mnt/[A-Za-z]/*) tail=${script#/mnt/} ;;
    /[A-Za-z]/*) tail=${script#/} ;;
    *)
      printf '%s\n' "$script"
      return 0
      ;;
  esac
  letter=${tail%%/*}
  tail=${tail#*/}
  printf '%s:\\%s\n' "$(printf '%s' "$letter" | tr '[:lower:]' '[:upper:]')" "$(printf '%s' "$tail" | sed 's#/#\\#g')"
  return 0
}
859
+
860
# Print the parent directory of a path without calling dirname(1).
#
# Arguments:
#   $1 - path using "/" or "\" separators.
# Outputs:
#   The text before the last separator (one trailing separator is ignored);
#   "." when the path has no separator; the leading separator itself when the
#   entry sits directly under the root, e.g. "/config.toml" gives "/".
path_dirname() {
  local path=$1 dir
  # Drop one trailing separator, but leave a bare root ("/") intact.
  case "$path" in
    ?*[/\\]) path=${path%?} ;;
  esac
  case "$path" in
    *[/\\]*) dir=${path%[/\\]*} ;;
    *)
      printf '.\n'
      return 0
      ;;
  esac
  # An empty result means the only separator was the leading one, so the
  # parent is the root; keep that first character.
  [ -n "$dir" ] || dir=${path%"${path#?}"}
  printf '%s\n' "$dir"
}
868
+
869
# Print a path in a comparable form: backslashes become "/", then either
# "cygpath -m" output (when cygpath exists and succeeds) or the path with
# trailing slashes removed. A lone "/" and a drive root such as "C:/" are
# left as they are.
normalize_check_path() {
  local candidate=$1
  candidate=$(printf '%s' "$candidate" | tr '\\' '/')

  if command -v cygpath >/dev/null 2>&1 && cygpath -m "$candidate" 2>/dev/null; then
    return 0
  fi

  # Peel trailing slashes one at a time, stopping at a root form.
  while [ "${#candidate}" -gt 1 ]; do
    case "$candidate" in
      [A-Za-z]:/) break ;;
      */) candidate=${candidate%/} ;;
      *) break ;;
    esac
  done
  printf '%s\n' "$candidate"
}
881
+
882
# Succeed when two paths name the same location after normalization.
#
# Arguments:
#   $1, $2 - paths to compare.
# Drive-letter paths ("X:/...") are compared case-insensitively; every other
# pair must match exactly.
paths_match_for_check() {
  local lhs rhs
  lhs=$(normalize_check_path "$1")
  rhs=$(normalize_check_path "$2")

  # Identical after normalization: nothing more to decide.
  [ "$lhs" != "$rhs" ] || return 0

  # Only a pair of drive-letter paths gets a case-insensitive comparison.
  case "$lhs" in
    [A-Za-z]:/*) ;;
    *) return 1 ;;
  esac
  case "$rhs" in
    [A-Za-z]:/*) ;;
    *) return 1 ;;
  esac
  lhs=$(printf '%s' "$lhs" | tr '[:upper:]' '[:lower:]')
  rhs=$(printf '%s' "$rhs" | tr '[:upper:]' '[:lower:]')
  [ "$lhs" = "$rhs" ]
}
895
+
896
# Print the first ACP session-initialization error marker found in the
# daemon's recent logs, or nothing when none is present.
#
# Arguments:
#   $1 - daemon binary; $2 - service name.
# Environment:
#   DIREXIO_CONNECT_LOG_TAIL_LINES - number of log lines to request (default 120).
# Always returns 0: a failing log command or an empty match is not an error.
connect_daemon_agent_error_from_logs() {
  local binary=$1 service_name=$2
  local tail_lines=${DIREXIO_CONNECT_LOG_TAIL_LINES:-120}
  local markers='ACP_SESSION_INIT_FAILED|ACP metadata is missing|Recreate this ACP session'

  {
    "$binary" daemon logs --service-name "$service_name" -n "$tail_lines" 2>/dev/null \
      | grep -Eio "$markers" \
      | head -n 1
  } || true
}
902
+
903
# Check that the direxio-connect daemon for this service is running from the
# expected work directory, and record the outcome under
# .runtime_checks.connect_daemon in state.json.
#
# Reads from state.json: agent_service_id (or domain), agent_service_dir,
# cc_connect_config, cc_connect_runtime_dir, cc_connect_binary.
# The expected work directory is, in order of preference: the directory of
# cc_connect_config, cc_connect_runtime_dir, or <agent_service_dir>/cc-connect.
# Returns:
#   0 when the daemon reports Status "Running", its WorkDir matches the
#   expected directory, and its logs carry no ACP session-init error marker;
#   1 otherwise.
cmd_verify_connect_daemon() {
  [ -f "$STATE_JSON" ] || {
    warn "state.json not found: $STATE_JSON"
    return 1
  }

  local service_name service_dir config runtime_dir binary target_work_dir
  local status_out daemon_status work_dir evidence agent_error
  local shown_work_dir shown_target_dir
  service_name=$(jq -r '.agent_service_id // .domain // empty' "$STATE_JSON")
  service_dir=$(jq -r '.agent_service_dir // empty' "$STATE_JSON")
  config=$(jq -r '.cc_connect_config // empty' "$STATE_JSON")
  runtime_dir=$(jq -r '.cc_connect_runtime_dir // empty' "$STATE_JSON")
  binary=$(jq -r '.cc_connect_binary // "direxio-connect"' "$STATE_JSON")
  [ -n "$service_name" ] || service_name=cc-connect
  [ -n "$binary" ] || binary=direxio-connect

  if [ -n "$config" ]; then
    target_work_dir=$(path_dirname "$config")
  elif [ -n "$runtime_dir" ]; then
    target_work_dir="$runtime_dir"
  elif [ -n "$service_dir" ]; then
    target_work_dir="$service_dir/cc-connect"
  else
    warn "connect daemon check requires cc_connect_config, cc_connect_runtime_dir, or agent_service_dir in state.json"
    return 1
  fi

  # A bare command name must resolve on PATH; explicit paths are left for the
  # status call below to validate.
  case "$binary" in
    */*|[A-Za-z]:/*|[A-Za-z]:\\*) ;;
    *)
      command -v "$binary" >/dev/null 2>&1 || {
        _state_write '
          .runtime_checks.connect_daemon = {
            status: "failed",
            ts: $ts,
            evidence: "direxio-connect binary not found"
          }
        ' --arg ts "$(_now)"
        warn "connect daemon check could not find binary: $binary"
        return 1
      }
      ;;
  esac

  status_out=$("$binary" daemon status --service-name "$service_name" 2>/dev/null) || {
    _state_write '
      .runtime_checks.connect_daemon = {
        status: "failed",
        ts: $ts,
        service_name: $service_name,
        evidence: "direxio-connect daemon status failed"
      }
    ' --arg ts "$(_now)" --arg service_name "$service_name"
    return 1
  }
  # Strip trailing whitespace from the parsed values; a carriage return left
  # by CRLF output would otherwise make "Running" and the WorkDir never match.
  daemon_status=$(printf '%s\n' "$status_out" \
    | sed -nE 's/^[[:space:]]*Status:[[:space:]]*//p' \
    | head -n 1 \
    | sed -E 's/[[:space:]]+$//')
  work_dir=$(printf '%s\n' "$status_out" \
    | sed -nE 's/^[[:space:]]*WorkDir:[[:space:]]*//p' \
    | head -n 1 \
    | sed -E 's/[[:space:]]+$//')
  # Normalized forms are what every state record below stores.
  shown_work_dir=$(normalize_check_path "$work_dir")
  shown_target_dir=$(normalize_check_path "$target_work_dir")

  if [ "$daemon_status" != "Running" ]; then
    evidence="direxio-connect daemon is not Running"
  elif [ -z "$work_dir" ]; then
    evidence="direxio-connect daemon status has no WorkDir"
  elif ! paths_match_for_check "$target_work_dir" "$work_dir"; then
    evidence="direxio-connect daemon belongs to a different service"
  else
    agent_error=$(connect_daemon_agent_error_from_logs "$binary" "$service_name")
    if [ -n "$agent_error" ]; then
      _state_write '
        .runtime_checks.connect_daemon = {
          status: "failed",
          ts: $ts,
          evidence: "direxio-connect daemon logs report ACP session initialization failure",
          service_name: $service_name,
          daemon_status: $daemon_status,
          work_dir: $work_dir,
          expected_work_dir: $target_work_dir,
          agent_error: $agent_error
        }
      ' --arg ts "$(_now)" \
        --arg service_name "$service_name" \
        --arg daemon_status "$daemon_status" \
        --arg work_dir "$shown_work_dir" \
        --arg target_work_dir "$shown_target_dir" \
        --arg agent_error "$agent_error"
      warn "direxio-connect daemon logs report ACP session initialization failure"
      return 1
    fi
    _state_write '
      .runtime_checks.connect_daemon = {
        status: "passed",
        ts: $ts,
        evidence: "direxio-connect daemon is running for this service",
        service_name: $service_name,
        daemon_status: $daemon_status,
        work_dir: $work_dir,
        expected_work_dir: $target_work_dir
      }
    ' --arg ts "$(_now)" \
      --arg service_name "$service_name" \
      --arg daemon_status "$daemon_status" \
      --arg work_dir "$shown_work_dir" \
      --arg target_work_dir "$shown_target_dir"
    echo "verified runtime check: connect_daemon"
    return 0
  fi

  # Shared failure record for the three status/WorkDir mismatches above.
  _state_write '
    .runtime_checks.connect_daemon = {
      status: "failed",
      ts: $ts,
      evidence: $evidence,
      service_name: $service_name,
      daemon_status: $daemon_status,
      work_dir: $work_dir,
      expected_work_dir: $target_work_dir
    }
  ' --arg ts "$(_now)" \
    --arg evidence "$evidence" \
    --arg service_name "$service_name" \
    --arg daemon_status "$daemon_status" \
    --arg work_dir "$shown_work_dir" \
    --arg target_work_dir "$shown_target_dir"
  warn "$evidence"
  return 1
}
1027
+
1028
# Print the stored status of one runtime check from state.json, or "not_run"
# when that check has no status recorded.
#
# Arguments:
#   $1 - check name (a key under .runtime_checks).
runtime_check_status() {
  local check_name=$1
  local query='.runtime_checks[$check].status // "not_run"'
  jq --raw-output --arg check "$check_name" "$query" "$STATE_JSON"
}
1032
+
1033
# Run all four runtime checks (connect_daemon, mcp_doctor, mcp_tools,
# mcp_smoke) and write an aggregate under .runtime_checks.summary.
#
# A check counts as passed only when it exited 0 in this run AND its stored
# status is "passed". Several checks return early without touching
# state.json (for example on missing prerequisites), so the stored status
# alone may be left over from an earlier run.
# Returns:
#   0 when every check passed; 1 otherwise.
cmd_verify_runtime() {
  [ -f "$STATE_JSON" ] || {
    warn "state.json not found: $STATE_JSON"
    return 1
  }

  local failed_count=0 status
  local connect_rc=0 doctor_rc=0 tools_rc=0 smoke_rc=0
  local connect_status doctor_status tools_status smoke_status

  cmd_verify_connect_daemon >/dev/null || connect_rc=$?
  cmd_verify_mcp_doctor >/dev/null || doctor_rc=$?
  cmd_verify_mcp_tools >/dev/null || tools_rc=$?
  cmd_verify_mcp_smoke >/dev/null || smoke_rc=$?

  connect_status=$(runtime_check_status connect_daemon)
  doctor_status=$(runtime_check_status mcp_doctor)
  tools_status=$(runtime_check_status mcp_tools)
  smoke_status=$(runtime_check_status mcp_smoke)

  # Downgrade a stale "passed" whenever the check itself failed just now.
  [ "$connect_rc" -eq 0 ] || [ "$connect_status" != "passed" ] || connect_status=failed
  [ "$doctor_rc" -eq 0 ] || [ "$doctor_status" != "passed" ] || doctor_status=failed
  [ "$tools_rc" -eq 0 ] || [ "$tools_status" != "passed" ] || tools_status=failed
  [ "$smoke_rc" -eq 0 ] || [ "$smoke_status" != "passed" ] || smoke_status=failed

  for status in "$connect_status" "$doctor_status" "$tools_status" "$smoke_status"; do
    [ "$status" = "passed" ] || failed_count=$((failed_count + 1))
  done

  if [ "$failed_count" -eq 0 ]; then
    _state_write '
      .runtime_checks.summary = {
        status: "passed",
        ts: $ts,
        failed_count: 0,
        evidence: "all runtime checks passed",
        checks: {
          connect_daemon: $connect_status,
          mcp_doctor: $doctor_status,
          mcp_tools: $tools_status,
          mcp_smoke: $smoke_status
        }
      }
    ' --arg ts "$(_now)" \
      --arg connect_status "$connect_status" \
      --arg doctor_status "$doctor_status" \
      --arg tools_status "$tools_status" \
      --arg smoke_status "$smoke_status"
    echo "verified runtime checks: passed"
    return 0
  fi

  _state_write '
    .runtime_checks.summary = {
      status: "failed",
      ts: $ts,
      failed_count: ($failed_count | tonumber),
      evidence: "one or more runtime checks failed",
      checks: {
        connect_daemon: $connect_status,
        mcp_doctor: $doctor_status,
        mcp_tools: $tools_status,
        mcp_smoke: $smoke_status
      }
    }
  ' --arg ts "$(_now)" \
    --arg failed_count "$failed_count" \
    --arg connect_status "$connect_status" \
    --arg doctor_status "$doctor_status" \
    --arg tools_status "$tools_status" \
    --arg smoke_status "$smoke_status"
  warn "runtime checks failed: $failed_count"
  # This path is only reached with at least one failed check, so the result
  # must be non-zero no matter what the individual exit codes were.
  return 1
}
1100
+
1101
# Dispatch "verify <check>" to the matching cmd_verify_<check> function.
#
# Arguments:
#   $1 - connect_daemon, mcp_doctor, mcp_smoke, mcp_tools, or runtime.
# Returns:
#   The selected check's status; 1 with a usage line on stderr for anything
#   else.
cmd_verify() {
  local target=${1:-}
  case "$target" in
    connect_daemon|mcp_doctor|mcp_smoke|mcp_tools|runtime)
      "cmd_verify_${target}"
      ;;
    *)
      echo "Usage: $0 verify [connect_daemon|mcp_doctor|mcp_smoke|mcp_tools|runtime]" >&2
      return 1
      ;;
  esac
}
1114
+
1115
# Entry point: dispatch on the first command-line argument (default: run).
# report/confirm/verify forward their own first argument to the handler.
# reset archives state.json under a UTC-timestamped name and warns that a
# later destroy has no state data to work from.
case "${1:-run}" in
  run) cmd_run ;;
  status) cmd_status ;;
  report) shift; cmd_report "${1:-new_deploy}" ;;
  confirm) shift; cmd_confirm "${1:-}" ;;
  verify) shift; cmd_verify "${1:-}" ;;
  reset)
    if [ -f "$STATE_JSON" ]; then
      mv "$STATE_JSON" "$STATE_JSON.reset-$(date -u +%Y%m%d%H%M%S)"
      warn "Archived old state.json."
    fi
    warn "Warning: after reset, destroy no longer has state data. Any remaining AWS resources must be removed manually."
    ;;
  *)
    # Usage goes to stderr, like the usage text of the subcommand handlers.
    echo "Usage: $0 [run|status|report|confirm|verify|reset]" >&2
    exit 1
    ;;
esac