@meshxdata/fops 0.1.36 → 0.1.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/CHANGELOG.md +207 -0
  2. package/fops.mjs +37 -14
  3. package/package.json +1 -1
  4. package/src/agent/llm.js +2 -0
  5. package/src/auth/azure.js +92 -0
  6. package/src/auth/cloudflare.js +125 -0
  7. package/src/auth/index.js +2 -0
  8. package/src/commands/index.js +8 -4
  9. package/src/commands/lifecycle.js +31 -10
  10. package/src/plugins/bundled/fops-plugin-azure/index.js +44 -2896
  11. package/src/plugins/bundled/fops-plugin-azure/lib/azure-aks.js +130 -2
  12. package/src/plugins/bundled/fops-plugin-azure/lib/azure-auth.js +497 -0
  13. package/src/plugins/bundled/fops-plugin-azure/lib/azure-helpers.js +51 -13
  14. package/src/plugins/bundled/fops-plugin-azure/lib/azure-ops.js +206 -52
  15. package/src/plugins/bundled/fops-plugin-azure/lib/azure-provision.js +128 -34
  16. package/src/plugins/bundled/fops-plugin-azure/lib/azure-shared-cache.js +1 -1
  17. package/src/plugins/bundled/fops-plugin-azure/lib/azure-sync.js +4 -4
  18. package/src/plugins/bundled/fops-plugin-azure/lib/azure.js +2 -2
  19. package/src/plugins/bundled/fops-plugin-azure/lib/commands/fleet-cmds.js +254 -0
  20. package/src/plugins/bundled/fops-plugin-azure/lib/commands/infra-cmds.js +894 -0
  21. package/src/plugins/bundled/fops-plugin-azure/lib/commands/test-cmds.js +314 -0
  22. package/src/plugins/bundled/fops-plugin-azure/lib/commands/vm-cmds.js +893 -0
  23. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/apps/dai-backend.yaml +13 -0
  24. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/apps/dai-frontend.yaml +13 -0
  25. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/apps/foundation-backend.yaml +13 -0
  26. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/apps/foundation-frontend.yaml +13 -0
  27. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/apps/foundation-hive.yaml +13 -0
  28. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/apps/foundation-kafka.yaml +13 -0
  29. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/apps/foundation-meltano.yaml +13 -0
  30. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/apps/foundation-mlflow.yaml +13 -0
  31. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/apps/foundation-opa.yaml +13 -0
  32. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/apps/foundation-processor.yaml +13 -0
  33. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/apps/foundation-scheduler.yaml +13 -0
  34. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/apps/foundation-storage-engine.yaml +13 -0
  35. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/apps/foundation-trino.yaml +13 -0
  36. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/apps/foundation-watcher.yaml +13 -0
  37. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/config/repository.yaml +66 -0
  38. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/kustomization.yaml +30 -0
  39. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/operator/acr-webhook-controller.yaml +63 -0
  40. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/operator/externalsecrets.yaml +15 -0
  41. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/operator/istio.yaml +42 -0
  42. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/operator/kafka.yaml +15 -0
  43. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/operator/kube-reflector.yaml +33 -0
  44. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/operator/kubecost.yaml +12 -0
  45. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/operator/nats-server.yaml +15 -0
  46. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/operator/prometheus-agent.yaml +34 -0
  47. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/operator/reloader.yaml +12 -0
  48. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/operator/spark.yaml +112 -0
  49. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/operator/tailscale.yaml +67 -0
  50. package/src/plugins/bundled/fops-plugin-azure/templates/cluster/operator/vertical-pod-autoscaler.yaml +15 -0
  51. package/src/plugins/bundled/fops-plugin-file/demo/orders_renamed.aligned.csv +1 -1
  52. package/src/plugins/bundled/fops-plugin-file/index.js +81 -12
  53. package/src/plugins/bundled/fops-plugin-file/lib/match.js +133 -15
  54. package/src/plugins/bundled/fops-plugin-file/lib/report.js +3 -0
  55. package/src/plugins/bundled/fops-plugin-foundation/index.js +26 -6
  56. package/src/plugins/bundled/fops-plugin-foundation/lib/client.js +9 -5
  57. package/src/plugins/bundled/fops-plugin-foundation-graphql/lib/graphql/resolvers/data-product.js +32 -0
  58. package/src/plugins/bundled/fops-plugin-foundation-graphql/lib/graphql/schema.js +20 -1
  59. package/src/plugins/loader.js +23 -6
@@ -8,7 +8,7 @@ import {
8
8
  DEFAULTS, DIM, OK, WARN, ERR,
9
9
  banner, hint, kvLine,
10
10
  resolvePublicIp, subArgs, buildTags, fetchMyIp,
11
- sshCmd, waitForSsh, fopsUpCmd, buildPublicUrl,
11
+ sshCmd, waitForSsh, closeMux, fopsUpCmd, buildPublicUrl,
12
12
  runReconcilers, ensureOpenAiNetworkAccess,
13
13
  reconcileOk, RECONCILE_LABEL_WIDTH,
14
14
  } from "./azure-helpers.js";
@@ -80,7 +80,7 @@ export async function configureVm(execa, ip, user, publicUrl, { githubToken, k3s
80
80
 
81
81
  console.log(chalk.dim(" Configuring VM..."));
82
82
 
83
- // Batch: sshd tuning + docker group + ownership + public URL — single SSH round-trip
83
+ // Batch: sshd tuning + docker group + ownership — single SSH round-trip
84
84
  const setupBatch = [
85
85
  // Speed up SSH: accept forwarded env vars, disable DNS reverse lookup
86
86
  `sudo grep -q '^AcceptEnv.*BEARER_TOKEN' /etc/ssh/sshd_config 2>/dev/null || {`,
@@ -91,22 +91,10 @@ export async function configureVm(execa, ip, user, publicUrl, { githubToken, k3s
91
91
  `}`,
92
92
  `sudo usermod -aG docker ${user} 2>/dev/null; true`,
93
93
  "sudo chown -R azureuser:azureuser /opt/foundation-compose 2>/dev/null; true",
94
- `cd /opt/foundation-compose`,
95
- `sed -i -e '$a\\' .env 2>/dev/null || true`,
96
- `if grep -q '^FOUNDATION_PUBLIC_URL=' .env 2>/dev/null; then`,
97
- ` sed -i 's|^FOUNDATION_PUBLIC_URL=.*|FOUNDATION_PUBLIC_URL=${publicUrl}|' .env;`,
98
- `else`,
99
- ` echo 'FOUNDATION_PUBLIC_URL=${publicUrl}' >> .env;`,
100
- `fi`,
101
- // Persist COMPOSE_PROFILES so k3s/watcher/traefik start on manual restarts too
102
- `if grep -q '^COMPOSE_PROFILES=' .env 2>/dev/null; then`,
103
- ` sed -i 's|^COMPOSE_PROFILES=.*|COMPOSE_PROFILES=k3s,traefik${dai ? ",dai" : ""}|' .env;`,
104
- `else`,
105
- ` echo 'COMPOSE_PROFILES=k3s,traefik${dai ? ",dai" : ""}' >> .env;`,
106
- `fi`,
94
+ // Only inject FOUNDATION_PUBLIC_URL if not already set — never overwrite
95
+ `cd /opt/foundation-compose && grep -q '^FOUNDATION_PUBLIC_URL=' .env 2>/dev/null || echo 'FOUNDATION_PUBLIC_URL=${publicUrl}' >> .env`,
107
96
  ].join("\n");
108
97
  await ssh(setupBatch);
109
- console.log(chalk.green(` ✓ FOUNDATION_PUBLIC_URL=${publicUrl}`));
110
98
 
111
99
  let ghcrOk = false;
112
100
  if (githubToken) {
@@ -448,7 +436,36 @@ async function vmReconcileNetworking(ctx) {
448
436
  }
449
437
  }
450
438
  } else {
451
- console.log(chalk.yellow(" ⚠ No public IP attached to NIC"));
439
+ // No public IP attached — create one and attach it
440
+ const pipName = `${vmName}PublicIP`;
441
+ console.log(chalk.dim(` ↻ ${"Public IP".padEnd(RECONCILE_LABEL_WIDTH)} — creating ${pipName}…`));
442
+ const { exitCode: createCode } = await execa("az", [
443
+ "network", "public-ip", "create", "-g", rg, "-n", pipName,
444
+ "--sku", "Standard", "--allocation-method", "Static", "--output", "none",
445
+ ...subArgs(sub),
446
+ ], { reject: false, timeout: 30000 });
447
+ if (createCode === 0) {
448
+ const { exitCode: attachCode } = await execa("az", [
449
+ "network", "nic", "ip-config", "update",
450
+ "-g", rg, "--nic-name", ctx.nicName, "-n", "ipconfig1",
451
+ "--public-ip-address", pipName, "--output", "none",
452
+ ...subArgs(sub),
453
+ ], { reject: false, timeout: 30000 });
454
+ if (attachCode === 0) {
455
+ const { stdout: newPipJson } = await execa("az", [
456
+ "network", "public-ip", "show", "-g", rg, "-n", pipName, "--output", "json",
457
+ ...subArgs(sub),
458
+ ], { reject: false, timeout: 15000 });
459
+ try {
460
+ ctx.ip = JSON.parse(newPipJson).ipAddress || "";
461
+ } catch {}
462
+ reconcileOk("Public IP", ctx.ip ? `${ctx.ip} (created)` : `${pipName} (created)`);
463
+ } else {
464
+ console.log(chalk.yellow(` ⚠ Created ${pipName} but could not attach to NIC`));
465
+ }
466
+ } else {
467
+ console.log(chalk.yellow(` ⚠ Could not create public IP — VM will have no public IP`));
468
+ }
452
469
  }
453
470
 
454
471
  const nsgId = nic.networkSecurityGroup?.id || "";
@@ -560,6 +577,7 @@ async function vmReconcileNsg(ctx) {
560
577
  ...subArgs(sub),
561
578
  ], { reject: false, timeout: 30000 });
562
579
  reconcileOk("SSH (22)", source);
580
+ ctx.operatorIpChanged = true;
563
581
  } else if (sshSource && !sshHasMyIp) {
564
582
  // Add our IP to existing admin IPs
565
583
  const merged = [...new Set([...sshCurrentSources, sshSource])];
@@ -573,6 +591,7 @@ async function vmReconcileNsg(ctx) {
573
591
  ...subArgs(sub),
574
592
  ], { reject: false, timeout: 30000 });
575
593
  reconcileOk("SSH (22)", merged.join(", "));
594
+ ctx.operatorIpChanged = true;
576
595
  } else {
577
596
  reconcileOk("SSH (22)", sshCurrentSources.join(", ") || "*");
578
597
  }
@@ -1026,14 +1045,46 @@ async function removeSshBypassViaRunCommand(execa, rg, vmName, sourceCidr, sub)
1026
1045
  // ── Step: SSH reachability ───────────────────────────────────────────────────
1027
1046
 
1028
1047
  async function vmReconcileSsh(ctx) {
1029
- const { execa, ip, adminUser, knockSequence, port, desiredUrl, vmName, rg, sub } = ctx;
1048
+ const { execa, ip, adminUser, port, desiredUrl, vmName, rg, sub } = ctx;
1030
1049
  console.log(chalk.dim(" Checking SSH..."));
1031
- const maxWaitFirst = 90000;
1050
+
1051
+ // Always fetch the knock sequence fresh from the Azure VM tag — local state can drift
1052
+ // after state recovery or manual changes, causing knocks with the wrong sequence.
1053
+ let knockSequence = ctx.knockSequence;
1054
+ if (rg) {
1055
+ try {
1056
+ const { stdout: tagRaw } = await execa("az", [
1057
+ "vm", "show", "-g", rg, "-n", vmName,
1058
+ "--query", "tags.fopsKnock", "-o", "tsv", ...subArgs(sub),
1059
+ ], { timeout: 15000, reject: false });
1060
+ const raw = (tagRaw || "").trim().replace(/[()]/g, "");
1061
+ if (raw) {
1062
+ const fresh = raw.split(/[-,]/).map((s) => parseInt(s.trim(), 10)).filter((n) => Number.isInteger(n) && n > 0);
1063
+ if (fresh.length >= 2) {
1064
+ if (!knockSequence?.length || fresh.some((v, i) => v !== knockSequence[i])) {
1065
+ console.log(chalk.dim(" Syncing knock sequence from Azure tag..."));
1066
+ knockSequence = fresh;
1067
+ ctx.knockSequence = fresh;
1068
+ writeVmState(vmName, { knockSequence: fresh });
1069
+ }
1070
+ }
1071
+ }
1072
+ } catch { /* best-effort */ }
1073
+ }
1074
+
1075
+ // Close any stale mux master before probing. ControlPersist=600 keeps the SSH master
1076
+ // alive for 10 min, but if the VM rebooted its underlying TCP is broken — every probe
1077
+ // through the stale mux fails even after a successful knock. A fresh connect fixes it.
1078
+ await closeMux(execa, ip, adminUser);
1079
+ // When Run Command is available (rg set), use a short knock probe then fall through to
1080
+ // Run Command rather than burning 2+ minutes on knock retries.
1081
+ const canRunCommand = knockSequence?.length && rg;
1082
+ const maxWaitFirst = canRunCommand ? 20000 : 90000;
1032
1083
  if (knockSequence?.length) {
1033
1084
  await performKnock(ip, knockSequence, { quiet: true });
1034
1085
  }
1035
1086
  let sshReady = await waitForSsh(execa, ip, adminUser, maxWaitFirst);
1036
- if (!sshReady && knockSequence?.length) {
1087
+ if (!sshReady && knockSequence?.length && !canRunCommand) {
1037
1088
  console.log(chalk.dim(" Re-sending knock and retrying SSH..."));
1038
1089
  await performKnock(ip, knockSequence, { quiet: true });
1039
1090
  await new Promise((r) => setTimeout(r, 5000));
@@ -1056,15 +1107,44 @@ async function vmReconcileSsh(ctx) {
1056
1107
  }
1057
1108
  }
1058
1109
  if (!sshReady) {
1059
- console.log(chalk.yellow(" ⚠ SSH not reachable VM may still be booting. Skipping in-guest checks."));
1060
- if (knockSequence?.length) {
1061
- console.log(chalk.dim(" If knock is enabled, the stored sequence may not match the VM (e.g. after state recovery)."));
1062
- console.log(chalk.dim(" Try: fops azure knock open " + ctx.vmName + " then immediately fops azure ssh " + ctx.vmName));
1063
- console.log(chalk.dim(" Or remove knock: fops azure knock disable " + ctx.vmName));
1110
+ // Verify actual VM power state via Azure CLI — the earlier instance-view may be stale
1111
+ if (rg) {
1112
+ try {
1113
+ const { stdout: showJson } = await execa("az", [
1114
+ "vm", "show", "--resource-group", rg, "--name", vmName,
1115
+ "--show-details", "--output", "json", ...subArgs(sub),
1116
+ ], { timeout: 20000, reject: false });
1117
+ const vmDetails = JSON.parse(showJson || "{}");
1118
+ const powerState = vmDetails?.powerState || "";
1119
+ if (powerState && !powerState.includes("running")) {
1120
+ console.log(chalk.yellow(` ⚠ Azure reports VM power state: ${chalk.bold(powerState)} — starting VM...`));
1121
+ await execa("az", [
1122
+ "vm", "start", "--resource-group", rg, "--name", vmName, "--output", "none",
1123
+ ...subArgs(sub),
1124
+ ], { timeout: 300000 });
1125
+ reconcileOk("VM", "started");
1126
+ // Give the VM a moment then retry SSH once more
1127
+ await new Promise((r) => setTimeout(r, 10000));
1128
+ if (knockSequence?.length) await performKnock(ip, knockSequence, { quiet: true });
1129
+ sshReady = await waitForSsh(execa, ip, adminUser, 60000);
1130
+ } else if (powerState) {
1131
+ console.log(chalk.dim(` Azure VM power state: ${powerState}`));
1132
+ }
1133
+ } catch {
1134
+ // best-effort
1135
+ }
1136
+ }
1137
+ if (!sshReady) {
1138
+ console.log(chalk.yellow(" ⚠ SSH not reachable — VM may still be booting. Skipping in-guest checks."));
1139
+ if (knockSequence?.length) {
1140
+ console.log(chalk.dim(" If knock is enabled, the stored sequence may not match the VM (e.g. after state recovery)."));
1141
+ console.log(chalk.dim(" Try: fops azure knock open " + ctx.vmName + " then immediately fops azure ssh " + ctx.vmName));
1142
+ console.log(chalk.dim(" Or remove knock: fops azure knock disable " + ctx.vmName));
1143
+ }
1144
+ ctx.publicUrl = desiredUrl || buildPublicUrl(ip, port);
1145
+ ctx.done = true;
1146
+ return;
1064
1147
  }
1065
- ctx.publicUrl = desiredUrl || buildPublicUrl(ip, port);
1066
- ctx.done = true;
1067
- return;
1068
1148
  }
1069
1149
  reconcileOk("SSH", "reachable");
1070
1150
  }
@@ -1081,8 +1161,25 @@ async function vmReconcileKnock(ctx) {
1081
1161
  );
1082
1162
  const knockdRunning = knockdCheck?.trim() === "active";
1083
1163
 
1084
- if (knockdRunning && ctx.knockSequence?.length) {
1085
- reconcileOk("Port knocking", "active");
1164
+ // Read the authoritative sequence from knockd.conf on the VM
1165
+ const { stdout: knockConfOut } = await ssh(
1166
+ "grep -m1 'sequence' /etc/knockd.conf 2>/dev/null | sed 's/.*sequence[[:space:]]*=[[:space:]]*//'", 8000,
1167
+ );
1168
+ const vmKnockSeq = (knockConfOut || "").trim()
1169
+ .split(",").map((s) => parseInt(s.trim(), 10)).filter((n) => Number.isInteger(n) && n > 0);
1170
+
1171
+ if (knockdRunning && vmKnockSeq.length >= 2) {
1172
+ // Reconcile: keep tag + local state in sync with knockd.conf
1173
+ const local = ctx.knockSequence;
1174
+ const seqDiffers = !local?.length || local.length !== vmKnockSeq.length || local.some((v, i) => v !== vmKnockSeq[i]);
1175
+ if (seqDiffers) {
1176
+ console.log(chalk.dim(` ↻ ${"Port knocking".padEnd(RECONCILE_LABEL_WIDTH)} — sequence drift detected, syncing…`));
1177
+ writeVmState(vmName, { knockSequence: vmKnockSeq });
1178
+ const { setVmKnockTag } = await import("./azure-helpers.js");
1179
+ await setVmKnockTag(execa, ctx.rg, vmName, vmKnockSeq, ctx.sub);
1180
+ ctx.knockSequence = vmKnockSeq;
1181
+ }
1182
+ reconcileOk("Port knocking", `active [${vmKnockSeq.join(", ")}]`);
1086
1183
  } else if (!knockdRunning) {
1087
1184
  console.log(chalk.dim(" Setting up port knocking (after post-start + UI reachable)..."));
1088
1185
  const knockSeq = generateKnockSequence();
@@ -1092,8 +1189,7 @@ async function vmReconcileKnock(ctx) {
1092
1189
  await setVmKnockTag(execa, ctx.rg, vmName, knockSeq, ctx.sub);
1093
1190
  ctx.knockSequence = knockSeq;
1094
1191
  } else {
1095
- console.log(chalk.yellow(" ⚠ knockd active on VM but no local knock sequence stored"));
1096
- console.log(chalk.dim(" To re-create: fops azure knock disable && fops azure up"));
1192
+ console.log(chalk.yellow(" ⚠ knockd active on VM but could not read sequence from knockd.conf"));
1097
1193
  }
1098
1194
 
1099
1195
  if (ctx.knockSequence?.length) {
@@ -1590,10 +1686,8 @@ export async function provisionVm(execa, ip, adminUser, { githubToken, branch =
1590
1686
  }
1591
1687
 
1592
1688
  await runScript("Cloning foundation-compose", [
1593
- "cp /opt/foundation-compose/.env /tmp/.env.fops-backup 2>/dev/null || true",
1594
1689
  "rm -rf /opt/foundation-compose",
1595
1690
  `git clone --branch ${branch} --depth 1 --recurse-submodules https://github.com/meshxdata/foundation-compose.git /opt/foundation-compose`,
1596
- "if [ -f /tmp/.env.fops-backup ]; then cp /tmp/.env.fops-backup /opt/foundation-compose/.env; rm -f /tmp/.env.fops-backup; else cp /opt/foundation-compose/.env.example /opt/foundation-compose/.env; fi",
1597
1691
  "mkdir -p /opt/foundation-compose/credentials",
1598
1692
  "touch /opt/foundation-compose/credentials/kubeconfig.yaml",
1599
1693
  `chown -R ${adminUser}:${adminUser} /opt/foundation-compose`,
@@ -22,7 +22,7 @@ import { readState, listVms } from "./azure-state.js";
22
22
  // fops_by = alessio (who synced)
23
23
 
24
24
  const TAG_PREFIX = "fops_";
25
- const TAG_MAX_AGE_MS = 10 * 60 * 1000; // 10 minutes — tags are cheaper to check
25
+ const TAG_MAX_AGE_MS = 20 * 60 * 1000; // 20 minutes — tags are cheaper to check
26
26
 
27
27
  // ── Write: publish probe results as tags on a VM ─────────────────────────────
28
28
 
@@ -12,7 +12,7 @@ import {
12
12
  // Stored in ~/.fops.json under azure.cache:
13
13
  // { updatedAt, vms: { <name>: { ... } }, clusters: { <name>: { ... } } }
14
14
 
15
- const CACHE_MAX_AGE_MS = 5 * 60 * 1000; // 5 minutes
15
+ const CACHE_MAX_AGE_MS = 15 * 60 * 1000; // 15 minutes
16
16
 
17
17
  // Short keys for the 6 tracked Foundation services
18
18
  const SVC_MAP = {
@@ -169,16 +169,16 @@ async function syncVms(execa) {
169
169
 
170
170
  // After a knock, iptables rule needs a moment to propagate; first SSH needs full handshake.
171
171
  // Brief delay then retry once to avoid false "unreachable" (e.g. uaenorth latency).
172
- await new Promise((r) => setTimeout(r, 800));
172
+ await new Promise((r) => setTimeout(r, 400));
173
173
  let sshOk = false;
174
174
  for (let attempt = 0; attempt < 2; attempt++) {
175
175
  const { exitCode: sshCode } = await execa("ssh", [
176
176
  ...MUX_OPTS(vm.publicIp, DEFAULTS.adminUser),
177
177
  "-o", "BatchMode=yes",
178
178
  `${DEFAULTS.adminUser}@${vm.publicIp}`, "echo ok",
179
- ], { timeout: 15000, reject: false }).catch(() => ({ exitCode: 1 }));
179
+ ], { timeout: 8000, reject: false }).catch(() => ({ exitCode: 1 }));
180
180
  if (sshCode === 0) { sshOk = true; break; }
181
- if (attempt === 0) await new Promise((r) => setTimeout(r, 2000));
181
+ if (attempt === 0) await new Promise((r) => setTimeout(r, 1000));
182
182
  }
183
183
 
184
184
  if (!sshOk) {
@@ -27,7 +27,7 @@ export {
27
27
  managedImageId, resolvePublicIp,
28
28
  resolveGithubToken, verifyGithubToken,
29
29
  sshCmd, closeMux, MUX_OPTS, muxSocketPath, waitForSsh,
30
- knockForVm, fopsUpCmd,
30
+ knockForVm, ensureKnockSequence, fopsUpCmd,
31
31
  runReconcilers, ensureOpenAiNetworkAccess,
32
32
  } from "./azure-helpers.js";
33
33
 
@@ -46,7 +46,7 @@ export {
46
46
 
47
47
  // ── VM operations ────────────────────────────────────────────────────────────
48
48
  export {
49
- azureStatus, azureTrinoStatus, azureSsh, azurePortForward, azureSshAdminAdd, azureVmCheck, azureAgent, azureOpenAiDebugVm,
49
+ azureStatus, azureTrinoStatus, azureSsh, azureSshWhitelistMe, azurePortForward, azureSshAdminAdd, azureVmCheck, azureAgent, azureOpenAiDebugVm,
50
50
  azureDeploy, azurePull, azureDeployVersion, azureRunUp, azureConfig, azureConfigVersions, azureUpdate,
51
51
  azureLogs, azureGrantAdmin, azureContext,
52
52
  azureList, azureApply,
@@ -0,0 +1,254 @@
1
+ /**
2
+ * Fleet management, swarm, audit, snapshot/restore commands.
3
+ * All operations target multiple Azure VMs or the swarm cluster.
4
+ */
5
+ import chalk from "chalk";
6
+
7
+ export function registerFleetCommands(azure) {
8
+ // ── Fleet management ───────────────────────────────────────────────────
9
+
10
+ azure
11
+ .command("exec <command>")
12
+ .description("Run a command on all tracked VMs in parallel")
13
+ .option("--vm-name <name>", "Target a specific VM instead of all")
14
+ .option("--timeout <seconds>", "Per-VM command timeout (default: 120)", "120")
15
+ .option("--quiet", "Show only summary, not command output")
16
+ .action(async (command, opts) => {
17
+ const { fleetExec } = await import("../azure-fleet.js");
18
+ await fleetExec(command, { vmName: opts.vmName, timeout: Number(opts.timeout), quiet: opts.quiet });
19
+ });
20
+
21
+ azure
22
+ .command("diff")
23
+ .description("Compare configuration across all VMs — detect drift")
24
+ .option("--vm-name <name>", "Target a specific VM instead of all")
25
+ .action(async (opts) => {
26
+ const { fleetDiff } = await import("../azure-fleet.js");
27
+ await fleetDiff({ vmName: opts.vmName });
28
+ });
29
+
30
+ azure
31
+ .command("rollout")
32
+ .description("Rolling deploy: pull, restart, health-check across VMs in batches")
33
+ .option("--vm-name <name>", "Target a specific VM instead of all")
34
+ .option("--batch <size>", "Number of VMs to deploy in parallel per batch (default: 1)", "1")
35
+ .option("--branch <branch>", "Git branch to deploy (default: main)")
36
+ .option("--health-timeout <seconds>", "Seconds to wait for healthy after restart (default: 120)", "120")
37
+ .option("--force", "Continue rolling out even if a batch fails")
38
+ .action(async (opts) => {
39
+ const { fleetRollout } = await import("../azure-fleet.js");
40
+ await fleetRollout({
41
+ vmName: opts.vmName, batch: Number(opts.batch), branch: opts.branch,
42
+ healthTimeout: Number(opts.healthTimeout), force: opts.force,
43
+ });
44
+ });
45
+
46
+ azure
47
+ .command("sync")
48
+ .description("Push local config files to all VMs")
49
+ .option("--vm-name <name>", "Target a specific VM instead of all")
50
+ .option("--files <files>", "Comma-separated list of files to sync (default: docker-compose.yaml,.env)", "docker-compose.yaml,.env")
51
+ .option("--restart", "Run fops up after syncing files")
52
+ .action(async (opts) => {
53
+ const { fleetSync } = await import("../azure-fleet.js");
54
+ await fleetSync({ vmName: opts.vmName, files: opts.files.split(","), restart: opts.restart });
55
+ });
56
+
57
+ azure
58
+ .command("health")
59
+ .description("Deep health report across all VMs — containers, disk, memory, load")
60
+ .option("--vm-name <name>", "Target a specific VM instead of all")
61
+ .action(async (opts) => {
62
+ const { fleetHealth } = await import("../azure-fleet.js");
63
+ await fleetHealth({ vmName: opts.vmName });
64
+ });
65
+
66
+ azure
67
+ .command("snapshot [name]")
68
+ .description("Create Azure disk snapshots of all (or one) tracked VMs")
69
+ .option("--vm-name <name>", "Target a specific VM instead of all")
70
+ .option("--tag <tag>", "Snapshot tag/label (default: ISO timestamp)")
71
+ .option("--profile <subscription>", "Azure subscription name or ID")
72
+ .action(async (name, opts) => {
73
+ const { fleetSnapshot } = await import("../azure-fleet.js");
74
+ await fleetSnapshot({ vmName: opts.vmName || name, tag: opts.tag, profile: opts.profile });
75
+ });
76
+
77
+ azure
78
+ .command("restore <name>")
79
+ .description("Restore a VM from an Azure disk snapshot")
80
+ .option("--snapshot <name>", "Snapshot name to restore from (omit to list available)")
81
+ .option("--profile <subscription>", "Azure subscription name or ID")
82
+ .option("--yes", "Skip confirmation prompt")
83
+ .action(async (name, opts) => {
84
+ const { fleetRestore } = await import("../azure-fleet.js");
85
+ await fleetRestore({ vmName: name, snapshot: opts.snapshot, profile: opts.profile, yes: opts.yes });
86
+ });
87
+
88
+ // ── Audit subcommands ──────────────────────────────────────────────────
89
+
90
+ const audit = azure
91
+ .command("audit")
92
+ .description("Security & compliance audit across Azure resources");
93
+
94
+ audit
95
+ .command("all", { isDefault: true })
96
+ .description("Run all audit checks (VMs, AKS, storage encryption)")
97
+ .option("--profile <subscription>", "Azure subscription name or ID")
98
+ .option("--vm-name <name>", "Limit VM audit to a specific VM")
99
+ .option("--cluster <name>", "Limit AKS audit to a specific cluster")
100
+ .option("--account <name>", "Limit storage audit to a specific account")
101
+ .option("--verbose", "Show info-level suggestions alongside warnings")
102
+ .action(async (opts) => {
103
+ const { auditAll } = await import("../azure-audit.js");
104
+ await auditAll({ profile: opts.profile, vmName: opts.vmName, clusterName: opts.cluster, account: opts.account, verbose: opts.verbose });
105
+ });
106
+
107
+ audit
108
+ .command("vm [vmName]")
109
+ .description("Audit VM security — disk encryption, NSG rules, managed identity, patching")
110
+ .option("--profile <subscription>", "Azure subscription name or ID")
111
+ .option("--vm-name <name>", "Audit a specific VM (default: all tracked)")
112
+ .option("--verbose", "Show info-level suggestions alongside warnings")
113
+ .action(async (vmName, opts) => {
114
+ const { auditVms } = await import("../azure-audit.js");
115
+ await auditVms({ profile: opts.profile, vmName: opts.vmName || vmName, verbose: opts.verbose });
116
+ });
117
+
118
+ audit
119
+ .command("aks [clusterName]")
120
+ .description("Audit AKS cluster security — RBAC, network policy, auto-upgrade, Defender")
121
+ .option("--profile <subscription>", "Azure subscription name or ID")
122
+ .option("--verbose", "Show info-level suggestions alongside warnings")
123
+ .action(async (clusterName, opts) => {
124
+ const { auditAks } = await import("../azure-audit.js");
125
+ await auditAks({ profile: opts.profile, clusterName, verbose: opts.verbose });
126
+ });
127
+
128
+ audit
129
+ .command("storage")
130
+ .description("Audit storage account encryption & security posture")
131
+ .option("--profile <subscription>", "Azure subscription name or ID")
132
+ .option("--account <name>", "Audit a specific storage account (default: all)")
133
+ .option("--verbose", "Show info-level suggestions alongside warnings")
134
+ .action(async (opts) => {
135
+ const { auditStorage } = await import("../azure-audit.js");
136
+ await auditStorage({ profile: opts.profile, account: opts.account, verbose: opts.verbose });
137
+ });
138
+
139
+ audit
140
+ .command("sessions [vmName]")
141
+ .description("View SSH session recordings across the fleet")
142
+ .option("--session <id>", "Read a specific session by ID")
143
+ .option("--live", "Show only active (live) sessions")
144
+ .option("--cloud", "Read from blob storage instead of SSH")
145
+ .option("--push", "Push collected sessions to blob storage")
146
+ .option("--last <n>", "Show only the last N sessions", "50")
147
+ .option("--tail <n>", "When reading a session, show only the last N lines")
148
+ .action(async (vmName, opts) => {
149
+ const { fleetAudit } = await import("../azure-fleet.js");
150
+ await fleetAudit({
151
+ vmName,
152
+ session: opts.session,
153
+ live: opts.live,
154
+ cloud: opts.cloud,
155
+ push: opts.push,
156
+ last: Number(opts.last),
157
+ tail: opts.tail ? Number(opts.tail) : undefined,
158
+ });
159
+ });
160
+
161
+ audit
162
+ .command("zap [vmName]")
163
+ .description("Run an authenticated OWASP ZAP DAST scan against a VM's frontend")
164
+ .option("--vm-name <name>", "Target VM (default: active)")
165
+ .option("--target <url>", "Override target URL (default: https://<vm>.meshx.app)")
166
+ .option("--output <dir>", "Directory for JSON report (default: cwd)")
167
+ .option("--spider-minutes <n>", "Spider duration in minutes (default: 3)", "3")
168
+ .option("--ajax-minutes <n>", "Ajax spider duration in minutes (default: 3)", "3")
169
+ .option("--active-scan-minutes <n>", "Active scan rule timeout in minutes (default: 5)", "5")
170
+ .option("--max-minutes <n>", "Overall scan timeout in minutes (default: 20)", "20")
171
+ .option("--verbose", "Show fix suggestions for each finding")
172
+ .option("--aggressive", "Pentest mode: Penetration Tester policy, longer crawl/scan")
173
+ .action(async (vmName, opts) => {
174
+ const { auditZap } = await import("../azure-audit.js");
175
+ await auditZap({
176
+ vmName: opts.vmName || vmName,
177
+ target: opts.target,
178
+ output: opts.output,
179
+ spiderMinutes: opts.aggressive ? undefined : (opts.spiderMinutes ? Number(opts.spiderMinutes) : undefined),
180
+ ajaxMinutes: opts.aggressive ? undefined : (opts.ajaxMinutes ? Number(opts.ajaxMinutes) : undefined),
181
+ activeScanMinutes: opts.aggressive ? undefined : (opts.activeScanMinutes ? Number(opts.activeScanMinutes) : undefined),
182
+ maxMinutes: opts.aggressive ? undefined : (opts.maxMinutes ? Number(opts.maxMinutes) : undefined),
183
+ verbose: opts.verbose,
184
+ aggressive: opts.aggressive,
185
+ });
186
+ });
187
+
188
+ // ── Swarm subcommands ──────────────────────────────────────────────────
189
+
190
+ const swarm = azure
191
+ .command("swarm")
192
+ .description("Docker Swarm cluster management");
193
+
194
+ swarm
195
+ .command("init <vmName>")
196
+ .description("Initialize Docker Swarm on a VM (single-node manager)")
197
+ .option("--stack", "Also deploy the compose stack as swarm services")
198
+ .action(async (vmName, opts) => {
199
+ const { swarmInit } = await import("../azure-fleet.js");
200
+ await swarmInit({ vmName, stack: opts.stack });
201
+ });
202
+
203
+ swarm
204
+ .command("join <vmName>")
205
+ .description("Join a VM as a worker to an existing swarm (auto-creates the VM if it doesn't exist)")
206
+ .requiredOption("--manager <name>", "Manager VM to join")
207
+ .option("--as-manager", "Join as a manager instead of worker")
208
+ .option("--vm-size <size>", "VM size when auto-creating (default: inherited from manager)")
209
+ .option("--location <region>", "Azure region when auto-creating (default: inherited from manager)")
210
+ .option("--image <urn>", "Custom image URN when auto-creating")
211
+ .option("--url <url>", "Public URL override when auto-creating")
212
+ .option("--profile <subscription>", "Azure subscription when auto-creating")
213
+ .action(async (vmName, opts) => {
214
+ const { swarmJoin } = await import("../azure-fleet.js");
215
+ await swarmJoin({
216
+ vmName, manager: opts.manager, asManager: opts.asManager,
217
+ vmSize: opts.vmSize, location: opts.location,
218
+ image: opts.image, url: opts.url, profile: opts.profile,
219
+ });
220
+ });
221
+
222
+ swarm
223
+ .command("status [vmName]")
224
+ .description("Show swarm node and service status")
225
+ .action(async (vmName) => {
226
+ const { swarmStatus } = await import("../azure-fleet.js");
227
+ await swarmStatus({ vmName });
228
+ });
229
+
230
+ swarm
231
+ .command("promote <vmName>")
232
+ .description("Promote a swarm worker to manager")
233
+ .action(async (vmName) => {
234
+ const { swarmPromote } = await import("../azure-fleet.js");
235
+ await swarmPromote({ vmName });
236
+ });
237
+
238
+ swarm
239
+ .command("deploy [vmName]")
240
+ .description("Deploy the compose stack as swarm services (or update existing)")
241
+ .action(async (vmName) => {
242
+ const { swarmDeploy } = await import("../azure-fleet.js");
243
+ await swarmDeploy({ vmName });
244
+ });
245
+
246
+ swarm
247
+ .command("leave <vmName>")
248
+ .description("Remove a VM from the swarm")
249
+ .option("--force", "Force leave (required for managers)")
250
+ .action(async (vmName, opts) => {
251
+ const { swarmLeave } = await import("../azure-fleet.js");
252
+ await swarmLeave({ vmName, force: opts.force });
253
+ });
254
+ }