@meshxdata/fops 0.1.52 → 0.1.54

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. package/CHANGELOG.md +559 -0
  2. package/package.json +2 -6
  3. package/src/agent/agent.js +6 -0
  4. package/src/commands/setup.js +34 -0
  5. package/src/fleet-registry.js +38 -2
  6. package/src/plugins/__test-fixtures__/fake-plugin.js +2 -0
  7. package/src/plugins/__test-fixtures__/no-register-plugin.js +2 -0
  8. package/src/plugins/__test-fixtures__/with-register/index.js +2 -0
  9. package/src/plugins/__test-fixtures__/without-register/index.js +2 -0
  10. package/src/plugins/api.js +4 -0
  11. package/src/plugins/builtins/docker-compose.js +65 -0
  12. package/src/plugins/bundled/fops-plugin-azure/index.js +4 -0
  13. package/src/plugins/bundled/fops-plugin-azure/lib/azure-aks-core.js +44 -53
  14. package/src/plugins/bundled/fops-plugin-azure/lib/azure-aks-storage.js +2 -2
  15. package/src/plugins/bundled/fops-plugin-azure/lib/azure-cost.js +52 -22
  16. package/src/plugins/bundled/fops-plugin-azure/lib/azure-helpers.js +6 -2
  17. package/src/plugins/bundled/fops-plugin-azure/lib/azure-ops.js +113 -7
  18. package/src/plugins/bundled/fops-plugin-azure/lib/azure-provision-init.js +13 -4
  19. package/src/plugins/bundled/fops-plugin-azure/lib/azure-provision.js +91 -14
  20. package/src/plugins/bundled/fops-plugin-azure/lib/azure-service.js +507 -0
  21. package/src/plugins/bundled/fops-plugin-azure/lib/azure-sync.js +146 -7
  22. package/src/plugins/bundled/fops-plugin-azure/lib/azure.js +1 -1
  23. package/src/plugins/bundled/fops-plugin-azure/lib/commands/vm-cmds.js +61 -0
  24. package/src/plugins/bundled/fops-plugin-cloud/api.js +712 -0
  25. package/src/plugins/bundled/fops-plugin-cloud/fops.plugin.json +6 -0
  26. package/src/plugins/bundled/fops-plugin-cloud/index.js +208 -0
  27. package/src/plugins/bundled/fops-plugin-cloud/lib/azure-provider.js +81 -0
  28. package/src/plugins/bundled/fops-plugin-cloud/lib/provider.js +50 -0
  29. package/src/plugins/bundled/fops-plugin-cloud/ui/dist/assets/favicon-C49brna2.svg +15 -0
  30. package/src/plugins/bundled/fops-plugin-cloud/ui/dist/assets/index-CVqQ_kKW.js +65 -0
  31. package/src/plugins/bundled/fops-plugin-cloud/ui/dist/assets/index-DZetahP3.css +1 -0
  32. package/src/plugins/bundled/fops-plugin-cloud/ui/dist/index.html +28 -0
  33. package/src/plugins/bundled/fops-plugin-cloud/ui/index.html +27 -0
  34. package/src/plugins/bundled/fops-plugin-cloud/ui/package-lock.json +2634 -0
  35. package/src/plugins/bundled/fops-plugin-cloud/ui/package.json +29 -0
  36. package/src/plugins/bundled/fops-plugin-cloud/ui/postcss.config.cjs +5 -0
  37. package/src/plugins/bundled/fops-plugin-cloud/ui/src/App.jsx +32 -0
  38. package/src/plugins/bundled/fops-plugin-cloud/ui/src/api/client.js +114 -0
  39. package/src/plugins/bundled/fops-plugin-cloud/ui/src/api/queries.js +111 -0
  40. package/src/plugins/bundled/fops-plugin-cloud/ui/src/components/LogPanel.jsx +162 -0
  41. package/src/plugins/bundled/fops-plugin-cloud/ui/src/components/ThemeToggle.jsx +46 -0
  42. package/src/plugins/bundled/fops-plugin-cloud/ui/src/css/additional-styles/utility-patterns.css +147 -0
  43. package/src/plugins/bundled/fops-plugin-cloud/ui/src/css/style.css +138 -0
  44. package/src/plugins/bundled/fops-plugin-cloud/ui/src/favicon.svg +15 -0
  45. package/src/plugins/bundled/fops-plugin-cloud/ui/src/lib/utils.ts +19 -0
  46. package/src/plugins/bundled/fops-plugin-cloud/ui/src/main.jsx +25 -0
  47. package/src/plugins/bundled/fops-plugin-cloud/ui/src/pages/Audit.jsx +164 -0
  48. package/src/plugins/bundled/fops-plugin-cloud/ui/src/pages/Costs.jsx +305 -0
  49. package/src/plugins/bundled/fops-plugin-cloud/ui/src/pages/CreateResource.jsx +285 -0
  50. package/src/plugins/bundled/fops-plugin-cloud/ui/src/pages/Fleet.jsx +307 -0
  51. package/src/plugins/bundled/fops-plugin-cloud/ui/src/pages/Resources.jsx +229 -0
  52. package/src/plugins/bundled/fops-plugin-cloud/ui/src/partials/Header.jsx +132 -0
  53. package/src/plugins/bundled/fops-plugin-cloud/ui/src/partials/Sidebar.jsx +174 -0
  54. package/src/plugins/bundled/fops-plugin-cloud/ui/src/partials/SidebarLinkGroup.jsx +21 -0
  55. package/src/plugins/bundled/fops-plugin-cloud/ui/src/utils/AuthContext.jsx +170 -0
  56. package/src/plugins/bundled/fops-plugin-cloud/ui/src/utils/Info.jsx +49 -0
  57. package/src/plugins/bundled/fops-plugin-cloud/ui/src/utils/ThemeContext.jsx +37 -0
  58. package/src/plugins/bundled/fops-plugin-cloud/ui/src/utils/Transition.jsx +116 -0
  59. package/src/plugins/bundled/fops-plugin-cloud/ui/src/utils/Utils.js +63 -0
  60. package/src/plugins/bundled/fops-plugin-cloud/ui/vite.config.js +23 -0
  61. package/src/plugins/bundled/fops-plugin-foundation/test-helpers.js +65 -0
  62. package/src/plugins/loader.js +34 -1
  63. package/src/plugins/registry.js +15 -0
  64. package/src/plugins/schemas.js +17 -0
  65. package/src/project.js +1 -1
  66. package/src/serve.js +196 -2
  67. package/src/shell.js +21 -1
  68. package/src/web/admin.html.js +236 -0
  69. package/src/web/api.js +73 -0
  70. package/src/web/dist/assets/index-BphVaAUd.css +1 -0
  71. package/src/web/dist/assets/index-CSckLzuG.js +129 -0
  72. package/src/web/dist/index.html +2 -2
  73. package/src/web/frontend/index.html +16 -0
  74. package/src/web/frontend/src/App.jsx +445 -0
  75. package/src/web/frontend/src/components/ChatView.jsx +910 -0
  76. package/src/web/frontend/src/components/InputBox.jsx +523 -0
  77. package/src/web/frontend/src/components/Sidebar.jsx +410 -0
  78. package/src/web/frontend/src/components/StatusBar.jsx +37 -0
  79. package/src/web/frontend/src/components/TabBar.jsx +87 -0
  80. package/src/web/frontend/src/hooks/useWebSocket.js +412 -0
  81. package/src/web/frontend/src/index.css +78 -0
  82. package/src/web/frontend/src/main.jsx +6 -0
  83. package/src/web/frontend/vite.config.js +21 -0
  84. package/src/web/server.js +64 -1
  85. package/src/web/dist/assets/index-NXC8Hvnp.css +0 -1
  86. package/src/web/dist/assets/index-QH1N4ejK.js +0 -112
@@ -102,13 +102,31 @@ function parseSnapshot(raw, vmMeta) {
102
102
  if (eq > 0) flags[line.slice(0, eq)] = line.slice(eq + 1);
103
103
  }
104
104
 
105
- // Services
105
+ // Services — health + version from container images
106
+ const SERVICE_IMAGE_MAP = {
107
+ be: "foundation-backend",
108
+ fe: "foundation-frontend",
109
+ pr: "foundation-processor",
110
+ wa: "foundation-watcher",
111
+ sc: "foundation-scheduler",
112
+ se: "foundation-storage-engine",
113
+ };
114
+
106
115
  const services = {
107
116
  backend: (raw.backendHealth || "").trim() === "OK" ? "healthy" : "down",
108
117
  frontend: (raw.frontendHealth || "").trim() === "OK" ? "healthy" : "down",
109
118
  storage: (raw.storageHealth || "").trim() === "OK" ? "healthy" : "down",
110
119
  };
111
120
 
121
+ // Extract version tags from container images
122
+ for (const [key, imageName] of Object.entries(SERVICE_IMAGE_MAP)) {
123
+ const c = containers.find((c) => c.image?.includes(imageName));
124
+ if (c?.image) {
125
+ const tag = c.image.split(":").pop() || "";
126
+ services[key] = { tag, health: c.healthy ? "healthy" : c.unhealthy ? "unhealthy" : "down" };
127
+ }
128
+ }
129
+
112
130
  // Foundation entities (meshes, data systems, data sources, data products)
113
131
  let foundation = null;
114
132
  try {
@@ -306,7 +324,25 @@ export class FleetRegistry {
306
324
  fopsVersion: s.stack.fopsVersion,
307
325
  branch: s.stack.gitBranch,
308
326
  commit: s.stack.gitSha,
309
- services: s.services,
327
+ services: (() => {
328
+ const svc = { ...s.services };
329
+ // HTTP scrape puts versions in a nested object; merge them up
330
+ if (svc.versions) {
331
+ Object.assign(svc, svc.versions);
332
+ delete svc.versions;
333
+ }
334
+ // If still no version keys, extract from container images
335
+ if (!svc.be && s.containers?.list) {
336
+ const IMAGE_MAP = { be: "foundation-backend", fe: "foundation-frontend", pr: "foundation-processor", wa: "foundation-watcher", sc: "foundation-scheduler", se: "foundation-storage-engine" };
337
+ for (const [key, img] of Object.entries(IMAGE_MAP)) {
338
+ const c = s.containers.list.find((c) => c.image?.includes(img));
339
+ if (c?.image) {
340
+ svc[key] = { tag: c.image.split(":").pop() || "", health: c.healthy ? "healthy" : c.unhealthy ? "unhealthy" : "down" };
341
+ }
342
+ }
343
+ }
344
+ return svc;
345
+ })(),
310
346
  lastScrape: entry.lastScrape,
311
347
  });
312
348
  }
@@ -0,0 +1,2 @@
1
+ // Test fixture: minimal plugin with register function
2
+ export function register(api) {}
@@ -0,0 +1,2 @@
1
+ // Test fixture: plugin without register function
2
+ export const name = "no-register";
@@ -0,0 +1,2 @@
1
+ // Test fixture: discovered plugin with register function
2
+ export function register(api) {}
@@ -0,0 +1,2 @@
1
+ // Test fixture: discovered plugin without register function
2
+ export const name = "no-register";
@@ -98,6 +98,10 @@ export function createPluginApi(pluginId, registry, opts = {}) {
98
98
  registry.services.push({ pluginId, name, instance });
99
99
  },
100
100
 
101
+ registerWebPanel(panel) {
102
+ registry.addWebPanel({ pluginId, ...panel });
103
+ },
104
+
101
105
  registerIndexSource(source) {
102
106
  if (!registry.indexSources) registry.indexSources = [];
103
107
  registry.indexSources.push({ pluginId, name: source.name, fn: source.fn });
@@ -465,6 +465,71 @@ You manage Docker Compose stacks: inspect containers, read logs, restart service
465
465
  - If asked about security or CVEs, use compose_scan.`,
466
466
  });
467
467
 
468
+ // ── Debug Agent ───────────────────────────────────────────────────
469
+ api.registerAgent({
470
+ name: "debug",
471
+ description: "Debug stack issues — diagnose alerts, check containers, logs, metrics, and suggest fixes",
472
+ contextMode: "minimal",
473
+ tools: [
474
+ "compose_ps",
475
+ "compose_logs",
476
+ "compose_restart",
477
+ "compose_exec",
478
+ "compose_inspect",
479
+ "compose_stats",
480
+ "compose_images",
481
+ "embeddings_search",
482
+ ],
483
+ maxIterations: 20,
484
+ systemPrompt: `You are FOPS Debug Agent — an expert platform debugger for Foundation stack issues.
485
+
486
+ ## Role
487
+ You investigate alerts, diagnose service failures, and suggest fixes. You have direct access to Docker containers, logs, and system metrics. You are called by the Glue bot when monitoring alerts fire.
488
+
489
+ ## Tools Available
490
+ - **compose_ps**: List all containers and their status (start here)
491
+ - **compose_logs**: Read container logs (check for errors, crashes, OOM)
492
+ - **compose_inspect**: Get container details (health checks, env vars, mounts, restarts)
493
+ - **compose_stats**: CPU/memory/network usage per container
494
+ - **compose_exec**: Run commands inside containers (e.g. check disk, network, processes)
495
+ - **compose_images**: List images and versions
496
+ - **compose_restart**: Restart specific services
497
+ - **embeddings_search**: Search docs, configs, and past knowledge for context
498
+
499
+ ## Investigation Approach
500
+ 1. **Triage**: Run compose_ps to see overall stack health. Identify unhealthy/restarting containers.
501
+ 2. **Diagnose**: For each affected container:
502
+ - compose_logs to find errors, exceptions, OOM kills, crash traces
503
+ - compose_inspect for health check failures, restart count, resource limits
504
+ - compose_stats for CPU/memory spikes
505
+ 3. **Context**: Use embeddings_search to find relevant docs or known issues.
506
+ 4. **Root cause**: Correlate findings — is it a code bug, resource exhaustion, dependency failure, config issue?
507
+ 5. **Fix**: Suggest specific actions (restart, config change, scale, rollback).
508
+
509
+ ## Output Format
510
+ Structure your response with blank lines between each section:
511
+
512
+ **Status:** One-line summary (e.g. "Processor container restarting due to OOM")
513
+
514
+ **Findings:** What you discovered from each tool
515
+
516
+ **Root Cause:** Most likely cause
517
+
518
+ **Actions:** Specific steps to fix
519
+
520
+ **Prevention:** How to avoid this in the future
521
+
522
+ ## Rules
523
+ - Always check compose_ps first.
524
+ - Check logs BEFORE suggesting restarts.
525
+ - Look for patterns: repeated restarts, OOM kills, connection refused, timeout errors.
526
+ - If a dependency is down (postgres, kafka), flag it — fixing the dependency fixes the dependent.
527
+ - Be concise — this output goes into a Glue chat thread.
528
+ - Never suggest 'docker compose down' — prefer targeted restarts.
529
+ - After restarting, verify with compose_ps.
530
+ - IMPORTANT: Always put a blank line between sections in your response so they render as separate paragraphs.`,
531
+ });
532
+
468
533
  // ── Doctor check: Trivy ───────────────────────────────────────────
469
534
  api.registerDoctorCheck({
470
535
  name: "Trivy",
@@ -23,6 +23,10 @@ import { registerRegistryCommands } from "./lib/commands/registry-cmds.js";
23
23
  export { resolveFoundationCreds, resolveAuth0Config, authenticateVm, vmFetch };
24
24
 
25
25
  export async function register(api) {
26
+ // ── Service: expose structured API for cross-plugin use ──
27
+ const { AzureService } = await import("./lib/azure-service.js");
28
+ api.registerService("azure", new AzureService());
29
+
26
30
  // ── Commands ──────────────────────────────────────────────────────────
27
31
 
28
32
  api.registerCommand((program, registry) => {
@@ -1057,64 +1057,55 @@ export async function aksList(opts = {}) {
1057
1057
 
1058
1058
  banner("AKS Clusters");
1059
1059
 
1060
- // If no clusters tracked locally, try to discover fops-managed clusters from Azure
1061
- if (names.length === 0) {
1060
+ // Always discover fops-managed clusters from Azure so we pick up clusters
1061
+ // created by teammates or missing from local state.
1062
+ try {
1062
1063
  const execa = await lazyExeca();
1063
- try {
1064
- await ensureAzCli(execa);
1065
- await ensureAzAuth(execa, { subscription: opts.profile });
1066
- } catch {
1067
- hint("No clusters tracked.");
1068
- hint("Create one: fops azure aks up <name>\n");
1069
- return;
1070
- }
1071
-
1072
- hint("No clusters tracked locally — checking Azure for fops-managed clusters…\n");
1064
+ await ensureAzCli(execa);
1065
+ await ensureAzAuth(execa, { subscription: opts.profile });
1073
1066
 
1074
- try {
1075
- // Query all AKS clusters and filter by managed=fops tag
1076
- const { stdout, exitCode } = await execa("az", [
1077
- "aks", "list",
1078
- "--query", "[?tags.managed=='fops']",
1079
- "--output", "json",
1080
- ...subArgs(opts.profile),
1081
- ], { timeout: 60000, reject: false });
1082
-
1083
- if (exitCode === 0 && stdout?.trim()) {
1084
- const discovered = JSON.parse(stdout);
1085
- if (discovered.length > 0) {
1086
- for (const cl of discovered) {
1087
- const name = cl.name;
1088
- const info = {
1089
- resourceGroup: cl.resourceGroup,
1090
- location: cl.location,
1091
- kubernetesVersion: cl.kubernetesVersion,
1092
- fqdn: cl.fqdn,
1093
- nodeCount: cl.agentPoolProfiles?.reduce((s, p) => s + (p.count || 0), 0) || 0,
1094
- nodeVmSize: cl.agentPoolProfiles?.[0]?.vmSize || "unknown",
1095
- subscriptionId: cl.id?.split("/")[2],
1096
- createdAt: cl.provisioningState === "Succeeded" ? new Date().toISOString() : null,
1097
- };
1098
- writeClusterState(name, info);
1099
- console.log(OK(` + Discovered ${name} (${cl.location})`));
1100
- }
1101
- console.log("");
1102
- // Re-read after discovery
1103
- const updated = readAksClusters();
1104
- activeCluster = updated.activeCluster;
1105
- clusters = updated.clusters;
1106
- names = Object.keys(clusters);
1107
- }
1067
+ const { stdout, exitCode } = await execa("az", [
1068
+ "aks", "list",
1069
+ "--query", "[?tags.managed=='fops']",
1070
+ "--output", "json",
1071
+ ...subArgs(opts.profile),
1072
+ ], { timeout: 60000, reject: false });
1073
+
1074
+ if (exitCode === 0 && stdout?.trim()) {
1075
+ const discovered = JSON.parse(stdout);
1076
+ let added = 0;
1077
+ for (const cl of discovered) {
1078
+ if (clusters[cl.name]) continue; // already tracked
1079
+ const info = {
1080
+ resourceGroup: cl.resourceGroup,
1081
+ location: cl.location,
1082
+ kubernetesVersion: cl.kubernetesVersion,
1083
+ fqdn: cl.fqdn,
1084
+ nodeCount: cl.agentPoolProfiles?.reduce((s, p) => s + (p.count || 0), 0) || 0,
1085
+ nodeVmSize: cl.agentPoolProfiles?.[0]?.vmSize || "unknown",
1086
+ subscriptionId: cl.id?.split("/")[2],
1087
+ createdAt: cl.provisioningState === "Succeeded" ? new Date().toISOString() : null,
1088
+ };
1089
+ writeClusterState(cl.name, info);
1090
+ console.log(OK(` + Discovered ${cl.name} (${cl.location})`));
1091
+ added++;
1092
+ }
1093
+ if (added > 0) {
1094
+ console.log("");
1095
+ const updated = readAksClusters();
1096
+ activeCluster = updated.activeCluster;
1097
+ clusters = updated.clusters;
1098
+ names = Object.keys(clusters);
1108
1099
  }
1109
- } catch {
1110
- // Discovery failed, continue with empty list
1111
1100
  }
1101
+ } catch {
1102
+ // az not available or not authenticated — continue with local state
1103
+ }
1112
1104
 
1113
- if (names.length === 0) {
1114
- hint("No fops-managed clusters found in Azure.");
1115
- hint("Create one: fops azure aks up <name>\n");
1116
- return;
1117
- }
1105
+ if (names.length === 0) {
1106
+ hint("No clusters tracked.");
1107
+ hint("Create one: fops azure aks up <name>\n");
1108
+ return;
1118
1109
  }
1119
1110
 
1120
1111
  // Refresh each tracked cluster from Azure so RG, Location, Nodes, FQDN, etc. are current
@@ -38,7 +38,7 @@ export async function reconcileStorageAccount(ctx) {
38
38
  const { execa, clusterName, rg, sub } = ctx;
39
39
  const storageAccountName = `fops${clusterName.replace(/-/g, "")}`.toLowerCase().slice(0, 24);
40
40
  const vaultName = `fops-${clusterName}-kv`;
41
- const containers = ["foundation", "vault"];
41
+ const containers = ["foundation", "vault", "loki"];
42
42
 
43
43
  hint(`Reconciling Azure Storage Account "${storageAccountName}"…`);
44
44
 
@@ -571,7 +571,7 @@ export async function reconcileStorageReplication(ctx) {
571
571
 
572
572
  const sourceAccountName = `fops${clusterName.replace(/-/g, "")}`.toLowerCase().slice(0, 24);
573
573
  const destAccountName = `fops${clusterName.replace(/-/g, "")}ha`.toLowerCase().slice(0, 24);
574
- const containers = ["foundation", "vault"];
574
+ const containers = ["foundation", "vault", "loki"];
575
575
 
576
576
  hint(`Setting up cross-region storage replication (${location} → ${replicaRegion})…`);
577
577
 
@@ -24,22 +24,50 @@ async function az(args, opts = {}) {
24
24
  }
25
25
  }
26
26
 
27
// In-memory cache for successful cost queries (TTL: 1 hour).
const _costCache = new Map();
const COST_CACHE_TTL = 60 * 60 * 1000; // 1 hour

/**
 * Run an Azure Cost Management query through `az rest`.
 *
 * Successful responses are cached in memory for COST_CACHE_TTL, keyed by
 * scope + dataset. Failed calls and API error bodies are never cached, so a
 * transient failure cannot mask real data for the next hour.
 *
 * @param {string} scope - ARM scope path appended to https://management.azure.com.
 * @param {object} dataset - Request body for the CostManagement query endpoint.
 * @returns {Promise<object>} The parsed API response, or `{ error: string }`
 *   on auth failure, rate limiting, a failed `az` call, or a thrown error.
 */
async function costQuery(scope, dataset) {
  const cacheKey = JSON.stringify({ scope, dataset });
  const cached = _costCache.get(cacheKey);
  if (cached && Date.now() - cached.ts < COST_CACHE_TTL) {
    return cached.data;
  }

  const maxRetries = 3;
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      const body = JSON.stringify({ ...dataset });
      const { stdout, stderr, exitCode } = await execa("az", [
        "rest", "--method", "POST",
        "--url", `https://management.azure.com${scope}/providers/Microsoft.CostManagement/query?api-version=2023-11-01`,
        "--body", body,
        "--output", "json",
      ], { timeout: 120_000, reject: false });

      if (stderr?.includes("Please run 'az login'") || stderr?.includes("AADSTS")) {
        return { error: stderr.split("\n")[0] + "\nMake sure you are logged into Azure (az login) and have Cost Management access." };
      }

      // Handle 429 rate limiting with exponential backoff.
      if (stderr?.includes("429") || stderr?.includes("Too many requests") || stderr?.includes("Too Many Requests")) {
        if (attempt < maxRetries - 1) {
          // 10s after the first attempt, 20s after the second; the last
          // attempt reports the error instead of waiting again.
          const wait = Math.pow(2, attempt + 1) * 5000;
          await new Promise((r) => setTimeout(r, wait));
          continue;
        }
        return { error: `Rate limited by Azure Cost Management API after ${maxRetries} retries. Try again in a few minutes.` };
      }

      const hasOutput = Boolean(stdout?.trim());
      const failed = (exitCode ?? 0) !== 0;

      // `reject: false` means a failed az call does not throw; surface it
      // through the error channel instead of returning an empty result.
      if (failed && !hasOutput) {
        const reason = stderr?.trim().split("\n")[0] || `az rest exited with code ${exitCode}`;
        return { error: reason };
      }

      const result = JSON.parse(stdout || "{}");

      // Only cache genuine successes: exit code 0, non-empty output, and no
      // top-level `error` member in the response.
      if (!failed && hasOutput && result && !result.error) {
        _costCache.set(cacheKey, { data: result, ts: Date.now() });
      }
      return result;
    } catch (err) {
      if (attempt === maxRetries - 1) return { error: err.message };
    }
  }
  return { error: "Cost query failed after retries" };
}
44
72
 
45
73
  function formatCost(amount, currency = "USD") {
@@ -402,16 +430,18 @@ export async function registerCostTools(api) {
402
430
  ? allVms.filter(v => v.powerState?.toLowerCase().includes(input.state))
403
431
  : allVms;
404
432
 
405
- // Rough monthly cost estimates (USD, Pay-As-You-Go, select regions)
433
+ // Rough monthly cost estimates (USD, Pay-As-You-Go)
406
434
  const costs = {
407
- Standard_B2s: 30, Standard_B4ms: 60, Standard_B2ms: 60,
408
- Standard_D2s_v3: 70, Standard_D4s_v3: 140, Standard_D8s_v3: 281,
409
- Standard_D16s_v3: 562, Standard_D32s_v3: 1124,
410
- Standard_D2s_v5: 70, Standard_D4s_v5: 140, Standard_D8s_v5: 281,
411
- Standard_D16s_v5: 562, Standard_D32s_v5: 1124,
412
- Standard_E2s_v3: 92, Standard_E4s_v3: 184, Standard_E8s_v3: 368,
413
- Standard_E16s_v3: 736, Standard_E32s_v3: 1472,
414
- Standard_F2s_v2: 62, Standard_F4s_v2: 124, Standard_F8s_v2: 248,
435
+ // B-series (burstable)
436
+ Standard_B1s: 8, Standard_B2s: 30, Standard_B2ms: 60, Standard_B4ms: 120,
437
+ // D-series (general purpose) — v3/v4/v5 similar pricing
438
+ Standard_D2s_v3: 70, Standard_D4s_v3: 140, Standard_D8s_v3: 281, Standard_D16s_v3: 562, Standard_D32s_v3: 1124,
439
+ Standard_D2s_v5: 70, Standard_D4s_v5: 140, Standard_D8s_v5: 281, Standard_D16s_v5: 562, Standard_D32s_v5: 1124, Standard_D64s_v5: 2249,
440
+ // E-series (memory optimized)
441
+ Standard_E2s_v3: 92, Standard_E4s_v3: 184, Standard_E8s_v3: 368, Standard_E16s_v3: 736, Standard_E32s_v3: 1472,
442
+ Standard_E2s_v5: 92, Standard_E4s_v5: 184, Standard_E8s_v5: 368, Standard_E16s_v5: 736, Standard_E32s_v5: 1472, Standard_E64s_v5: 2621,
443
+ // F-series (compute optimized)
444
+ Standard_F2s_v2: 62, Standard_F4s_v2: 124, Standard_F8s_v2: 248, Standard_F16s_v2: 496, Standard_F32s_v2: 992,
415
445
  };
416
446
 
417
447
  let output = "Azure VMs\n" + "=".repeat(75) + "\n";
@@ -854,16 +854,20 @@ export function fopsUpCmd(publicUrl, { k3s, traefik, dai } = {}) {
854
854
  ].join("; ");
855
855
 
856
856
  const debugPostamble = [
857
- `echo \\\"=== fops up finished at \\$(date -Iseconds) with exit code \\$? ===\\\" >> ${logFile}`,
857
+ `echo \\\"=== fops up finished at \\$(date -Iseconds) with exit code \\$_fops_rc ===\\\" >> ${logFile}`,
858
858
  `echo \\\"--- Container status ---\\\" >> ${logFile}`,
859
859
  `docker compose ps --format 'table {{.Name}}\\t{{.Status}}' >> ${logFile} 2>&1`,
860
860
  `echo \\\"--- Recent docker events ---\\\" >> ${logFile}`,
861
861
  `tail -50 ${eventsLog} >> ${logFile} 2>&1 || true`,
862
+ `exit \\$_fops_rc`,
862
863
  ].join("; ");
863
864
 
865
+ // Fail fast if Docker is not installed
866
+ const dockerGuard = `command -v docker >/dev/null 2>&1 || { echo \\\"ERROR: Docker is not installed — cannot start Foundation\\\" >> ${logFile}; echo \\\"ERROR: Docker is not installed\\\" >&2; exit 1; }`;
867
+
864
868
  // Run from project dir with FOUNDATION_ROOT set explicitly (sudo can reset cwd)
865
869
  const envSetup = `export PATH=/usr/local/bin:/usr/bin:\\$PATH FOUNDATION_ROOT=/opt/foundation-compose`;
866
- return `bash -c "cd /opt/foundation-compose && ${envSetup}; ${debugPreamble}; ${quietPull}; if command -v fops >/dev/null 2>&1; then ${profileEnv}${fopsCmd}; else echo 'fops not found — falling back to docker compose'; ${composeCmd}; fi; ${debugPostamble}"`;
870
+ return `bash -c "cd /opt/foundation-compose && ${envSetup}; ${dockerGuard}; ${debugPreamble}; ${quietPull}; if command -v fops >/dev/null 2>&1; then ${profileEnv}${fopsCmd}; else echo 'fops not found — falling back to docker compose'; ${composeCmd}; fi; _fops_rc=\\$?; ${debugPostamble}"`;
867
871
  }
868
872
 
869
873
  /** Build remote "fops up [component] [branch]" args (same as local fops up). For foreground run on VM. */
@@ -321,6 +321,71 @@ export async function azureTrinoStatus(opts = {}) {
321
321
  console.log("");
322
322
  }
323
323
 
324
+ // ── ping ─────────────────────────────────────────────────────────────────────
325
+
326
/**
 * Check the Foundation backend /api/ping/json health endpoint on a VM.
 *
 * Resolves the VM from local state, waits for SSH, then runs `curl` on the VM
 * against http://localhost:9001/api/ping/json and prints the overall status,
 * tag, and per-check results.
 *
 * @param {object} [opts]
 * @param {string} [opts.vmName] - VM to ping; passed to `requireVmState`.
 * @param {string} [opts.token] - Value for the X-Ping-Token header. Falls back
 *   to the FOPS_PING_TOKEN environment variable; omitted when empty.
 * @returns {Promise<void>}
 */
export async function azurePing(opts = {}) {
  const execa = await lazyExeca();
  const state = requireVmState(opts.vmName);
  const { vmName } = state;
  const ip = state.publicIp;
  const adminUser = DEFAULTS.adminUser;

  if (!ip) {
    console.log(WARN(` VM ${vmName} has no public IP (probably stopped)`));
    return;
  }

  await knockForVm(state);
  const sshOk = await waitForSsh(execa, ip, adminUser, 10000);
  if (!sshOk) {
    console.log(WARN("\n ⚠ SSH not reachable"));
    return;
  }

  // The command string is interpreted by a shell on the VM, so the token must
  // be single-quoted (with embedded quotes escaped) to prevent it from being
  // expanded or from terminating the argument early.
  const shellQuote = (value) => `'${String(value).replace(/'/g, "'\\''")}'`;
  const pingToken = opts.token || process.env.FOPS_PING_TOKEN || "";
  const tokenHeader = pingToken ? `-H ${shellQuote(`X-Ping-Token: ${pingToken}`)}` : "";
  const { stdout } = await sshCmd(execa, ip, adminUser,
    `curl -sf ${tokenHeader} http://localhost:9001/api/ping/json 2>/dev/null || echo '{}'`,
    15000,
  );

  let ping;
  try {
    const parsed = JSON.parse(stdout.trim() || "{}");
    // Valid JSON that is not an object (null, a number, ...) carries no
    // health information; treat it like an empty response.
    ping = parsed !== null && typeof parsed === "object" ? parsed : {};
  } catch {
    console.log(ERR(` Failed to parse ping response: ${stdout}`));
    return;
  }

  banner(`Ping: ${vmName}`);

  if (ping.ok === undefined) {
    console.log(WARN(" No response from backend /api/ping/json"));
    hint("Backend may be down or starting up");
    console.log("");
    return;
  }

  const overall = ping.ok ? OK("✓ healthy") : ERR("✗ unhealthy");
  kvLine("Status", overall);
  if (ping.tag) kvLine("Tag", DIM(ping.tag));

  if (ping.checks) {
    console.log("");
    console.log(ACCENT(" Checks:"));
    for (const [name, check] of Object.entries(ping.checks)) {
      const status = check?.ok ? OK("✓") : ERR("✗");
      const latency = check?.latency_ms !== undefined ? DIM(` (${check.latency_ms}ms)`) : "";
      const err = check?.error ? ERR(` — ${check.error}`) : "";
      console.log(` ${status} ${name}${latency}${err}`);
    }
  }
  console.log("");
}
388
+
324
389
  /**
325
390
  * Run VM diagnostics: show config versions, then run make download and print
326
391
  * full output so image-pull failures (e.g. after config versions change) can be diagnosed.
@@ -1295,6 +1360,41 @@ export async function azureList(opts = {}) {
1295
1360
  }
1296
1361
  } catch { /* az not available or not authenticated */ }
1297
1362
 
1363
+ // Always discover AKS clusters from Azure (tag managed=fops)
1364
+ try {
1365
+ const execa = await lazyExeca();
1366
+ const { writeClusterState } = await import("./azure-aks-state.js");
1367
+ const { stdout, exitCode } = await execa("az", [
1368
+ "aks", "list",
1369
+ "--query", "[?tags.managed=='fops']",
1370
+ "--output", "json",
1371
+ ...subArgs(opts.subscription),
1372
+ ], { timeout: 60000, reject: false });
1373
+ if (exitCode === 0 && stdout?.trim()) {
1374
+ const discovered = JSON.parse(stdout);
1375
+ let added = 0;
1376
+ for (const cl of discovered) {
1377
+ if (aksClusters[cl.name]) continue;
1378
+ writeClusterState(cl.name, {
1379
+ resourceGroup: cl.resourceGroup,
1380
+ location: cl.location,
1381
+ kubernetesVersion: cl.kubernetesVersion,
1382
+ fqdn: cl.fqdn,
1383
+ nodeCount: cl.agentPoolProfiles?.reduce((s, p) => s + (p.count || 0), 0) || 0,
1384
+ nodeVmSize: cl.agentPoolProfiles?.[0]?.vmSize || "unknown",
1385
+ subscriptionId: cl.id?.split("/")[2],
1386
+ });
1387
+ added++;
1388
+ }
1389
+ if (added > 0) {
1390
+ console.log(OK(` ✓ Re-discovered ${added} AKS cluster(s) from Azure`) + DIM(" (tag managed=fops)\n"));
1391
+ fullState = readState();
1392
+ aksClusters = (fullState.azure || {}).clusters || {};
1393
+ hasAks = Object.keys(aksClusters).length > 0;
1394
+ }
1395
+ }
1396
+ } catch { /* az not available or AKS discovery failed */ }
1397
+
1298
1398
  // JSON output mode - early return with structured data
1299
1399
  if (opts.json) {
1300
1400
  const output = {
@@ -1568,10 +1668,9 @@ export async function azureList(opts = {}) {
1568
1668
  const hasPrimary = primaryName && clusterNames.includes(primaryName);
1569
1669
  const prefix = isStandby && hasPrimary ? " └─" : "";
1570
1670
  const dot = active ? OK("●") : DIM("○");
1571
- const displayName = isStandby && hasPrimary
1572
- ? `${cr.name} ${DIM("(HA standby)")}`
1573
- : cr.name;
1574
- const cNameTxt = active ? OK(displayName.padEnd(maxCName + 13)) : LABEL(displayName.padEnd(maxCName + 13));
1671
+ const paddedName = cr.name.padEnd(maxCName);
1672
+ const standbySuffix = isStandby && hasPrimary ? ` ${DIM("(HA standby)")}` : "";
1673
+ const cNameTxt = active ? OK(paddedName) + standbySuffix : LABEL(paddedName) + standbySuffix;
1575
1674
  const loc = (cl?.location || cr.location || "–").padEnd(10);
1576
1675
  const nodes = cr.nodes != null ? `${cr.nodes} x ${cr.sizes || "?"}` : "–";
1577
1676
  const k8s = (cr.kubernetesVersion || "–").padEnd(6);
@@ -1640,12 +1739,19 @@ export function printServiceMatrix(results, nameWidth) {
1640
1739
  const withSvc = results.filter(r => r.services && Object.keys(r.services).length > 0);
1641
1740
  if (withSvc.length === 0) return;
1642
1741
 
1742
// Resolve the display value for one service entry.
// Accepts a plain string or an object carrying `sha` and/or `tag`
// (`sha` wins when both are set). Returns null when nothing is displayable.
// Object values are coerced to strings because the result is padded with
// padEnd and used as a counting key by the surrounding matrix code.
const svcVal = (entry) => {
  if (!entry) return null;
  if (typeof entry === "string") return entry;
  const value = entry.sha || entry.tag;
  return value ? String(value) : null;
};
1748
+
1643
1749
  // Find the majority value per column to highlight drift
1644
1750
  const majority = {};
1645
1751
  for (const svc of SVC_ORDER) {
1646
1752
  const counts = {};
1647
1753
  for (const r of withSvc) {
1648
- const v = r.services?.[svc];
1754
+ const v = svcVal(r.services?.[svc]);
1649
1755
  if (v) counts[v] = (counts[v] || 0) + 1;
1650
1756
  }
1651
1757
  const sorted = Object.entries(counts).sort((a, b) => b[1] - a[1]);
@@ -1660,7 +1766,7 @@ export function printServiceMatrix(results, nameWidth) {
1660
1766
  for (const r of withSvc) {
1661
1767
  const nameTxt = LABEL(r.name.padEnd(nameWidth));
1662
1768
  const cells = SVC_ORDER.map(svc => {
1663
- const v = r.services?.[svc] || "–";
1769
+ const v = svcVal(r.services?.[svc]) || "–";
1664
1770
  const display = v.padEnd(colW);
1665
1771
  if (v === "–") return DIM(display);
1666
1772
  if (v !== majority[svc]) return WARN(display);
@@ -1671,7 +1777,7 @@ export function printServiceMatrix(results, nameWidth) {
1671
1777
 
1672
1778
  // Check for drift
1673
1779
  const hasDrift = SVC_ORDER.some(svc => {
1674
- const vals = withSvc.map(r => r.services?.[svc]).filter(Boolean);
1780
+ const vals = withSvc.map(r => svcVal(r.services?.[svc])).filter(Boolean);
1675
1781
  return new Set(vals).size > 1;
1676
1782
  });
1677
1783
  if (hasDrift) {
@@ -44,19 +44,26 @@ export async function provisionVm(execa, ip, adminUser, { githubToken, branch =
44
44
  "apt-get install -y -qq apt-transport-https ca-certificates curl gnupg lsb-release jq git make unzip zsh software-properties-common python3-venv python3-pip",
45
45
  ].join("\n"), 300000);
46
46
 
47
- await runScript("Installing Docker", [
47
+ const dockerExit = await runScript("Installing Docker", [
48
48
  waitAptLock,
49
49
  "export DEBIAN_FRONTEND=noninteractive",
50
50
  "install -m 0755 -d /etc/apt/keyrings",
51
- "curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg",
51
+ "curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --batch --yes --dearmor -o /etc/apt/keyrings/docker.gpg",
52
52
  "chmod a+r /etc/apt/keyrings/docker.gpg",
53
53
  `echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" > /etc/apt/sources.list.d/docker.list`,
54
- "apt-get update -qq",
55
- "apt-get install -y -qq docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin",
54
+ "set +e",
55
+ "for _ in 1 2 3 4 5; do if apt-get update -qq && apt-get install -y -qq docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin; then break; fi; echo 'Retrying Docker install in 10s…'; sleep 10; done",
56
+ "set -e",
57
+ "command -v docker >/dev/null 2>&1 || (echo 'Docker not found after install attempts' && exit 1)",
56
58
  "systemctl enable docker && systemctl start docker",
57
59
  `usermod -aG docker ${adminUser}`,
58
60
  ].join("\n"), 300000);
59
61
 
62
+ if (dockerExit !== 0) {
63
+ console.log(WARN(" ✗ Docker installation failed — cannot continue provisioning"));
64
+ throw new Error("Docker installation failed");
65
+ }
66
+
60
67
  await runScript("Configuring br_netfilter for k3s DNS", [
61
68
  "modprobe br_netfilter",
62
69
  "echo br_netfilter > /etc/modules-load.d/br_netfilter.conf",
@@ -178,6 +185,8 @@ export async function provisionVm(execa, ip, adminUser, { githubToken, branch =
178
185
  Project dir: /opt/foundation-compose
179
186
 
180
187
  MOTD`,
188
+ `grep -q 'cd /opt/foundation-compose' /home/${adminUser}/.bashrc 2>/dev/null || echo 'cd /opt/foundation-compose' >> /home/${adminUser}/.bashrc`,
189
+ `grep -q 'cd /opt/foundation-compose' /home/${adminUser}/.zshrc 2>/dev/null || echo 'cd /opt/foundation-compose' >> /home/${adminUser}/.zshrc`,
181
190
  ].join("\n"));
182
191
 
183
192
  await ssh("sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/*", 30000);