@meshxdata/fops 0.1.43 → 0.1.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,377 @@
1
+ ## [0.1.45] - 2026-03-12
2
+
3
+ - bump fops to 0.1.44 (8c0ef5d)
4
+ - Mlflow and azure plugin fix (176881f)
5
+ - fix lifecycle (a2cb9e7)
6
+ - callback url for localhost (821fb94)
7
+ - disable 4 scaffolding plugin by default. (bfb2b76)
8
+ - jaccard improvements (b7494a0)
9
+ - refactor azure plugin (68dfef4)
10
+ - refactor azure plugin (b24a008)
11
+ - fix trino catalog missing (4928a55)
12
+ - v36 bump and changelog generation on openai (37a0440)
13
+ - v36 bump and changelog generation on openai (a3b02d9)
14
+ - bump (a990058)
15
+ - status bar fix and new plugin for ttyd (27dde1e)
16
+ - file demo and tray (1a3e704)
17
+ - electron app (59ad0bb)
18
+ - compose and fops file plugin (1cf0e81)
19
+ - bump (346ffc1)
20
+ - localhost replaced by 127.0.0.1 (82b9f30)
21
+ - .29 (587b0e1)
22
+ - improve up down and bootstrap script (b79ebaf)
23
+ - checksum (22c8086)
24
+ - checksum (96b434f)
25
+ - checksum (15ed3c0)
26
+ - checksum (8a6543a)
27
+ - bump embed trino linksg (8440504)
28
+ - bump data (765ffd9)
29
+ - bump (cb8b232)
30
+ - broken tests (c532229)
31
+ - release 0.1.18, preflight checks (d902249)
32
+ - fix compute display bug (d10f5d9)
33
+ - cleanup packer files (6330f18)
34
+ - plan mode (cb36a8a)
35
+ - bump to 0.1.16 - agent ui (41ac1a2)
36
+ - bump to 0.1.15 - agent ui (4ebe2e1)
37
+ - bump to 0.1.14 (6c3a7fa)
38
+ - bump to 0.1.13 (8db570f)
39
+ - release 0.1.12 (c1c79e5)
40
+ - bump (11aa3b0)
41
+ - git keep and bump tui (be1678e)
42
+ - skills, index, rrf, compacted context (100k > 10k) (7b2fffd)
43
+ - cloudflare and token consumption, graphs indexing (0ad9eec)
44
+ - bump storage default (22c83ba)
45
+ - storage fix (68a22a0)
46
+ - skills update (7f56500)
47
+ - v9 bump (3864446)
48
+ - bump (c95eedc)
49
+ - rrf (dbf8c95)
50
+ - feat: warning when running predictions (95e8c52)
51
+ - feat: support for local predictions (45cf26b)
52
+ - feat: wip support for predictions + mlflow (3457052)
53
+ - add Reciprocal Rank Fusion (RRF) to knowledge and skill retrieval (61549bc)
54
+ - validate CSV headers in compute_run readiness check (a8c7a43)
55
+ - fix corrupted Iceberg metadata: probe tables + force cleanup on re-apply (50578af)
56
+ - enforce: never use foundation_apply to fix broken products (2e049bf)
57
+ - update SKILL.md with complete tool reference for knowledge retrieval (30b1924)
58
+ - add storage read, input DP table probe, and compute_run improvements (34e6c4c)
59
+ - skills update (1220385)
60
+ - skills update (bb66958)
61
+ - some tui improvement andd tools apply overwrite (e90c35c)
62
+ - skills update (e9227a1)
63
+ - skills update (669c4b3)
64
+ - fix plugin pre-flight checks (f741743)
65
+ - increase agent context (6479aaa)
66
+ - skills and init sql fixes (5fce35e)
67
+ - checksum (3518b56)
68
+ - penging job limit (a139861)
69
+ - checksum (575d28c)
70
+ - bump (92049ba)
71
+ - fix bug per tab status (0a33657)
72
+ - fix bug per tab status (50457c6)
73
+ - checksumming (0ad842e)
74
+ - shot af mardkwon overlapping (51f63b9)
75
+ - add spark dockerfile for multiarch builds (95abbd1)
76
+ - fix plugin initialization (16b9782)
77
+ - split index.js (50902a2)
78
+ - cloudflare cidr (cc4e021)
79
+ - cloduflare restrictions (2f6ba2d)
80
+ - sequential start (86b496e)
81
+ - sequential start (4930fe1)
82
+ - sequential start (353f014)
83
+ - qa tests (2dc6a1a)
84
+ - bump sha for .85 (dc2edfe)
85
+ - preserve env on sudo (7831227)
86
+ - bump sha for .84 (6c052f9)
87
+ - non interactive for azure vms (0aa8a2f)
88
+ - keep .env if present (d072450)
89
+ - bump (7a8e732)
90
+ - ensure opa is on compose if not set (f4a5228)
91
+ - checksum bump (a2ccc20)
92
+ - netrc defensive checks (a0b0ccc)
93
+ - netrc defensive checks (ae37403)
94
+ - checksum (ec45d11)
95
+ - update sync and fix up (7f9af72)
96
+ - expand test for azure and add new per app tag support (388a168)
97
+ - checksum on update (44005fc)
98
+ - cleanup for later (15e5313)
99
+ - cleanup for later (11c9597)
100
+ - switch branch feature (822fecc)
101
+ - add pull (d1c19ab)
102
+ - Bump hono from 4.11.9 to 4.12.0 in /operator-cli (ad25144)
103
+ - tests (f180a9a)
104
+ - cleanup (39c49a3)
105
+ - registry (7b7126a)
106
+ - reconcile kafka (832d0db)
107
+ - gh login bug (025886c)
108
+ - cleanup (bb96cab)
109
+ - strip envs from process (2421180)
110
+ - force use of gh creds not tokens in envs var (fff7787)
111
+ - resolve import between npm installs and npm link (79522e1)
112
+ - fix gh scope and azure states (afd846c)
113
+ - refactoring (da50352)
114
+ - split fops repo (d447638)
115
+ - aks (b791f8f)
116
+ - refactor azure (67d3bad)
117
+ - wildcard (391f023)
118
+ - azure plugin (c074074)
119
+ - zap (d7e6e7f)
120
+ - fix knock (cf89c05)
121
+ - azure (4adec98)
122
+ - Bump tar from 7.5.7 to 7.5.9 in /operator-cli (e41e98e)
123
+ - azure stack index.js split (de12272)
124
+ - Bump ajv from 8.17.1 to 8.18.0 in /operator-cli (76da21f)
125
+ - packer (9665fbc)
126
+ - remove stack api (db0fd4d)
127
+ - packer cleanup (fe1bf14)
128
+ - force refresh token (3a3d7e2)
129
+ - provision shell (2ad505f)
130
+ - azure vm management (91dcb31)
131
+ - azure specific (2b0cca8)
132
+ - azure packer (12175b8)
133
+ - init hashed pwd (db8523c)
134
+ - packer (5b5c7c4)
135
+ - doctor for azure vm (ed524fa)
136
+ - packer and 1pwd (c6d053e)
137
+ - split big index.js (dc85a1b)
138
+ - kafka volume update (21815ec)
139
+ - fix openai azure tools confirmation and flow (0118cd1)
140
+ - nighly fixx, test fix (5e0d04f)
141
+ - open ai training (cdc494a)
142
+ - openai integration in azure (1ca1475)
143
+ - ci (672cea9)
144
+ - refresh ghcr creds (4220c48)
145
+ - cleaned up version (1a0074f)
146
+ - traefik on ghcr and templates (8e31a05)
147
+ - apply fcl (e78911f)
148
+ - demo landscape (dd205fe)
149
+ - smarter login and schema (1af514f)
150
+ - no down before up unless something broke (56b1132)
151
+ - dai, reconcile failed containers (12907fa)
152
+ - reconcile dead container (7da75e4)
153
+ - defensive around storage buckets dir (b98871d)
154
+ - defensive around storage buckets dir (e86e132)
155
+ - gear in for multiarch (bf3fa3e)
156
+ - up autofix (99c7f89)
157
+ - autofix stale containers on up (43c7d0f)
158
+ - shared sessions fix (5de1359)
159
+ - share sessions between ui and tui (8321391)
160
+ - fix chat view display details (e263996)
161
+ - fix chat view display details (9babdda)
162
+ - tui up fixes (86e9f17)
163
+ - fix commands init (442538b)
164
+ - enable k3s profile (b2dcfc8)
165
+ - test up till job creation (656d388)
166
+ - tui fixes (0599779)
167
+ - cleanup (27731f0)
168
+ - train (90bf559)
169
+ - training (f809bf6)
170
+ - training (ba2b836)
171
+ - training (6fc5267)
172
+ - training (4af8ac9)
173
+ - fix build script (bd82836)
174
+ - infra test (5b79815)
175
+ - infra test (3a0ac05)
176
+ - infra test (e5c67b5)
177
+ - tests (ae7b621)
178
+ - tests (c09ae6a)
179
+ - update tui (4784153)
180
+ - training (0a5a330)
181
+ - tui (df4dd4a)
182
+ - pkg builds (4dc9993)
183
+ - also source env for creds (9a17d8f)
184
+ - fcl support (e8a5743)
185
+
186
+ # Changelog
187
+
188
+ All notable changes to @meshxdata/fops (Foundation Operator CLI) are documented here.
189
+
190
+ ## [0.1.44] - 2026-03-11
191
+
192
+ - Mlflow and azure plugin fix (176881f)
193
+ - fix lifecycle (a2cb9e7)
194
+ - callback url for localhost (821fb94)
195
+ - disable 4 scaffolding plugin by default. (bfb2b76)
196
+ - jaccard improvements (b7494a0)
197
+ - refactor azure plugin (68dfef4)
198
+ - refactor azure plugin (b24a008)
199
+ - fix trino catalog missing (4928a55)
200
+ - v36 bump and changelog generation on openai (37a0440)
201
+ - v36 bump and changelog generation on openai (a3b02d9)
202
+ - bump (a990058)
203
+ - status bar fix and new plugin for ttyd (27dde1e)
204
+ - file demo and tray (1a3e704)
205
+ - electron app (59ad0bb)
206
+ - compose and fops file plugin (1cf0e81)
207
+ - bump (346ffc1)
208
+ - localhost replaced by 127.0.0.1 (82b9f30)
209
+ - .29 (587b0e1)
210
+ - improve up down and bootstrap script (b79ebaf)
211
+ - checksum (22c8086)
212
+ - checksum (96b434f)
213
+ - checksum (15ed3c0)
214
+ - checksum (8a6543a)
215
+ - bump embed trino linksg (8440504)
216
+ - bump data (765ffd9)
217
+ - bump (cb8b232)
218
+ - broken tests (c532229)
219
+ - release 0.1.18, preflight checks (d902249)
220
+ - fix compute display bug (d10f5d9)
221
+ - cleanup packer files (6330f18)
222
+ - plan mode (cb36a8a)
223
+ - bump to 0.1.16 - agent ui (41ac1a2)
224
+ - bump to 0.1.15 - agent ui (4ebe2e1)
225
+ - bump to 0.1.14 (6c3a7fa)
226
+ - bump to 0.1.13 (8db570f)
227
+ - release 0.1.12 (c1c79e5)
228
+ - bump (11aa3b0)
229
+ - git keep and bump tui (be1678e)
230
+ - skills, index, rrf, compacted context (100k > 10k) (7b2fffd)
231
+ - cloudflare and token consumption, graphs indexing (0ad9eec)
232
+ - bump storage default (22c83ba)
233
+ - storage fix (68a22a0)
234
+ - skills update (7f56500)
235
+ - v9 bump (3864446)
236
+ - bump (c95eedc)
237
+ - rrf (dbf8c95)
238
+ - feat: warning when running predictions (95e8c52)
239
+ - feat: support for local predictions (45cf26b)
240
+ - feat: wip support for predictions + mlflow (3457052)
241
+ - add Reciprocal Rank Fusion (RRF) to knowledge and skill retrieval (61549bc)
242
+ - validate CSV headers in compute_run readiness check (a8c7a43)
243
+ - fix corrupted Iceberg metadata: probe tables + force cleanup on re-apply (50578af)
244
+ - enforce: never use foundation_apply to fix broken products (2e049bf)
245
+ - update SKILL.md with complete tool reference for knowledge retrieval (30b1924)
246
+ - add storage read, input DP table probe, and compute_run improvements (34e6c4c)
247
+ - skills update (1220385)
248
+ - skills update (bb66958)
249
+ - some tui improvement andd tools apply overwrite (e90c35c)
250
+ - skills update (e9227a1)
251
+ - skills update (669c4b3)
252
+ - fix plugin pre-flight checks (f741743)
253
+ - increase agent context (6479aaa)
254
+ - skills and init sql fixes (5fce35e)
255
+ - checksum (3518b56)
256
+ - penging job limit (a139861)
257
+ - checksum (575d28c)
258
+ - bump (92049ba)
259
+ - fix bug per tab status (0a33657)
260
+ - fix bug per tab status (50457c6)
261
+ - checksumming (0ad842e)
262
+ - shot af mardkwon overlapping (51f63b9)
263
+ - add spark dockerfile for multiarch builds (95abbd1)
264
+ - fix plugin initialization (16b9782)
265
+ - split index.js (50902a2)
266
+ - cloudflare cidr (cc4e021)
267
+ - cloduflare restrictions (2f6ba2d)
268
+ - sequential start (86b496e)
269
+ - sequential start (4930fe1)
270
+ - sequential start (353f014)
271
+ - qa tests (2dc6a1a)
272
+ - bump sha for .85 (dc2edfe)
273
+ - preserve env on sudo (7831227)
274
+ - bump sha for .84 (6c052f9)
275
+ - non interactive for azure vms (0aa8a2f)
276
+ - keep .env if present (d072450)
277
+ - bump (7a8e732)
278
+ - ensure opa is on compose if not set (f4a5228)
279
+ - checksum bump (a2ccc20)
280
+ - netrc defensive checks (a0b0ccc)
281
+ - netrc defensive checks (ae37403)
282
+ - checksum (ec45d11)
283
+ - update sync and fix up (7f9af72)
284
+ - expand test for azure and add new per app tag support (388a168)
285
+ - checksum on update (44005fc)
286
+ - cleanup for later (15e5313)
287
+ - cleanup for later (11c9597)
288
+ - switch branch feature (822fecc)
289
+ - add pull (d1c19ab)
290
+ - Bump hono from 4.11.9 to 4.12.0 in /operator-cli (ad25144)
291
+ - tests (f180a9a)
292
+ - cleanup (39c49a3)
293
+ - registry (7b7126a)
294
+ - reconcile kafka (832d0db)
295
+ - gh login bug (025886c)
296
+ - cleanup (bb96cab)
297
+ - strip envs from process (2421180)
298
+ - force use of gh creds not tokens in envs var (fff7787)
299
+ - resolve import between npm installs and npm link (79522e1)
300
+ - fix gh scope and azure states (afd846c)
301
+ - refactoring (da50352)
302
+ - split fops repo (d447638)
303
+ - aks (b791f8f)
304
+ - refactor azure (67d3bad)
305
+ - wildcard (391f023)
306
+ - azure plugin (c074074)
307
+ - zap (d7e6e7f)
308
+ - fix knock (cf89c05)
309
+ - azure (4adec98)
310
+ - Bump tar from 7.5.7 to 7.5.9 in /operator-cli (e41e98e)
311
+ - azure stack index.js split (de12272)
312
+ - Bump ajv from 8.17.1 to 8.18.0 in /operator-cli (76da21f)
313
+ - packer (9665fbc)
314
+ - remove stack api (db0fd4d)
315
+ - packer cleanup (fe1bf14)
316
+ - force refresh token (3a3d7e2)
317
+ - provision shell (2ad505f)
318
+ - azure vm management (91dcb31)
319
+ - azure specific (2b0cca8)
320
+ - azure packer (12175b8)
321
+ - init hashed pwd (db8523c)
322
+ - packer (5b5c7c4)
323
+ - doctor for azure vm (ed524fa)
324
+ - packer and 1pwd (c6d053e)
325
+ - split big index.js (dc85a1b)
326
+ - kafka volume update (21815ec)
327
+ - fix openai azure tools confirmation and flow (0118cd1)
328
+ - nighly fixx, test fix (5e0d04f)
329
+ - open ai training (cdc494a)
330
+ - openai integration in azure (1ca1475)
331
+ - ci (672cea9)
332
+ - refresh ghcr creds (4220c48)
333
+ - cleaned up version (1a0074f)
334
+ - traefik on ghcr and templates (8e31a05)
335
+ - apply fcl (e78911f)
336
+ - demo landscape (dd205fe)
337
+ - smarter login and schema (1af514f)
338
+ - no down before up unless something broke (56b1132)
339
+ - dai, reconcile failed containers (12907fa)
340
+ - reconcile dead container (7da75e4)
341
+ - defensive around storage buckets dir (b98871d)
342
+ - defensive around storage buckets dir (e86e132)
343
+ - gear in for multiarch (bf3fa3e)
344
+ - up autofix (99c7f89)
345
+ - autofix stale containers on up (43c7d0f)
346
+ - shared sessions fix (5de1359)
347
+ - share sessions between ui and tui (8321391)
348
+ - fix chat view display details (e263996)
349
+ - fix chat view display details (9babdda)
350
+ - tui up fixes (86e9f17)
351
+ - fix commands init (442538b)
352
+ - enable k3s profile (b2dcfc8)
353
+ - test up till job creation (656d388)
354
+ - tui fixes (0599779)
355
+ - cleanup (27731f0)
356
+ - train (90bf559)
357
+ - training (f809bf6)
358
+ - training (ba2b836)
359
+ - training (6fc5267)
360
+ - training (4af8ac9)
361
+ - fix build script (bd82836)
362
+ - infra test (5b79815)
363
+ - infra test (3a0ac05)
364
+ - infra test (e5c67b5)
365
+ - tests (ae7b621)
366
+ - tests (c09ae6a)
367
+ - update tui (4784153)
368
+ - training (0a5a330)
369
+ - tui (df4dd4a)
370
+ - pkg builds (4dc9993)
371
+ - also source env for creds (9a17d8f)
372
+ - fcl support (e8a5743)
373
+ - fcl support (8d6b6cd)
374
+
1
375
  ## [0.1.43] - 2026-03-11
2
376
 
3
377
  - Mlflow and azure plugin fix (176881f)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@meshxdata/fops",
3
- "version": "0.1.43",
3
+ "version": "0.1.45",
4
4
  "description": "CLI to install and manage data mesh platforms",
5
5
  "keywords": [
6
6
  "fops",
@@ -159,6 +159,26 @@ export function registerLifecycleCommands(program, registry) {
159
159
  await dockerCompose(root, args);
160
160
  });
161
161
 
162
+ program
163
+ .command("restart")
164
+ .description("Restart Foundation services (all or specific)")
165
+ .argument("[services...]", "Services to restart (e.g. backend, watcher)")
166
+ .action(async (services) => {
167
+ const root = requireRoot(program);
168
+ const serviceNames = (services || []).filter(Boolean).flatMap((s) => {
169
+ const sub = COMPONENT_SUBMODULES[s];
170
+ if (sub) return sub.restart;
171
+ return [resolveServiceName(root, s)];
172
+ });
173
+ if (serviceNames.length) {
174
+ console.log(chalk.cyan(` Restarting ${serviceNames.join(", ")}...`));
175
+ await dockerCompose(root, ["restart", ...serviceNames]);
176
+ } else {
177
+ console.log(chalk.cyan(" Restarting all services..."));
178
+ await dockerCompose(root, ["restart"]);
179
+ }
180
+ });
181
+
162
182
  program
163
183
  .command("bake")
164
184
  .description("Build and push multi-arch base container images (node, etc.)")
@@ -0,0 +1,344 @@
1
+ /**
2
+ * Grafana datasource federation for Azure VMs via SSH tunnels.
3
+ *
4
+ * Each VM exposes Prometheus on port 9091 and Loki on port 3100 (host-bound
5
+ * in docker-compose). These are NOT routed through Traefik (which has Auth0
6
+ * SSO), so we use SSH port-forwards instead.
7
+ *
8
+ * Flow:
9
+ * 1. fops azure grafana tunnel — opens SSH tunnels for all VMs, blocks
10
+ * 2. fops azure grafana sync — registers localhost:<port> datasources
11
+ * in Grafana for each tunnelled VM
12
+ * 3. fops azure up/down — auto-registers/deregisters datasources
13
+ * if tunnel ports are already assigned
14
+ *
15
+ * Config in ~/.fops.json:
16
+ * grafana: { url, token, tunnelPorts: { "<vmName>": { prometheus, loki } } }
17
+ */
18
+ import net from "node:net";
19
+ import { spawn } from "node:child_process";
20
+ import chalk from "chalk";
21
+ import { readState, saveState } from "./azure-state.js";
22
+ import { DEFAULTS, MUX_OPTS } from "./azure-helpers.js";
23
+
24
+ const DIM = (s) => chalk.dim(s);
25
+ const OK = (s) => chalk.green(s);
26
+ const WARN = (s) => chalk.yellow(s);
27
+ const ERR = (s) => chalk.red(s);
28
+
29
+ // Remote ports as bound by docker-compose
30
+ const REMOTE_PROMETHEUS = 9091;
31
+ const REMOTE_LOKI = 3100;
32
+
33
+ // ── Config helpers ────────────────────────────────────────────────────────────
34
+
35
+ export function readGrafanaConfig() {
36
+ return readState().grafana || null;
37
+ }
38
+
39
+ export function writeGrafanaConfig(patch) {
40
+ const state = readState();
41
+ state.grafana = { ...state.grafana, ...patch };
42
+ saveState(state);
43
+ }
44
+
45
+ /** Get stored tunnel port pair for a VM, or null if not yet assigned. */
46
+ export function readTunnelPorts(vmName) {
47
+ return readState().grafana?.tunnelPorts?.[vmName] || null;
48
+ }
49
+
50
+ /** Persist tunnel port pair for a VM. */
51
+ export function writeTunnelPorts(vmName, ports) {
52
+ const state = readState();
53
+ if (!state.grafana) state.grafana = {};
54
+ if (!state.grafana.tunnelPorts) state.grafana.tunnelPorts = {};
55
+ state.grafana.tunnelPorts[vmName] = ports;
56
+ saveState(state);
57
+ }
58
+
59
+ /** Remove tunnel port assignment for a VM. */
60
+ export function clearTunnelPorts(vmName) {
61
+ const state = readState();
62
+ if (state.grafana?.tunnelPorts?.[vmName]) {
63
+ delete state.grafana.tunnelPorts[vmName];
64
+ saveState(state);
65
+ }
66
+ }
67
+
68
+ // ── Free port finder ──────────────────────────────────────────────────────────
69
+
70
+ function findFreePort() {
71
+ return new Promise((resolve, reject) => {
72
+ const s = net.createServer();
73
+ s.listen(0, "127.0.0.1", () => {
74
+ const port = s.address().port;
75
+ s.close(() => resolve(port));
76
+ });
77
+ s.on("error", reject);
78
+ });
79
+ }
80
+
81
+ async function assignPorts(vmName) {
82
+ const existing = readTunnelPorts(vmName);
83
+ if (existing) return existing;
84
+ const prometheus = await findFreePort();
85
+ const loki = await findFreePort();
86
+ const ports = { prometheus, loki };
87
+ writeTunnelPorts(vmName, ports);
88
+ return ports;
89
+ }
90
+
91
+ // ── SSH tunnel management ─────────────────────────────────────────────────────
92
+
93
+ /**
94
+ * Open SSH port-forwards for a single VM (non-blocking, returns child process).
95
+ * Forwards: localPrometheus → VM:9091, localLoki → VM:3100
96
+ */
97
+ function spawnTunnel(ip, user, ports) {
98
+ const args = [
99
+ ...MUX_OPTS(ip, user),
100
+ "-N",
101
+ "-L", `${ports.prometheus}:localhost:${REMOTE_PROMETHEUS}`,
102
+ "-L", `${ports.loki}:localhost:${REMOTE_LOKI}`,
103
+ `${user}@${ip}`,
104
+ ];
105
+ return spawn("ssh", args, { stdio: "ignore", detached: false });
106
+ }
107
+
108
+ /**
109
+ * Open SSH tunnels for all given VMs. Blocks until Ctrl+C.
110
+ * Port assignments are stored in ~/.fops.json so `grafana sync` can use them.
111
+ */
112
+ export async function openGrafanaTunnels(vms, { knockForVm }) {
113
+ const cfg = readGrafanaConfig();
114
+ const user = DEFAULTS.adminUser;
115
+ const entries = Object.entries(vms).filter(([, s]) => s?.publicIp);
116
+
117
+ if (entries.length === 0) {
118
+ console.log(WARN(" No running VMs found (need publicIp in state)."));
119
+ return;
120
+ }
121
+
122
+ console.log(chalk.cyan(`\n Opening Grafana tunnels for ${entries.length} VM(s)…\n`));
123
+
124
+ const children = [];
125
+
126
+ for (const [vmName, state] of entries) {
127
+ const ip = state.publicIp;
128
+ const ports = await assignPorts(vmName);
129
+
130
+ if (state.knockSequence?.length) await knockForVm(state);
131
+
132
+ const child = spawnTunnel(ip, user, ports);
133
+ children.push({ vmName, child, ports });
134
+
135
+ console.log(` ${OK("✓")} ${chalk.bold(vmName)}`);
136
+ console.log(` ${DIM(`prometheus → localhost:${ports.prometheus}`)}`);
137
+ console.log(` ${DIM(`loki → localhost:${ports.loki}`)}`);
138
+ }
139
+
140
+ console.log(chalk.cyan("\n Tunnels open. Run in another terminal:"));
141
+ console.log(DIM(" fops azure grafana sync\n"));
142
+ console.log(DIM(" Press Ctrl+C to close.\n"));
143
+
144
+ await new Promise((resolve) => {
145
+ const cleanup = () => {
146
+ for (const { child } of children) {
147
+ try { child.kill(); } catch {}
148
+ }
149
+ resolve();
150
+ };
151
+ process.on("SIGINT", cleanup);
152
+ process.on("SIGTERM", cleanup);
153
+ });
154
+ }
155
+
156
+ // ── Grafana HTTP API helpers ──────────────────────────────────────────────────
157
+
158
+ async function grafanaFetch(cfg, path, method = "GET", body = undefined) {
159
+ const url = `${cfg.url.replace(/\/$/, "")}${path}`;
160
+ const res = await fetch(url, {
161
+ method,
162
+ headers: {
163
+ "Content-Type": "application/json",
164
+ "Authorization": `Bearer ${cfg.token}`,
165
+ },
166
+ body: body ? JSON.stringify(body) : undefined,
167
+ });
168
+ const text = await res.text();
169
+ let json;
170
+ try { json = JSON.parse(text); } catch { json = { message: text }; }
171
+ return { ok: res.ok, status: res.status, json };
172
+ }
173
+
174
+ async function listDatasources(cfg) {
175
+ const { ok, json } = await grafanaFetch(cfg, "/api/datasources");
176
+ if (!ok) throw new Error(`Failed to list datasources: ${JSON.stringify(json)}`);
177
+ return json;
178
+ }
179
+
180
+ async function upsertDatasource(cfg, ds) {
181
+ const all = await listDatasources(cfg);
182
+ const existing = all.find((d) => d.name === ds.name);
183
+ if (existing) {
184
+ const { ok, json } = await grafanaFetch(cfg, `/api/datasources/${existing.id}`, "PUT", { ...ds, id: existing.id });
185
+ if (!ok) throw new Error(`Failed to update "${ds.name}": ${JSON.stringify(json)}`);
186
+ return { action: "updated", id: existing.id };
187
+ }
188
+ const { ok, json } = await grafanaFetch(cfg, "/api/datasources", "POST", ds);
189
+ if (!ok) throw new Error(`Failed to create "${ds.name}": ${JSON.stringify(json)}`);
190
+ return { action: "created", id: json.datasource?.id ?? json.id };
191
+ }
192
+
193
+ async function deleteDatasourceByName(cfg, name) {
194
+ const all = await listDatasources(cfg);
195
+ const existing = all.find((d) => d.name === name);
196
+ if (!existing) return false;
197
+ const { ok, json } = await grafanaFetch(cfg, `/api/datasources/${existing.id}`, "DELETE");
198
+ if (!ok) throw new Error(`Failed to delete "${name}": ${JSON.stringify(json)}`);
199
+ return true;
200
+ }
201
+
202
+ // ── Datasource definitions ────────────────────────────────────────────────────
203
+
204
+ function prometheusDs(vmName, localPort) {
205
+ return {
206
+ name: `${vmName}-prometheus`,
207
+ type: "prometheus",
208
+ url: `http://localhost:${localPort}`,
209
+ access: "proxy",
210
+ isDefault: false,
211
+ jsonData: { timeInterval: "15s" },
212
+ };
213
+ }
214
+
215
+ function lokiDs(vmName, localPort) {
216
+ return {
217
+ name: `${vmName}-loki`,
218
+ type: "loki",
219
+ url: `http://localhost:${localPort}`,
220
+ access: "proxy",
221
+ isDefault: false,
222
+ jsonData: {},
223
+ };
224
+ }
225
+
226
+ // ── Public API ────────────────────────────────────────────────────────────────
227
+
228
+ /**
229
+ * Register Prometheus + Loki datasources for a VM using its stored tunnel ports.
230
+ * No-ops if Grafana isn't configured or tunnel ports haven't been assigned yet.
231
+ */
232
+ export async function registerVmDatasources(vmName) {
233
+ const cfg = readGrafanaConfig();
234
+ if (!cfg?.url || !cfg?.token) return;
235
+
236
+ const ports = readTunnelPorts(vmName);
237
+ if (!ports) return; // tunnel not yet set up — user runs `grafana tunnel` first
238
+
239
+ try {
240
+ const pResult = await upsertDatasource(cfg, prometheusDs(vmName, ports.prometheus));
241
+ const lResult = await upsertDatasource(cfg, lokiDs(vmName, ports.loki));
242
+ console.log(OK(` ✓ Grafana datasources ${pResult.action}: ${vmName}-prometheus, ${vmName}-loki`));
243
+ } catch (err) {
244
+ console.log(WARN(` ⚠ Grafana datasource registration failed: ${err.message}`));
245
+ }
246
+ }
247
+
248
+ /**
249
+ * Remove Prometheus + Loki datasources and clear stored tunnel ports for a VM.
250
+ * No-ops if Grafana isn't configured.
251
+ */
252
+ export async function deregisterVmDatasources(vmName) {
253
+ const cfg = readGrafanaConfig();
254
+ if (!cfg?.url || !cfg?.token) {
255
+ clearTunnelPorts(vmName);
256
+ return;
257
+ }
258
+
259
+ try {
260
+ const pd = await deleteDatasourceByName(cfg, `${vmName}-prometheus`);
261
+ const ld = await deleteDatasourceByName(cfg, `${vmName}-loki`);
262
+ if (pd || ld) console.log(OK(` ✓ Grafana datasources removed: ${vmName}-prometheus, ${vmName}-loki`));
263
+ } catch (err) {
264
+ console.log(WARN(` ⚠ Grafana datasource removal failed: ${err.message}`));
265
+ }
266
+
267
+ clearTunnelPorts(vmName);
268
+ }
269
+
270
+ /**
271
+ * Reconcile Grafana datasources for all tracked VMs that have tunnel ports assigned.
272
+ * Removes datasources for VMs that are no longer tracked.
273
+ */
274
+ export async function syncGrafanaDatasources(vms) {
275
+ const cfg = readGrafanaConfig();
276
+ if (!cfg?.url || !cfg?.token) {
277
+ console.log(WARN(" No Grafana configured. Run: fops azure grafana configure <url> <token>"));
278
+ return;
279
+ }
280
+
281
+ console.log(chalk.cyan(`\n Syncing Grafana datasources → ${cfg.url}\n`));
282
+
283
+ let synced = 0;
284
+ for (const [vmName] of Object.entries(vms)) {
285
+ const ports = readTunnelPorts(vmName);
286
+ if (!ports) {
287
+ console.log(` ${DIM(`⊘ ${vmName}: no tunnel ports — run: fops azure grafana tunnel`)}`);
288
+ continue;
289
+ }
290
+ try {
291
+ const pResult = await upsertDatasource(cfg, prometheusDs(vmName, ports.prometheus));
292
+ const lResult = await upsertDatasource(cfg, lokiDs(vmName, ports.loki));
293
+ console.log(` ${OK("✓")} ${vmName}: ${DIM(`prometheus :${ports.prometheus} (${pResult.action}), loki :${ports.loki} (${lResult.action})`)}`);
294
+ synced++;
295
+ } catch (err) {
296
+ console.log(` ${WARN("⚠")} ${vmName}: ${err.message}`);
297
+ }
298
+ }
299
+
300
+ // Remove stale datasources for VMs no longer tracked
301
+ try {
302
+ const all = await listDatasources(cfg);
303
+ const trackedNames = new Set(Object.keys(vms));
304
+ for (const ds of all) {
305
+ const match = ds.name.match(/^(.+)-(prometheus|loki)$/);
306
+ if (!match) continue;
307
+ const [, dsVmName] = match;
308
+ if (!trackedNames.has(dsVmName)) {
309
+ await deleteDatasourceByName(cfg, ds.name);
310
+ console.log(` ${DIM(`✗ Removed stale: ${ds.name}`)}`);
311
+ }
312
+ }
313
+ } catch (err) {
314
+ console.log(WARN(` ⚠ Stale datasource cleanup failed: ${err.message}`));
315
+ }
316
+
317
+ if (synced === 0) {
318
+ console.log(chalk.cyan("\n No tunnels active. Start them first:"));
319
+ console.log(DIM(" fops azure grafana tunnel\n"));
320
+ } else {
321
+ console.log();
322
+ }
323
+ }
324
+
325
+ /** Test connectivity to the configured Grafana instance. */
326
+ export async function testGrafanaConnection() {
327
+ const cfg = readGrafanaConfig();
328
+ if (!cfg?.url || !cfg?.token) {
329
+ console.log(ERR(" No Grafana configured. Run: fops azure grafana configure <url> <token>"));
330
+ return false;
331
+ }
332
+ try {
333
+ const { ok, json } = await grafanaFetch(cfg, "/api/org");
334
+ if (ok) {
335
+ console.log(OK(` ✓ Connected to Grafana: ${json.name || cfg.url}`));
336
+ return true;
337
+ }
338
+ console.log(ERR(` ✗ Auth failed (${json.message || "check token"})`));
339
+ return false;
340
+ } catch (err) {
341
+ console.log(ERR(` ✗ Cannot reach Grafana at ${cfg.url}: ${err.message}`));
342
+ return false;
343
+ }
344
+ }
@@ -825,7 +825,7 @@ export async function ensureOpenAiNetworkAccess(execa, publicIp, sub) {
825
825
  // ── Remote fops up command builder ──────────────────────────────────────────
826
826
 
827
827
  export function fopsUpCmd(publicUrl, { k3s, traefik, dai } = {}) {
828
- const profiles = ["k3s", "traefik"];
828
+ const profiles = ["k3s", "traefik", "loki"];
829
829
  if (dai) profiles.push("dai");
830
830
 
831
831
  const profileEnv = `COMPOSE_PROFILES=${profiles.join(",")} `;
@@ -414,6 +414,7 @@ export async function azureSsh(opts = {}) {
414
414
  const ip = freshState?.publicIp;
415
415
  if (!ip) { console.error(chalk.red("\n No IP address. Is the VM running? Try: fops azure start\n")); process.exit(1); }
416
416
 
417
+ const sshUser = opts.user || DEFAULTS.adminUser;
417
418
  await knockForVm(freshState);
418
419
 
419
420
  const { execa } = await import("execa");
@@ -421,7 +422,7 @@ export async function azureSsh(opts = {}) {
421
422
  "-o", "StrictHostKeyChecking=no",
422
423
  "-o", "UserKnownHostsFile=/dev/null",
423
424
  "-o", "ConnectTimeout=15",
424
- `${DEFAULTS.adminUser}@${ip}`,
425
+ `${sshUser}@${ip}`,
425
426
  ], { stdio: "inherit", reject: false });
426
427
  if (result.exitCode !== 0 && result.exitCode !== null) {
427
428
  console.error(chalk.red(`\n SSH failed (exit ${result.exitCode}). If port-knock is enabled, try: fops azure knock open ${freshState?.vmName}\n`));
@@ -858,7 +859,7 @@ export async function azureDeploy(opts = {}) {
858
859
 
859
860
  hint(`Pulling latest (branch: ${branch})…`);
860
861
  const { stdout: pullOut, exitCode: pullCode } = await ssh(
861
- `cd /opt/foundation-compose && git fetch origin && git checkout ${branch} && git pull origin ${branch}`
862
+ `cd /opt/foundation-compose && cp -f .env .env.local-backup 2>/dev/null; git checkout -- .env 2>/dev/null; git fetch origin && git checkout ${branch} && git pull origin ${branch}; cp -f .env.local-backup .env 2>/dev/null; true`
862
863
  );
863
864
  if (pullCode !== 0) {
864
865
  console.log(WARN(" ⚠ git pull had issues — continuing anyway"));
@@ -991,12 +992,15 @@ export async function azureRunUp(opts = {}) {
991
992
  console.log(WARN(" .env sanitized (invalid lines removed)"));
992
993
  }
993
994
 
994
- // Git pull
995
+ // Git pull — preserve .env (always has local overrides) to avoid merge conflicts
995
996
  const gitScript = [
996
997
  "cd /opt/foundation-compose",
998
+ "cp -f .env .env.local-backup 2>/dev/null || true",
999
+ "git checkout -- .env 2>/dev/null || true",
997
1000
  "git stash --include-untracked 2>&1 || true",
998
1001
  "( git pull --ff-only 2>&1 || ( git fetch origin 2>&1 && git reset --hard \"origin/$(git branch --show-current)\" 2>&1 ) )",
999
1002
  "git stash pop 2>&1 || echo 'nothing to pop'",
1003
+ "cp -f .env.local-backup .env 2>/dev/null || true",
1000
1004
  "git submodule update --init --recursive 2>&1",
1001
1005
  ].join(" && ");
1002
1006
  const { exitCode: gitCode, stdout: gitOut, stderr: gitErr } = await ssh(gitScript, 120000);
@@ -1856,12 +1860,15 @@ export async function azureUpdate(opts = {}) {
1856
1860
  ).catch(() => {});
1857
1861
  }
1858
1862
 
1859
- // Stash local changes, pull latest (or fetch+reset if non–fast-forward), then stash pop and submodules
1863
+ // Preserve .env (local overrides), pull latest (or fetch+reset if non–fast-forward), restore .env
1860
1864
  const gitScript = [
1861
1865
  "cd /opt/foundation-compose",
1866
+ "cp -f .env .env.local-backup 2>/dev/null || true",
1867
+ "git checkout -- .env 2>/dev/null || true",
1862
1868
  "git stash --include-untracked 2>&1 || true",
1863
1869
  "( git pull --ff-only 2>&1 || ( git fetch origin 2>&1 && git reset --hard \"origin/$(git branch --show-current)\" 2>&1 ) )",
1864
1870
  "git stash pop 2>&1 || echo 'nothing to pop'",
1871
+ "cp -f .env.local-backup .env 2>/dev/null || true",
1865
1872
  "git submodule update --init --recursive 2>&1",
1866
1873
  ].join(" && ");
1867
1874
  const { exitCode: gitCode, stdout: gitOut, stderr: gitErr } = await ssh(gitScript, 120000);
@@ -2129,6 +2136,35 @@ export async function azureLogs(service, opts = {}) {
2129
2136
  ], { stdio: "inherit", reject: false });
2130
2137
  }
2131
2138
 
2139
// ── restart ─────────────────────────────────────────────────────────────────

/**
 * Restart Foundation compose services on the target VM over SSH.
 * @param {string} [service] - service name; a "foundation-" prefix is added if
 *   missing. Falsy → restart all services.
 * @param {object} [opts]
 * @param {string} [opts.vmName] - target VM (default: active VM)
 */
export async function azureRestart(service, opts = {}) {
  const state = requireVmState(opts.vmName);
  const ip = state.publicIp;
  if (!ip) { console.error(chalk.red("\n No IP. Is the VM running?\n")); process.exit(1); }

  // Open the port-knock window first (no-op when knocking is disabled).
  await knockForVm(state);

  const { execa } = await import("execa");
  // Prefer the admin user recorded in VM state, matching the other SSH call
  // sites (they use state?.adminUser with the same default).
  const adminUser = state.adminUser || DEFAULTS.adminUser;

  const svcArg = service ? (service.startsWith("foundation-") ? service : `foundation-${service}`) : "";
  const label = svcArg || "all services";
  console.log(chalk.cyan(` Restarting ${label} on ${state.vmName || ip}...`));

  const { exitCode } = await execa("ssh", [
    "-o", "StrictHostKeyChecking=no",
    // Match the interactive-SSH invocation: VMs are recycled, so never persist host keys.
    "-o", "UserKnownHostsFile=/dev/null",
    `${adminUser}@${ip}`,
    `cd /opt/foundation-compose && sudo docker compose restart ${svcArg}`,
  ], { stdio: "inherit", reject: false });

  if (exitCode === 0) {
    console.log(chalk.green(` ✓ ${label} restarted`));
  } else {
    console.error(chalk.red(` ✗ restart failed (exit ${exitCode})`));
  }
}
2167
+
2132
2168
  // ── grant-admin ─────────────────────────────────────────────────────────────
2133
2169
 
2134
2170
  export async function azureGrantAdmin(opts = {}) {
@@ -80,7 +80,7 @@ export async function configureVm(execa, ip, user, publicUrl, { githubToken, k3s
80
80
 
81
81
  console.log(chalk.dim(" Configuring VM..."));
82
82
 
83
- // Batch: sshd tuning + docker group + ownership — single SSH round-trip
83
+ // Batch: sshd tuning + docker group + ownership + br_netfilter — single SSH round-trip
84
84
  const setupBatch = [
85
85
  // Speed up SSH: accept forwarded env vars, disable DNS reverse lookup
86
86
  `sudo grep -q '^AcceptEnv.*BEARER_TOKEN' /etc/ssh/sshd_config 2>/dev/null || {`,
@@ -91,6 +91,8 @@ export async function configureVm(execa, ip, user, publicUrl, { githubToken, k3s
91
91
  `}`,
92
92
  `sudo usermod -aG docker ${user} 2>/dev/null; true`,
93
93
  "sudo chown -R azureuser:azureuser /opt/foundation-compose 2>/dev/null; true",
94
+ // Ensure br_netfilter is loaded so k3s CoreDNS service-IP routing works
95
+ "sudo modprobe br_netfilter 2>/dev/null; sudo sysctl -qw net.bridge.bridge-nf-call-iptables=1 2>/dev/null; true",
94
96
  // Only inject FOUNDATION_PUBLIC_URL if not already set — never overwrite
95
97
  `cd /opt/foundation-compose && grep -q '^FOUNDATION_PUBLIC_URL=' .env 2>/dev/null || echo 'FOUNDATION_PUBLIC_URL=${publicUrl}' >> .env`,
96
98
  ].join("\n");
@@ -1633,7 +1635,7 @@ export async function provisionVm(execa, ip, adminUser, { githubToken, branch =
1633
1635
  waitAptLock,
1634
1636
  "export DEBIAN_FRONTEND=noninteractive",
1635
1637
  "apt-get update -y -qq",
1636
- "apt-get install -y -qq apt-transport-https ca-certificates curl gnupg lsb-release jq git make unzip zsh software-properties-common",
1638
+ "apt-get install -y -qq apt-transport-https ca-certificates curl gnupg lsb-release jq git make unzip zsh software-properties-common python3-venv python3-pip",
1637
1639
  ].join("\n"), 300000);
1638
1640
 
1639
1641
  await runScript("Installing Docker", [
@@ -1649,6 +1651,13 @@ export async function provisionVm(execa, ip, adminUser, { githubToken, branch =
1649
1651
  `usermod -aG docker ${adminUser}`,
1650
1652
  ].join("\n"), 300000);
1651
1653
 
1654
+ await runScript("Configuring br_netfilter for k3s DNS", [
1655
+ "modprobe br_netfilter",
1656
+ "echo br_netfilter > /etc/modules-load.d/br_netfilter.conf",
1657
+ "sysctl -w net.bridge.bridge-nf-call-iptables=1",
1658
+ "echo 'net.bridge.bridge-nf-call-iptables = 1' > /etc/sysctl.d/99-br-netfilter.conf",
1659
+ ].join("\n"));
1660
+
1652
1661
  await runScript("Installing GitHub CLI", [
1653
1662
  waitAptLock,
1654
1663
  "set +e",
@@ -235,6 +235,79 @@ export async function resultsPush(target, qaResult, opts = {}) {
235
235
  return { blob, account: store.account };
236
236
  }
237
237
 
238
// ── Remove ──────────────────────────────────────────────────────────────────

/**
 * Remove all stored QA test results for a VM: clears the local state entry,
 * then deletes every matching blob from the configured results storage.
 * @param {string} target - VM name whose results should be removed
 * @param {object} [opts]
 * @param {string} [opts.subscription] - Azure subscription name or ID
 */
export async function resultsRemove(target, opts = {}) {
  const execa = await lazyExeca();
  await ensureAzCli(execa);
  await ensureAzAuth(execa, { subscription: opts.subscription });

  // 1. Clear local state
  const state = readState();
  const vm = state.azure?.vms?.[target];
  if (vm?.qa) {
    delete vm.qa;
    saveState(state);
    console.log(OK(` ✓ Local QA state cleared for "${target}"`));
  } else {
    console.log(DIM(` No local QA state found for "${target}"`));
  }

  // 2. Delete blobs from storage (best-effort; skipped when unconfigured)
  const store = getResultsConfig();
  if (!store?.account) {
    console.log(DIM(" No results storage configured — skipping blob deletion."));
    return;
  }

  const prefix = `${BLOB_PREFIX}/${target}/`;
  let blobNames;
  try {
    const listArgs = [
      "storage", "blob", "list",
      "--account-name", store.account,
      "--container-name", CONTAINER_NAME,
      "--prefix", prefix,
      "--query", "[].name",
      "--output", "json",
      ...AUTH,
      ...subArgs(opts.subscription),
    ];
    const { stdout } = await execa("az", listArgs, { timeout: 30000 });
    blobNames = JSON.parse(stdout || "[]");
  } catch (err) {
    console.log(WARN(` ⚠ Could not list blobs: ${err.message}`));
    return;
  }

  if (blobNames.length === 0) {
    console.log(DIM(` No stored results found in blob storage for "${target}"`));
    return;
  }

  console.log(chalk.cyan(`\n Deleting ${blobNames.length} result(s) from ${store.account}/${CONTAINER_NAME}/${prefix}…\n`));

  // Delete one blob at a time; a single failure does not abort the rest.
  let removedCount = 0;
  for (const blobName of blobNames) {
    try {
      await execa("az", [
        "storage", "blob", "delete",
        "--account-name", store.account,
        "--container-name", CONTAINER_NAME,
        "--name", blobName,
        ...AUTH,
        "--output", "none",
        ...subArgs(opts.subscription),
      ], { timeout: 15000 });
      console.log(` ${DIM(`✗ ${blobName}`)}`);
      removedCount++;
    } catch (err) {
      console.log(WARN(` ⚠ Failed to delete ${blobName}: ${err.message}`));
    }
  }

  console.log(OK(`\n ✓ Removed ${removedCount}/${blobNames.length} result(s) for "${target}"\n`));
}
310
+
238
311
  // ── List ────────────────────────────────────────────────────────────────────
239
312
 
240
313
  export async function resultsList(opts = {}) {
@@ -437,6 +437,10 @@ export async function azureUp(opts = {}) {
437
437
  }
438
438
 
439
439
  await ensureResourceLock(execa, rg, vmName, sub);
440
+
441
+ const { registerVmDatasources } = await import("./azure-grafana.js");
442
+ await registerVmDatasources(vmName, publicUrl);
443
+
440
444
  printInfo(vmName, publicIp, adminUser, publicUrl);
441
445
  return { publicIp, publicUrl, vmName, resourceGroup: rg };
442
446
  }
@@ -617,6 +621,9 @@ export async function azureDown(opts = {}) {
617
621
  await cleanupDns(cfToken, tracked.publicUrl);
618
622
  }
619
623
 
624
+ const { deregisterVmDatasources } = await import("./azure-grafana.js");
625
+ await deregisterVmDatasources(vmName);
626
+
620
627
  clearVmState(vmName);
621
628
  console.log(OK("\n ✓ Done.") + DIM(" State cleared.\n"));
622
629
  }
@@ -48,7 +48,7 @@ export {
48
48
  export {
49
49
  azureStatus, azureTrinoStatus, azureSsh, azureSshWhitelistMe, azurePortForward, azureSshAdminAdd, azureVmCheck, azureAgent, azureOpenAiDebugVm,
50
50
  azureDeploy, azurePull, azureDeployVersion, azureRunUp, azureConfig, azureConfigVersions, azureUpdate,
51
- azureLogs, azureGrantAdmin, azureContext,
51
+ azureLogs, azureRestart, azureGrantAdmin, azureContext,
52
52
  azureList, azureApply,
53
53
  azureKnock, azureKnockClose, azureKnockDisable, azureKnockVerify, azureKnockFix,
54
54
  } from "./azure-ops.js";
@@ -56,6 +56,39 @@ export function registerTestCommands(azure) {
56
56
  });
57
57
  let { bearerToken, qaUser, qaPass, useTokenMode } = auth;
58
58
 
59
+ // Fetch CF Access service token (needed to bypass Cloudflare Access)
60
+ // Priority: local .env file → process.env → remote VM .env
61
+ let cfAccessClientId = "";
62
+ let cfAccessClientSecret = "";
63
+ const localEnvPath = path.join(root, ".env");
64
+ try {
65
+ const localEnv = await fsp.readFile(localEnvPath, "utf8");
66
+ for (const line of localEnv.split("\n")) {
67
+ const m = line.match(/^CF_ACCESS_CLIENT_(ID|SECRET)=(.+)$/);
68
+ if (m?.[1] === "ID") cfAccessClientId = m[2].trim().replace(/^["']|["']$/g, "");
69
+ if (m?.[1] === "SECRET") cfAccessClientSecret = m[2].trim().replace(/^["']|["']$/g, "");
70
+ }
71
+ } catch { /* no local .env */ }
72
+ if (!cfAccessClientId) {
73
+ cfAccessClientId = process.env.CF_ACCESS_CLIENT_ID || "";
74
+ cfAccessClientSecret = process.env.CF_ACCESS_CLIENT_SECRET || "";
75
+ }
76
+ if (!cfAccessClientId && ip) {
77
+ try {
78
+ const sshUser = state?.adminUser || "azureuser";
79
+ const { stdout: cfOut } = await sshCmd(execa, ip, sshUser,
80
+ "grep -E '^CF_ACCESS_CLIENT_(ID|SECRET)=' /opt/foundation-compose/.env",
81
+ 10_000,
82
+ );
83
+ for (const line of (cfOut || "").split("\n")) {
84
+ const m = line.match(/^CF_ACCESS_CLIENT_(ID|SECRET)=(.+)$/);
85
+ if (m?.[1] === "ID") cfAccessClientId = m[2].trim();
86
+ if (m?.[1] === "SECRET") cfAccessClientSecret = m[2].trim();
87
+ }
88
+ if (cfAccessClientId) console.log(chalk.green(" ✓ Got CF Access service token from VM"));
89
+ } catch { /* optional — tests still run without it */ }
90
+ }
91
+
59
92
  if (!bearerToken && !qaUser) {
60
93
  console.error(chalk.red("\n No credentials found (local or remote)."));
61
94
  console.error(chalk.dim(" Set BEARER_TOKEN or QA_USERNAME/QA_PASSWORD, or ensure the VM has Auth0 configured in .env\n"));
@@ -94,6 +127,10 @@ export function registerTestCommands(azure) {
94
127
  envContent = setVar(envContent, "BEARER_TOKEN", bearerToken);
95
128
  envContent = setVar(envContent, "TOKEN_AUTH0", bearerToken);
96
129
  }
130
+ if (cfAccessClientId) {
131
+ envContent = setVar(envContent, "CF_ACCESS_CLIENT_ID", cfAccessClientId);
132
+ envContent = setVar(envContent, "CF_ACCESS_CLIENT_SECRET", cfAccessClientSecret);
133
+ }
97
134
 
98
135
  await fsp.writeFile(envPath, envContent);
99
136
  console.log(chalk.green(` ✓ Configured QA .env → ${apiUrl}`));
@@ -155,6 +192,10 @@ export function registerTestCommands(azure) {
155
192
  testEnv.BEARER_TOKEN = bearerToken;
156
193
  testEnv.TOKEN_AUTH0 = bearerToken;
157
194
  }
195
+ if (cfAccessClientId) {
196
+ testEnv.CF_ACCESS_CLIENT_ID = cfAccessClientId;
197
+ testEnv.CF_ACCESS_CLIENT_SECRET = cfAccessClientSecret;
198
+ }
158
199
 
159
200
  const startMs = Date.now();
160
201
  const proc = execa(
@@ -245,6 +286,15 @@ export function registerTestCommands(azure) {
245
286
  await resultsCompare({ target, last: parseInt(opts.last) });
246
287
  });
247
288
 
289
// Wipe every stored QA result for a VM — both local state and remote blobs.
test
  .command("rm <target>")
  .description("Remove all stored test results for a VM (local state + blob storage)")
  .option("--profile <subscription>", "Azure subscription name or ID")
  .action(async (target, cmdOpts) => {
    const results = await import("../azure-results.js");
    await results.resultsRemove(target, { subscription: cmdOpts.profile });
  });
297
+
248
298
  test
249
299
  .command("push [target]")
250
300
  .description("Push local QA results to blob storage (default: all VMs with results)")
@@ -347,11 +347,20 @@ export function registerVmCommands(azure, api, registry) {
347
347
 
348
348
// Default ssh subcommand: interactive session. Accepts "user@vm", or a bare
// vm name (falls back to azureuser inside azureSsh).
ssh
  .command("connect [name]", { isDefault: true })
  .description("Open an interactive SSH session (user@vm or just vm for azureuser)")
  .option("--vm-name <name>", "Target VM (default: active VM)")
  .option("--user <user>", "SSH user (default: azureuser)")
  .action(async (name, opts) => {
    const { azureSsh } = await import("../azure.js");
    let vmName = opts.vmName || name;
    let user = opts.user;
    // Parse user@vm syntax. Split on the FIRST "@" only, so any later "@"
    // stays part of the VM name (a plain .split("@") would silently drop it).
    if (vmName && vmName.includes("@")) {
      const at = vmName.indexOf("@");
      user = vmName.slice(0, at);
      vmName = vmName.slice(at + 1);
    }
    await azureSsh({ vmName, user });
  });
356
365
 
357
366
  const sshAdmin = ssh.command("admin").description("Manage admin SSH keys across all VMs");
@@ -866,6 +875,16 @@ export function registerVmCommands(azure, api, registry) {
866
875
  await azureLogs(service, { vmName: opts.vmName || name });
867
876
  });
868
877
 
878
+ // ── restart ─────────────────────────────────────────────────────────────────
879
+ azure
880
+ .command("restart [name] [service]")
881
+ .description("Restart Foundation services on the VM (all or specific)")
882
+ .option("--vm-name <name>", "Target VM (default: active VM)")
883
+ .action(async (name, service, opts) => {
884
+ const { azureRestart } = await import("../azure.js");
885
+ await azureRestart(service, { vmName: opts.vmName || name });
886
+ });
887
+
869
888
  // ── context ───────────────────────────────────────────────────────────────
870
889
  azure
871
890
  .command("context [name]")
@@ -920,4 +939,48 @@ export function registerVmCommands(azure, api, registry) {
920
939
  const { azureSubscriptions } = await import("../azure.js");
921
940
  await azureSubscriptions();
922
941
  });
942
+
943
// ── grafana ───────────────────────────────────────────────────────────────
// Federate per-VM datasources into a single Grafana instance.
const grafana = azure.command("grafana").description("Manage Grafana datasource federation");

grafana
  .command("configure <url> <token>")
  .description("Set Grafana URL and service account token (stored in ~/.fops.json)")
  .action(async (url, token) => {
    const grafanaMod = await import("../azure-grafana.js");
    grafanaMod.writeGrafanaConfig({ url, token });
    console.log(chalk.green(` ✓ Grafana configured: ${url}`));
    await grafanaMod.testGrafanaConnection();
  });

grafana
  .command("tunnel")
  .description("Open SSH tunnels for Prometheus + Loki on all tracked VMs (blocks until Ctrl+C)")
  .action(async () => {
    // The three modules are independent — load them in parallel.
    const [{ openGrafanaTunnels }, { listVms }, { knockForVm }] = await Promise.all([
      import("../azure-grafana.js"),
      import("../azure-state.js"),
      import("../azure.js"),
    ]);
    const { vms } = listVms();
    await openGrafanaTunnels(vms, { knockForVm });
  });

grafana
  .command("sync")
  .description("Register localhost datasources in Grafana for all tunnelled VMs")
  .action(async () => {
    const [{ syncGrafanaDatasources }, { listVms }] = await Promise.all([
      import("../azure-grafana.js"),
      import("../azure-state.js"),
    ]);
    const { vms } = listVms();
    await syncGrafanaDatasources(vms);
  });

grafana
  .command("status")
  .description("Test connectivity to the configured Grafana instance")
  .action(async () => {
    const { testGrafanaConnection, readGrafanaConfig } = await import("../azure-grafana.js");
    const cfg = readGrafanaConfig();
    if (cfg?.url) console.log(chalk.dim(` URL: ${cfg.url}`));
    await testGrafanaConnection();
  });
923
986
  }
@@ -1,4 +1,27 @@
1
1
  import chalk from "chalk";
2
+ import fs from "node:fs";
3
+ import path from "node:path";
4
+ import { fileURLToPath, pathToFileURL } from "node:url";
5
+
6
/**
 * Resolve a module path inside the azure plugin to a file:// URL for import().
 * Works from the source tree, an installed CLI, and ~/.fops/plugins layouts.
 * @param {string} relPath - path relative to the fops-plugin-azure root
 * @returns {string} file:// URL of the first existing candidate (or the
 *   ~/.fops/plugins fallback when none exists)
 */
function resolveAzure(relPath) {
  const here = path.dirname(fileURLToPath(import.meta.url));

  // Candidates probed in priority order.
  const candidates = [
    // Source tree: sibling plugin directory one level up.
    path.resolve(here, "..", "fops-plugin-azure", relPath),
  ];

  // Installed CLI: locate the bundled plugin relative to the fops binary.
  const fopsBin = process.argv[1];
  if (fopsBin) {
    try {
      const cliRoot = path.dirname(fs.realpathSync(fopsBin));
      candidates.push(path.resolve(cliRoot, "src/plugins/bundled/fops-plugin-azure", relPath));
    } catch {}
  }

  for (const candidate of candidates) {
    if (fs.existsSync(candidate)) return pathToFileURL(candidate).href;
  }

  // Fallback: ~/.fops/plugins sibling (two levels up).
  return pathToFileURL(path.resolve(here, "../../fops-plugin-azure", relPath)).href;
}
2
25
 
3
26
  export async function register(api) {
4
27
  const config = api.config || {};
@@ -133,6 +156,112 @@ export async function register(api) {
133
156
  console.log(chalk.dim(" Customize via config.commandRoles in ~/.fops.json\n"));
134
157
  });
135
158
 
159
// Create (or update) a Linux user on Azure VMs whose authorized_keys mirror a
// GitHub user's public keys. Targets: --vm-name, positional names, or all VMs.
gh
  .command("ssh-sync <username> [names...]")
  .description("Sync a GitHub user's SSH keys to Azure VMs")
  .option("--vm-name <name>", "Target a single VM instead of all")
  .action(async (username, names, opts) => {
    const { fetchUserKeys } = await import("./lib/github.js");

    console.log(chalk.dim(`\n Fetching SSH keys for github.com/${username}…`));
    let sshKeys;
    try {
      sshKeys = await fetchUserKeys(username);
    } catch (err) {
      console.log(chalk.yellow(` ⚠ ${username}: ${err.message} — skipped\n`));
      return;
    }
    if (sshKeys.length === 0) {
      console.log(chalk.yellow(` ⚠ ${username}: no SSH keys on GitHub — skipped\n`));
      return;
    }
    console.log(chalk.green(` ✓ Found ${sshKeys.length} key(s)`));

    // Pull VM helpers out of the azure plugin (location varies by install mode).
    const { listVms, requireVmState } = await import(resolveAzure("lib/azure-state.js"));
    const {
      lazyExeca, resolvePublicIp, knockForVm, waitForSsh, sshCmd, DEFAULTS,
    } = await import(resolveAzure("lib/azure-helpers.js"));

    // Target selection: --vm-name wins, then positional names, then all tracked.
    let targetVms;
    if (opts.vmName) {
      targetVms = { [opts.vmName]: requireVmState(opts.vmName) };
    } else if (names.length > 0) {
      targetVms = Object.fromEntries(names.map((n) => [n, requireVmState(n)]));
    } else {
      ({ vms: targetVms } = listVms());
    }

    const vmNames = Object.keys(targetVms);
    if (vmNames.length === 0) {
      console.error(chalk.red("\n ✗ No VMs tracked. Use: fops azure up <name>\n"));
      process.exit(1);
    }

    const execa = await lazyExeca();
    const adminUser = DEFAULTS.adminUser;

    // Sanitize GitHub username → Linux username (lowercase, no leading hyphen)
    const linuxUser = username.toLowerCase().replace(/[^a-z0-9_-]/g, "").replace(/^-/, "_");

    console.log(chalk.cyan(`\n Syncing user ${chalk.white(linuxUser)} (${sshKeys.length} key(s)) → ${vmNames.join(", ")}\n`));

    let success = 0;
    let failed = 0;
    for (const vmName of vmNames) {
      const vm = targetVms[vmName];

      // Resolve a reachable IP: tracked state first, live lookup as fallback.
      let ip = vm.publicIp;
      if (!ip) {
        try { ip = await resolvePublicIp(execa, vm.resourceGroup, vmName, null); } catch {}
      }
      if (!ip) {
        console.log(chalk.yellow(` ⚠ ${vmName}: no IP — skipped`));
        failed++;
        continue;
      }

      // Best-effort port-knock, then verify SSH responds before doing work.
      try { await knockForVm({ ...vm, vmName }); } catch {}
      const reachable = await waitForSsh(execa, ip, adminUser, 15000);
      if (!reachable) {
        console.log(chalk.red(` ✗ ${vmName}: SSH unreachable — skipped`));
        failed++;
        continue;
      }

      // Create user with sudo, docker group, and SSH dir; install the keys.
      // Keys are single-quote-escaped for safe embedding in the remote shell.
      const keysEscaped = sshKeys.map(k => k.replace(/'/g, "'\\''")).join("\n");
      const setupCmd = [
        `sudo id ${linuxUser} &>/dev/null || sudo useradd -m -s /bin/bash ${linuxUser}`,
        `sudo usermod -aG sudo,docker ${linuxUser} 2>/dev/null || sudo usermod -aG sudo ${linuxUser}`,
        `sudo mkdir -p /home/${linuxUser}/.ssh`,
        `echo '${keysEscaped}' | sudo tee /home/${linuxUser}/.ssh/authorized_keys >/dev/null`,
        `sudo chmod 700 /home/${linuxUser}/.ssh`,
        `sudo chmod 600 /home/${linuxUser}/.ssh/authorized_keys`,
        `sudo chown -R ${linuxUser}:${linuxUser} /home/${linuxUser}/.ssh`,
        `echo '${linuxUser} ALL=(ALL) NOPASSWD:ALL' | sudo tee /etc/sudoers.d/${linuxUser} >/dev/null`,
      ].join(" && ");

      const { exitCode } = await sshCmd(execa, ip, adminUser, setupCmd, 30000);
      if (exitCode === 0) {
        console.log(chalk.green(` ✓ ${vmName}: user ${linuxUser} created with ${sshKeys.length} key(s)`));
        success++;
      } else {
        console.log(chalk.red(` ✗ ${vmName}: failed (exit ${exitCode})`));
        failed++;
      }
    }

    console.log("");
    if (failed === 0) {
      console.log(chalk.green(` ✓ User ${linuxUser} synced to all ${success} VM(s)\n`));
    } else {
      console.log(chalk.yellow(` Done: ${success} succeeded, ${failed} failed\n`));
    }
  });
264
+
136
265
  gh
137
266
  .command("logout")
138
267
  .description("Clear stored GitHub token and cached role")
@@ -127,6 +127,21 @@ export async function isOrgOwner(token, org, username) {
127
127
  return data.role === "admin";
128
128
  }
129
129
 
130
/**
 * Fetch a GitHub user's public SSH keys.
 * Uses the unauthenticated https://github.com/<user>.keys endpoint.
 * @param {string} username - GitHub login (URL-encoded before use)
 * @returns {Promise<string[]>} public key lines, one authorized_keys entry each
 * @throws {Error} on a non-2xx response or after the 10s timeout aborts
 */
export async function fetchUserKeys(username) {
  const res = await fetch(`https://github.com/${encodeURIComponent(username)}.keys`, {
    signal: AbortSignal.timeout(10_000),
  });
  if (!res.ok) throw new Error(`Failed to fetch keys for ${username}: HTTP ${res.status}`);
  const text = await res.text();
  // Tolerate CRLF line endings and whitespace-only lines in the response —
  // the original \n-only split could leave trailing "\r" on every key.
  return text
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter(Boolean);
}
144
+
130
145
  /**
131
146
  * Get the teams a user belongs to within an org.
132
147
  * Uses the "list teams for authenticated user" endpoint filtered by org.