@meshxdata/fops 0.1.49 → 0.1.50

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. package/CHANGELOG.md +182 -0
  2. package/package.json +1 -1
  3. package/src/plugins/bundled/fops-plugin-azure/lib/azure-aks-core.js +347 -6
  4. package/src/plugins/bundled/fops-plugin-azure/lib/azure-aks-data-bootstrap.js +421 -0
  5. package/src/plugins/bundled/fops-plugin-azure/lib/azure-aks-flux.js +5 -179
  6. package/src/plugins/bundled/fops-plugin-azure/lib/azure-aks-naming.js +14 -4
  7. package/src/plugins/bundled/fops-plugin-azure/lib/azure-aks-postgres.js +171 -4
  8. package/src/plugins/bundled/fops-plugin-azure/lib/azure-aks-storage.js +303 -8
  9. package/src/plugins/bundled/fops-plugin-azure/lib/azure-aks.js +2 -0
  10. package/src/plugins/bundled/fops-plugin-azure/lib/azure-auth.js +1 -1
  11. package/src/plugins/bundled/fops-plugin-azure/lib/azure-fleet-swarm.js +936 -0
  12. package/src/plugins/bundled/fops-plugin-azure/lib/azure-fleet.js +10 -918
  13. package/src/plugins/bundled/fops-plugin-azure/lib/azure-helpers.js +5 -0
  14. package/src/plugins/bundled/fops-plugin-azure/lib/azure-keyvault-keys.js +413 -0
  15. package/src/plugins/bundled/fops-plugin-azure/lib/azure-keyvault.js +14 -399
  16. package/src/plugins/bundled/fops-plugin-azure/lib/azure-ops-config.js +754 -0
  17. package/src/plugins/bundled/fops-plugin-azure/lib/azure-ops-knock.js +527 -0
  18. package/src/plugins/bundled/fops-plugin-azure/lib/azure-ops-ssh.js +427 -0
  19. package/src/plugins/bundled/fops-plugin-azure/lib/azure-ops.js +99 -1686
  20. package/src/plugins/bundled/fops-plugin-azure/lib/azure-provision-health.js +279 -0
  21. package/src/plugins/bundled/fops-plugin-azure/lib/azure-provision-init.js +186 -0
  22. package/src/plugins/bundled/fops-plugin-azure/lib/azure-provision.js +66 -444
  23. package/src/plugins/bundled/fops-plugin-azure/lib/azure-results.js +11 -0
  24. package/src/plugins/bundled/fops-plugin-azure/lib/azure-vm-lifecycle.js +5 -540
  25. package/src/plugins/bundled/fops-plugin-azure/lib/azure-vm-terraform.js +544 -0
  26. package/src/plugins/bundled/fops-plugin-azure/lib/commands/infra-cmds.js +75 -3
  27. package/src/plugins/bundled/fops-plugin-azure/lib/commands/test-cmds.js +227 -11
  28. package/src/plugins/bundled/fops-plugin-azure/lib/commands/vm-cmds.js +2 -1
  29. package/src/plugins/bundled/fops-plugin-azure/lib/pytest-parse.js +21 -0
  30. package/src/plugins/bundled/fops-plugin-foundation/index.js +309 -44
package/CHANGELOG.md CHANGED
@@ -1,3 +1,185 @@
1
+ ## [0.1.50] - 2026-03-24
2
+
3
+ - operator cli plugin fix (25620cc)
4
+ - operator cli test fixes (1d1c18f)
5
+ - feat(test): add setup-users command for QA test user creation (b929507)
6
+ - feat(aks): show HA standby clusters with visual grouping (8fb640c)
7
+ - refactor(provision): extract VM provisioning to dedicated module (af321a7)
8
+ - refactor(provision): extract post-start health checks to dedicated module (6ed5f2d)
9
+ - fix: ping timeout 15s, fix prometheus sed escaping (d11ac14)
10
+ - refactor(vm): extract terraform HCL generation to dedicated module (896a64b)
11
+ - refactor(keyvault): extract key operations to dedicated module (716bbe4)
12
+ - refactor(azure): extract swarm functions to azure-fleet-swarm.js (4690e34)
13
+ - refactor(azure): extract SSH/remote functions to azure-ops-ssh.js (e62b8f0)
14
+ - refactor(azure): split azure-ops.js into smaller modules (4515425)
15
+ - feat(aks): add --ha flag for full cross-region HA setup (ece68c5)
16
+ - feat(fops): inject ENVIRONMENT_NAME on VM provisioning (6ef2a27)
17
+ - fix(postgres): disable SSL mode to fix connection issues (c789ae9)
18
+ - feat(trino): add caching configuration for docker-compose (3668224)
19
+ - fix(fops-azure): run pytest directly instead of missing scripts (29f8410)
20
+ - add -d detach option for local frontend dev, remove hive cpu limits (3306667)
21
+ - release 0.1.49 (dcca32b)
22
+ - release 0.1.48 (9b195e5)
23
+ - stash on updates (2916c01)
24
+ - stash on updates (b5c14df)
25
+ - stash on updates (d0453d1)
26
+ - frontend dev fixes (0ca7b00)
27
+ - fix: update azure test commands (77c81da)
28
+ - default locust to CLI mode, add --web for UI (ca35bff)
29
+ - add locust command for load testing AKS clusters (1278722)
30
+ - update spot node pool default autoscaling to 1-20 (617c182)
31
+ - module for aks (3dd1a61)
32
+ - add hive to PG_SERVICE_DBS for fops pg-setup (afccb16)
33
+ - feat(azure): enhance aks doctor with ExternalSecrets and PGSSLMODE checks (8b14861)
34
+ - add foundation-postgres ExternalName service to reconciler (ea88e11)
35
+ - new flux templates (0e2e372)
36
+ - feat(azure): add storage-engine secrets to Key Vault (a4f488e)
37
+ - feat(azure-aks): add AUTH0_DOMAIN to template rendering variables (216c37e)
38
+ - feat(azure): add storage account creation per cluster (aa1b138)
39
+ - bump watcher (ab24473)
40
+ - fix: concurrent compute calls (#66) (03e2edf)
41
+ - bump backend version (5058ff5)
42
+ - bump fops to 0.1.44 (8c0ef5d)
43
+ - Mlflow and azure plugin fix (176881f)
44
+ - fix lifecycle (a2cb9e7)
45
+ - callback url for localhost (821fb94)
46
+ - disable 4 scaffolding plugin by default. (bfb2b76)
47
+ - jaccard improvements (b7494a0)
48
+ - refactor azure plugin (68dfef4)
49
+ - refactor azure plugin (b24a008)
50
+ - fix trino catalog missing (4928a55)
51
+ - v36 bump and changelog generation on openai (37a0440)
52
+ - v36 bump and changelog generation on openai (a3b02d9)
53
+ - bump (a990058)
54
+ - status bar fix and new plugin for ttyd (27dde1e)
55
+ - file demo and tray (1a3e704)
56
+ - electron app (59ad0bb)
57
+ - compose and fops file plugin (1cf0e81)
58
+ - bump (346ffc1)
59
+ - localhost replaced by 127.0.0.1 (82b9f30)
60
+ - .29 (587b0e1)
61
+ - improve up down and bootstrap script (b79ebaf)
62
+ - checksum (22c8086)
63
+ - checksum (96b434f)
64
+ - checksum (15ed3c0)
65
+ - checksum (8a6543a)
66
+ - bump embed trino linksg (8440504)
67
+ - bump data (765ffd9)
68
+ - bump (cb8b232)
69
+ - broken tests (c532229)
70
+ - release 0.1.18, preflight checks (d902249)
71
+ - fix compute display bug (d10f5d9)
72
+ - cleanup packer files (6330f18)
73
+ - plan mode (cb36a8a)
74
+ - bump to 0.1.16 - agent ui (41ac1a2)
75
+ - bump to 0.1.15 - agent ui (4ebe2e1)
76
+ - bump to 0.1.14 (6c3a7fa)
77
+ - bump to 0.1.13 (8db570f)
78
+ - release 0.1.12 (c1c79e5)
79
+ - bump (11aa3b0)
80
+ - git keep and bump tui (be1678e)
81
+ - skills, index, rrf, compacted context (100k > 10k) (7b2fffd)
82
+ - cloudflare and token consumption, graphs indexing (0ad9eec)
83
+ - bump storage default (22c83ba)
84
+ - storage fix (68a22a0)
85
+ - skills update (7f56500)
86
+ - v9 bump (3864446)
87
+ - bump (c95eedc)
88
+ - rrf (dbf8c95)
89
+ - feat: warning when running predictions (95e8c52)
90
+ - feat: support for local predictions (45cf26b)
91
+ - feat: wip support for predictions + mlflow (3457052)
92
+ - add Reciprocal Rank Fusion (RRF) to knowledge and skill retrieval (61549bc)
93
+ - validate CSV headers in compute_run readiness check (a8c7a43)
94
+ - fix corrupted Iceberg metadata: probe tables + force cleanup on re-apply (50578af)
95
+ - enforce: never use foundation_apply to fix broken products (2e049bf)
96
+ - update SKILL.md with complete tool reference for knowledge retrieval (30b1924)
97
+ - add storage read, input DP table probe, and compute_run improvements (34e6c4c)
98
+ - skills update (1220385)
99
+ - skills update (bb66958)
100
+ - some tui improvement andd tools apply overwrite (e90c35c)
101
+ - skills update (e9227a1)
102
+ - skills update (669c4b3)
103
+ - fix plugin pre-flight checks (f741743)
104
+ - increase agent context (6479aaa)
105
+ - skills and init sql fixes (5fce35e)
106
+ - checksum (3518b56)
107
+ - penging job limit (a139861)
108
+ - checksum (575d28c)
109
+ - bump (92049ba)
110
+ - fix bug per tab status (0a33657)
111
+ - fix bug per tab status (50457c6)
112
+ - checksumming (0ad842e)
113
+ - shot af mardkwon overlapping (51f63b9)
114
+ - add spark dockerfile for multiarch builds (95abbd1)
115
+ - fix plugin initialization (16b9782)
116
+ - split index.js (50902a2)
117
+ - cloudflare cidr (cc4e021)
118
+ - cloduflare restrictions (2f6ba2d)
119
+ - sequential start (86b496e)
120
+ - sequential start (4930fe1)
121
+ - sequential start (353f014)
122
+ - qa tests (2dc6a1a)
123
+ - bump sha for .85 (dc2edfe)
124
+ - preserve env on sudo (7831227)
125
+ - bump sha for .84 (6c052f9)
126
+ - non interactive for azure vms (0aa8a2f)
127
+ - keep .env if present (d072450)
128
+ - bump (7a8e732)
129
+ - ensure opa is on compose if not set (f4a5228)
130
+ - checksum bump (a2ccc20)
131
+ - netrc defensive checks (a0b0ccc)
132
+ - netrc defensive checks (ae37403)
133
+ - checksum (ec45d11)
134
+ - update sync and fix up (7f9af72)
135
+ - expand test for azure and add new per app tag support (388a168)
136
+ - checksum on update (44005fc)
137
+ - cleanup for later (15e5313)
138
+ - cleanup for later (11c9597)
139
+ - switch branch feature (822fecc)
140
+ - add pull (d1c19ab)
141
+ - Bump hono from 4.11.9 to 4.12.0 in /operator-cli (ad25144)
142
+ - tests (f180a9a)
143
+ - cleanup (39c49a3)
144
+ - registry (7b7126a)
145
+ - reconcile kafka (832d0db)
146
+ - gh login bug (025886c)
147
+ - cleanup (bb96cab)
148
+ - strip envs from process (2421180)
149
+ - force use of gh creds not tokens in envs var (fff7787)
150
+ - resolve import between npm installs and npm link (79522e1)
151
+ - fix gh scope and azure states (afd846c)
152
+ - refactoring (da50352)
153
+ - split fops repo (d447638)
154
+ - aks (b791f8f)
155
+ - refactor azure (67d3bad)
156
+ - wildcard (391f023)
157
+ - azure plugin (c074074)
158
+ - zap (d7e6e7f)
159
+ - fix knock (cf89c05)
160
+ - azure (4adec98)
161
+ - Bump tar from 7.5.7 to 7.5.9 in /operator-cli (e41e98e)
162
+ - azure stack index.js split (de12272)
163
+ - Bump ajv from 8.17.1 to 8.18.0 in /operator-cli (76da21f)
164
+ - packer (9665fbc)
165
+ - remove stack api (db0fd4d)
166
+ - packer cleanup (fe1bf14)
167
+ - force refresh token (3a3d7e2)
168
+ - provision shell (2ad505f)
169
+ - azure vm management (91dcb31)
170
+ - azure specific (2b0cca8)
171
+ - azure packer (12175b8)
172
+ - init hashed pwd (db8523c)
173
+ - packer (5b5c7c4)
174
+ - doctor for azure vm (ed524fa)
175
+ - packer and 1pwd (c6d053e)
176
+ - split big index.js (dc85a1b)
177
+ - kafka volume update (21815ec)
178
+ - fix openai azure tools confirmation and flow (0118cd1)
179
+ - nighly fixx, test fix (5e0d04f)
180
+ - open ai training (cdc494a)
181
+ - openai integration in azure (1ca1475)
182
+
1
183
  # Changelog
2
184
 
3
185
  All notable changes to @meshxdata/fops (Foundation Operator CLI) are documented here.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@meshxdata/fops",
3
- "version": "0.1.49",
3
+ "version": "0.1.50",
4
4
  "description": "CLI to install and manage data mesh platforms",
5
5
  "keywords": [
6
6
  "fops",
@@ -17,7 +17,7 @@ import {
17
17
  readState,
18
18
  resolveGithubToken,
19
19
  } from "./azure.js";
20
- import { AKS_DEFAULTS, PG_REPLICA_REGIONS, timeSince } from "./azure-aks-naming.js";
20
+ import { AKS_DEFAULTS, PG_REPLICA_REGIONS, HA_REPLICA_REGIONS, timeSince } from "./azure-aks-naming.js";
21
21
  import {
22
22
  readAksClusters,
23
23
  readClusterState,
@@ -34,6 +34,7 @@ import {
34
34
  } from "./azure-aks-flux.js";
35
35
  import { reconcileNetworkAccess } from "./azure-aks-network.js";
36
36
  import { reconcilePostgres, aksPostgresReplicaCreate, reconcileEventHubs } from "./azure-aks-postgres.js";
37
+ import { reconcileStorageReplication, reconcileStorageAccount } from "./azure-aks-storage.js";
37
38
  import { clusterDomain } from "./azure-aks-ingress.js";
38
39
  import { printClusterInfo } from "./azure-aks-stacks.js";
39
40
 
@@ -217,6 +218,177 @@ export async function getCredentials(execa, { clusterName, rg, sub, admin } = {}
217
218
  console.log(OK(` ✓ Kubeconfig merged`));
218
219
  }
219
220
 
221
+ // ── aks standby create (internal helper) ──────────────────────────────────────
222
+
223
+ async function aksStandbyCreate(execa, opts) {
224
+ const {
225
+ primaryClusterName,
226
+ standbyClusterName,
227
+ rg,
228
+ location,
229
+ nodeCount,
230
+ minCount,
231
+ maxCount,
232
+ nodeVmSize,
233
+ tier,
234
+ networkPlugin,
235
+ k8sVersion,
236
+ sub,
237
+ githubToken,
238
+ fluxRepo,
239
+ fluxOwner,
240
+ fluxBranch,
241
+ templateRepo,
242
+ templateOwner,
243
+ templateBranch,
244
+ environment,
245
+ account,
246
+ } = opts;
247
+
248
+ banner(`Creating Standby AKS: ${standbyClusterName}`);
249
+ kvLine("Region", DIM(location));
250
+ kvLine("Primary", DIM(primaryClusterName));
251
+
252
+ // Detect operator IP
253
+ const myIp = await fetchMyIp();
254
+
255
+ // Zone redundancy for standby
256
+ let zones = [];
257
+ try {
258
+ const { stdout: skuJson } = await execa("az", [
259
+ "vm", "list-skus", "--location", location, "--size", nodeVmSize,
260
+ "--resource-type", "virtualMachines", "--query", "[0].locationInfo[0].zones",
261
+ "--output", "json", ...subArgs(sub),
262
+ ], { timeout: 30000 });
263
+ zones = JSON.parse(skuJson || "[]").sort();
264
+ } catch { zones = []; }
265
+
266
+ const tags = buildTags(standbyClusterName, {
267
+ createdBy: account?.user?.name || "fops",
268
+ type: "aks-standby",
269
+ primaryCluster: primaryClusterName,
270
+ });
271
+ const tagStr = Object.entries(tags).map(([k, v]) => `${k}=${v}`).join(" ");
272
+
273
+ const createArgs = [
274
+ "aks", "create",
275
+ "--resource-group", rg,
276
+ "--name", standbyClusterName,
277
+ "--location", location,
278
+ "--node-count", String(nodeCount),
279
+ "--node-vm-size", nodeVmSize,
280
+ "--max-pods", "110",
281
+ "--enable-cluster-autoscaler",
282
+ "--min-count", String(minCount),
283
+ "--max-count", String(maxCount),
284
+ "--kubernetes-version", k8sVersion,
285
+ "--tier", tier,
286
+ "--network-plugin", networkPlugin,
287
+ "--generate-ssh-keys",
288
+ "--enable-managed-identity",
289
+ "--enable-oidc-issuer",
290
+ "--enable-workload-identity",
291
+ "--ssh-access", "disabled",
292
+ "--tags", ...tagStr.split(" "),
293
+ "--output", "json",
294
+ ...subArgs(sub),
295
+ ];
296
+
297
+ if (zones.length > 0) createArgs.push("--zones", ...zones);
298
+ if (myIp) createArgs.push("--api-server-authorized-ip-ranges", `${myIp}/32`);
299
+
300
+ hint("Creating standby cluster (5-10 minutes)…\n");
301
+ const { stdout: clusterJson } = await execa("az", createArgs, { timeout: 900000 });
302
+ const cluster = JSON.parse(clusterJson);
303
+ console.log(OK(` ✓ Standby AKS cluster created in ${location}`));
304
+
305
+ // Get kubeconfig
306
+ await getCredentials(execa, { clusterName: standbyClusterName, rg, sub });
307
+
308
+ // Save standby state
309
+ writeClusterState(standbyClusterName, {
310
+ resourceGroup: rg,
311
+ location,
312
+ nodeCount,
313
+ nodeVmSize,
314
+ kubernetesVersion: cluster.kubernetesVersion || k8sVersion,
315
+ subscriptionId: account?.id,
316
+ fqdn: cluster.fqdn,
317
+ provisioningState: cluster.provisioningState,
318
+ zones: zones.length > 0 ? zones : undefined,
319
+ isStandby: true,
320
+ primaryCluster: primaryClusterName,
321
+ createdAt: new Date().toISOString(),
322
+ });
323
+
324
+ // Bootstrap Flux with same config as primary
325
+ if (githubToken) {
326
+ const fluxPath = `clusters/${standbyClusterName}`;
327
+
328
+ // Provision templates for standby cluster
329
+ try {
330
+ await provisionFluxFromTemplate(execa, {
331
+ clusterName: standbyClusterName,
332
+ region: location,
333
+ domain: clusterDomain(standbyClusterName),
334
+ keyvaultUrl: `https://fops-${standbyClusterName}-kv.vault.azure.net`,
335
+ environment,
336
+ githubToken,
337
+ fluxRepo,
338
+ fluxOwner,
339
+ fluxBranch,
340
+ templateRepo,
341
+ templateOwner,
342
+ templateBranch,
343
+ });
344
+ } catch (err) {
345
+ console.log(WARN(` ⚠ Template provisioning: ${(err.message || "").split("\n")[0]}`));
346
+ }
347
+
348
+ await bootstrapFlux(execa, {
349
+ clusterName: standbyClusterName, rg, sub,
350
+ githubToken,
351
+ repo: fluxRepo,
352
+ owner: fluxOwner,
353
+ path: fluxPath,
354
+ branch: fluxBranch,
355
+ });
356
+
357
+ writeClusterState(standbyClusterName, {
358
+ flux: { repo: fluxRepo, owner: fluxOwner, path: fluxPath, branch: fluxBranch },
359
+ });
360
+
361
+ // Pre-install CRDs and fix webhook scheduling
362
+ try {
363
+ await reconcileFluxPrereqs({ execa, clusterName: standbyClusterName, rg, sub, opts: {} });
364
+ } catch { /* best effort */ }
365
+ }
366
+
367
+ // Set up Key Vault and network access for standby
368
+ try {
369
+ const { stdout: freshJson } = await execa("az", [
370
+ "aks", "show", "-g", rg, "-n", standbyClusterName, "--output", "json", ...subArgs(sub),
371
+ ], { timeout: 30000 });
372
+ const freshCluster = JSON.parse(freshJson);
373
+ await reconcileNetworkAccess({ execa, clusterName: standbyClusterName, rg, sub, cluster: freshCluster, opts: {} });
374
+ } catch { /* best effort */ }
375
+
376
+ // Create storage account in standby region (uses HA storage)
377
+ try {
378
+ const { stdout: freshJson } = await execa("az", [
379
+ "aks", "show", "-g", rg, "-n", standbyClusterName, "--output", "json", ...subArgs(sub),
380
+ ], { timeout: 30000 });
381
+ const freshCluster = JSON.parse(freshJson);
382
+ await reconcileStorageAccount({ execa, clusterName: standbyClusterName, rg, sub, cluster: freshCluster, opts: {} });
383
+ } catch { /* best effort */ }
384
+
385
+ console.log(OK(`\n ✓ Standby cluster "${standbyClusterName}" ready`));
386
+ hint(` Flux will sync workloads from ${fluxOwner}/${fluxRepo}`);
387
+ hint(` Configure postgres secret to point to read replica for read-only mode\n`);
388
+
389
+ return { clusterName: standbyClusterName, cluster };
390
+ }
391
+
220
392
  // ── aks up ────────────────────────────────────────────────────────────────────
221
393
 
222
394
  export async function aksUp(opts = {}) {
@@ -246,6 +418,10 @@ export async function aksUp(opts = {}) {
246
418
  kvLine("Nodes", DIM(`${nodeCount} x ${nodeVmSize} (autoscale ${minCount}–${maxCount})`));
247
419
  kvLine("K8s", DIM(k8sVersion));
248
420
  kvLine("Tier", DIM(tier));
421
+ if (opts.ha) {
422
+ const haRegion = HA_REPLICA_REGIONS[location] || "northeurope";
423
+ kvLine("HA", OK(`enabled (replica: ${haRegion})`));
424
+ }
249
425
 
250
426
  // Check if cluster already exists
251
427
  const { exitCode: exists } = await execa("az", [
@@ -302,6 +478,89 @@ export async function aksUp(opts = {}) {
302
478
  }
303
479
  }
304
480
 
481
+ // Set up HA for existing cluster if --ha flag is set
482
+ if (opts.ha === true) {
483
+ const haRegion = HA_REPLICA_REGIONS[location];
484
+ if (haRegion) {
485
+ // Storage replication
486
+ try {
487
+ const { stdout: freshJson } = await execa("az", [
488
+ "aks", "show", "-g", rg, "-n", clusterName, "--output", "json",
489
+ ...subArgs(sub),
490
+ ], { timeout: 30000 });
491
+ const freshCluster = JSON.parse(freshJson);
492
+ const storageCtx = { execa, clusterName, rg, sub, cluster: freshCluster, opts };
493
+ await reconcileStorageReplication(storageCtx);
494
+ } catch (err) {
495
+ console.log(WARN(` ⚠ Storage HA: ${(err.message || "").split("\n")[0]}`));
496
+ }
497
+
498
+ // Postgres read replica
499
+ try {
500
+ hint(`Checking Postgres replica in ${haRegion}…`);
501
+ await aksPostgresReplicaCreate({
502
+ clusterName,
503
+ profile: sub,
504
+ region: haRegion,
505
+ });
506
+ } catch (err) {
507
+ if (!err.message?.includes("already exists")) {
508
+ console.log(WARN(` ⚠ Postgres replica: ${(err.message || "").split("\n")[0]}`));
509
+ }
510
+ }
511
+
512
+ // Standby AKS cluster
513
+ const standbyClusterName = `${clusterName}-standby`;
514
+ try {
515
+ const { exitCode: standbyExists } = await execa("az", [
516
+ "aks", "show", "-g", rg, "-n", standbyClusterName, "--output", "none",
517
+ ...subArgs(sub),
518
+ ], { reject: false, timeout: 30000 });
519
+
520
+ if (standbyExists === 0) {
521
+ console.log(OK(` ✓ Standby cluster "${standbyClusterName}" already exists`));
522
+ } else {
523
+ const githubToken = resolveGithubToken(opts);
524
+ await aksStandbyCreate(execa, {
525
+ primaryClusterName: clusterName,
526
+ standbyClusterName,
527
+ rg,
528
+ location: haRegion,
529
+ nodeCount,
530
+ minCount,
531
+ maxCount,
532
+ nodeVmSize,
533
+ tier,
534
+ networkPlugin,
535
+ k8sVersion,
536
+ sub,
537
+ githubToken,
538
+ fluxRepo: opts.fluxRepo ?? AKS_DEFAULTS.fluxRepo,
539
+ fluxOwner: opts.fluxOwner ?? AKS_DEFAULTS.fluxOwner,
540
+ fluxBranch: opts.fluxBranch ?? AKS_DEFAULTS.fluxBranch,
541
+ templateRepo: opts.templateRepo ?? TEMPLATE_DEFAULTS.templateRepo,
542
+ templateOwner: opts.templateOwner ?? TEMPLATE_DEFAULTS.templateOwner,
543
+ templateBranch: opts.templateBranch ?? TEMPLATE_DEFAULTS.templateBranch,
544
+ environment: opts.environment || "demo",
545
+ account,
546
+ });
547
+ }
548
+
549
+ writeClusterState(clusterName, {
550
+ ha: {
551
+ enabled: true,
552
+ standbyCluster: standbyClusterName,
553
+ standbyRegion: haRegion,
554
+ configuredAt: new Date().toISOString(),
555
+ },
556
+ });
557
+ } catch (err) {
558
+ console.log(WARN(` ⚠ Standby cluster: ${(err.message || "").split("\n")[0]}`));
559
+ hint(` Create manually: fops azure aks up ${standbyClusterName} --location ${haRegion}`);
560
+ }
561
+ }
562
+ }
563
+
305
564
  const tracked = readClusterState(clusterName);
306
565
  if (tracked) printClusterInfo(tracked);
307
566
  return tracked;
@@ -545,12 +804,13 @@ export async function aksUp(opts = {}) {
545
804
  const pgCtx = { execa, clusterName, rg, sub, cluster: freshCluster, opts };
546
805
  await reconcilePostgres(pgCtx);
547
806
 
548
- // Create geo-replica for HA (default: enabled when in a supported region)
549
- const geoReplicaRegion = PG_REPLICA_REGIONS[location];
550
- const createGeoReplica = opts.geoReplica !== false && geoReplicaRegion;
551
- if (createGeoReplica) {
807
+ // Create geo-replica for HA when --ha flag is set (uses HA_REPLICA_REGIONS)
808
+ // or when --geo-replica is explicitly requested (uses PG_REPLICA_REGIONS)
809
+ const haRegion = opts.ha ? HA_REPLICA_REGIONS[location] : null;
810
+ const geoReplicaRegion = haRegion || (opts.geoReplica !== false ? PG_REPLICA_REGIONS[location] : null);
811
+ if (geoReplicaRegion && (opts.ha || opts.geoReplica !== false)) {
552
812
  try {
553
- console.log(OK(`Creating geo-replica in ${geoReplicaRegion} for HA…`));
813
+ hint(`Creating Postgres read replica in ${geoReplicaRegion} for HA…`);
554
814
  await aksPostgresReplicaCreate({
555
815
  clusterName,
556
816
  profile: sub,
@@ -570,6 +830,81 @@ export async function aksUp(opts = {}) {
570
830
  }
571
831
  }
572
832
 
833
+ // Set up cross-region storage replication when --ha flag is set
834
+ if (opts.ha === true) {
835
+ try {
836
+ const { stdout: freshJson } = await execa("az", [
837
+ "aks", "show", "-g", rg, "-n", clusterName, "--output", "json",
838
+ ...subArgs(sub),
839
+ ], { timeout: 30000 });
840
+ const freshCluster = JSON.parse(freshJson);
841
+ const storageCtx = { execa, clusterName, rg, sub, cluster: freshCluster, opts };
842
+ await reconcileStorageReplication(storageCtx);
843
+ } catch (err) {
844
+ const msg = err.message || "";
845
+ console.log(WARN(` ⚠ Storage HA setup failed: ${msg.split("\n")[0]}`));
846
+ hint("Retry with: fops azure aks up " + clusterName + " --ha");
847
+ }
848
+
849
+ // Create standby AKS cluster in HA region
850
+ const haRegion = HA_REPLICA_REGIONS[location];
851
+ if (haRegion) {
852
+ const standbyClusterName = `${clusterName}-standby`;
853
+ hint(`\nSetting up standby AKS cluster in ${haRegion}…`);
854
+
855
+ try {
856
+ // Check if standby cluster already exists
857
+ const { exitCode: standbyExists } = await execa("az", [
858
+ "aks", "show", "-g", rg, "-n", standbyClusterName, "--output", "none",
859
+ ...subArgs(sub),
860
+ ], { reject: false, timeout: 30000 });
861
+
862
+ if (standbyExists === 0) {
863
+ console.log(OK(` ✓ Standby cluster "${standbyClusterName}" already exists`));
864
+ } else {
865
+ // Create standby cluster with same config but in HA region
866
+ await aksStandbyCreate(execa, {
867
+ primaryClusterName: clusterName,
868
+ standbyClusterName,
869
+ rg,
870
+ location: haRegion,
871
+ nodeCount,
872
+ minCount,
873
+ maxCount,
874
+ nodeVmSize,
875
+ tier,
876
+ networkPlugin,
877
+ k8sVersion,
878
+ sub,
879
+ githubToken,
880
+ fluxRepo: opts.fluxRepo ?? AKS_DEFAULTS.fluxRepo,
881
+ fluxOwner: opts.fluxOwner ?? AKS_DEFAULTS.fluxOwner,
882
+ fluxBranch: opts.fluxBranch ?? AKS_DEFAULTS.fluxBranch,
883
+ templateRepo: opts.templateRepo ?? TEMPLATE_DEFAULTS.templateRepo,
884
+ templateOwner: opts.templateOwner ?? TEMPLATE_DEFAULTS.templateOwner,
885
+ templateBranch: opts.templateBranch ?? TEMPLATE_DEFAULTS.templateBranch,
886
+ environment: opts.environment || "demo",
887
+ account,
888
+ });
889
+ }
890
+
891
+ // Save HA cluster info to state
892
+ writeClusterState(clusterName, {
893
+ ha: {
894
+ enabled: true,
895
+ standbyCluster: standbyClusterName,
896
+ standbyRegion: haRegion,
897
+ configuredAt: new Date().toISOString(),
898
+ },
899
+ });
900
+ } catch (err) {
901
+ const msg = err.message || "";
902
+ console.log(WARN(` ⚠ Standby cluster creation failed: ${msg.split("\n")[0]}`));
903
+ hint(` Create manually: fops azure aks up ${standbyClusterName} --location ${haRegion}`);
904
+ }
905
+ }
906
+ }
907
+
573
908
  // Provision Event Hubs (managed Kafka) if requested
574
909
  if (opts.managedKafka === true) {
575
910
  try {
@@ -852,6 +1187,12 @@ export async function aksList(opts = {}) {
852
1187
  const qaLabel = cl.qa.passed ? OK(`✓ passed (${qaAge} ago)`) : ERR(`✗ failed (${qaAge} ago)`);
853
1188
  kvLine(" QA", qaLabel, { pad: 12 });
854
1189
  }
1190
+ if (cl.ha?.enabled) {
1191
+ kvLine(" HA", OK(`✓ ${cl.ha.standbyCluster} (${cl.ha.standbyRegion})`), { pad: 12 });
1192
+ }
1193
+ if (cl.isStandby) {
1194
+ kvLine(" Role", WARN(`standby for ${cl.primaryCluster}`), { pad: 12 });
1195
+ }
855
1196
  }
856
1197
  console.log("");
857
1198
  }