@meshxdata/fops 0.1.59 → 0.1.61

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,373 @@
1
+ ## [0.1.61] - 2026-03-26
2
+
3
+ - feat(k3s): add 'fops k3s sync' — syncs secrets + refreshes service endpoint IPs from running containers (7cb7680)
4
+ - fix(provision): detect and repair broken submodule mounts from Docker (548422f)
5
+ - fix(provision): recheck Docker after install failure instead of blocking (d7bc11c)
6
+ - bump cli vm lifecycle (a3521ac)
7
+ - fix(k3s): remove minio123 fallback from sync-secrets (e185cb9)
8
+ - fix: add frontend-prod profile to fopsUpCmd, FOUNDATION_ROOT always wins in rootDir (a412709)
9
+ - bump storage (a1a5761)
10
+ - restore all missing services (pgpool, exporters, grafana, etc), add loki to k3s profile, always activate loki profile in fops up (4e2744a)
11
+ - fix: grafana alert-rules provisioning, ENVIRONMENT_NAME from --url, k3s secret sync, vm-sizes endpoint, project root resolution (9839052)
12
+ - feat(azure): add 'fops azure reconcile <name>' command for VM drift fix (79ba6e2)
13
+ - fix(otel,loki): remove duplicate spanmetrics dimensions, use .env for loki S3 creds (e3d1def)
14
+ - fix(loki): pass S3 credentials from .env so loki works without vault-init (c57906d)
15
+ - fix(azure): improve VM provisioning reliability (2ddd669)
16
+ - cluster discovery (009257d)
17
+ - feat(storage): add loki container to provisioning (898c544)
18
+ - feat(azure): add ping command to check backend health (8336825)
19
+ - operator cli bump 0.1.52 (f052cb5)
20
+ - fix(doctor): set KUBECONFIG for k3s kubectl commands (db9359b)
21
+ - fix(azure): move --landscape to test run command, not separate subcommand (4b9b089)
22
+ - feat(azure): add test integration command with landscape support (b2990a0)
23
+ - fix(fleet): skip VMs without public IPs in fleet exec (39acbaa)
24
+ - feat(azure): detect and fix External Secrets identity issues (f907d11)
25
+ - operator cli bump 0.1.51 (db55bdc)
26
+ - feat: add postgres-exporter and Azure tray menu improvements (2a337ac)
27
+ - operator cli plugin fix (4dae908)
28
+ - operator cli plugin fix (25620cc)
29
+ - operator cli test fixes (1d1c18f)
30
+ - feat(test): add setup-users command for QA test user creation (b929507)
31
+ - feat(aks): show HA standby clusters with visual grouping (8fb640c)
32
+ - refactor(provision): extract VM provisioning to dedicated module (af321a7)
33
+ - refactor(provision): extract post-start health checks to dedicated module (6ed5f2d)
34
+ - fix: ping timeout 15s, fix prometheus sed escaping (d11ac14)
35
+ - refactor(vm): extract terraform HCL generation to dedicated module (896a64b)
36
+ - refactor(keyvault): extract key operations to dedicated module (716bbe4)
37
+ - refactor(azure): extract swarm functions to azure-fleet-swarm.js (4690e34)
38
+ - refactor(azure): extract SSH/remote functions to azure-ops-ssh.js (e62b8f0)
39
+ - refactor(azure): split azure-ops.js into smaller modules (4515425)
40
+ - feat(aks): add --ha flag for full cross-region HA setup (ece68c5)
41
+ - feat(fops): inject ENVIRONMENT_NAME on VM provisioning (6ef2a27)
42
+ - fix(postgres): disable SSL mode to fix connection issues (c789ae9)
43
+ - feat(trino): add caching configuration for docker-compose (3668224)
44
+ - fix(fops-azure): run pytest directly instead of missing scripts (29f8410)
45
+ - add -d detach option for local frontend dev, remove hive cpu limits (3306667)
46
+ - release 0.1.49 (dcca32b)
47
+ - release 0.1.48 (9b195e5)
48
+ - stash on updates (2916c01)
49
+ - stash on updates (b5c14df)
50
+ - stash on updates (d0453d1)
51
+ - frontend dev fixes (0ca7b00)
52
+ - fix: update azure test commands (77c81da)
53
+ - default locust to CLI mode, add --web for UI (ca35bff)
54
+ - add locust command for load testing AKS clusters (1278722)
55
+ - update spot node pool default autoscaling to 1-20 (617c182)
56
+ - module for aks (3dd1a61)
57
+ - add hive to PG_SERVICE_DBS for fops pg-setup (afccb16)
58
+ - feat(azure): enhance aks doctor with ExternalSecrets and PGSSLMODE checks (8b14861)
59
+ - add foundation-postgres ExternalName service to reconciler (ea88e11)
60
+ - new flux templates (0e2e372)
61
+ - feat(azure): add storage-engine secrets to Key Vault (a4f488e)
62
+ - feat(azure-aks): add AUTH0_DOMAIN to template rendering variables (216c37e)
63
+ - feat(azure): add storage account creation per cluster (aa1b138)
64
+ - bump watcher (ab24473)
65
+ - fix: concurrent compute calls (#66) (03e2edf)
66
+ - bump backend version (5058ff5)
67
+ - bump fops to 0.1.44 (8c0ef5d)
68
+ - Mlflow and azure plugin fix (176881f)
69
+ - fix lifecycle (a2cb9e7)
70
+ - callback url for localhost (821fb94)
71
+ - disable 4 scaffolding plugin by default. (bfb2b76)
72
+ - jaccard improvements (b7494a0)
73
+ - refactor azure plugin (68dfef4)
74
+ - refactor azure plugin (b24a008)
75
+ - fix trino catalog missing (4928a55)
76
+ - v36 bump and changelog generation on openai (37a0440)
77
+ - v36 bump and changelog generation on openai (a3b02d9)
78
+ - bump (a990058)
79
+ - status bar fix and new plugin for ttyd (27dde1e)
80
+ - file demo and tray (1a3e704)
81
+ - electron app (59ad0bb)
82
+ - compose and fops file plugin (1cf0e81)
83
+ - bump (346ffc1)
84
+ - localhost replaced by 127.0.0.1 (82b9f30)
85
+ - .29 (587b0e1)
86
+ - improve up down and bootstrap script (b79ebaf)
87
+ - checksum (22c8086)
88
+ - checksum (96b434f)
89
+ - checksum (15ed3c0)
90
+ - checksum (8a6543a)
91
+ - bump embed trino links (8440504)
92
+ - bump data (765ffd9)
93
+ - bump (cb8b232)
94
+ - broken tests (c532229)
95
+ - release 0.1.18, preflight checks (d902249)
96
+ - fix compute display bug (d10f5d9)
97
+ - cleanup packer files (6330f18)
98
+ - plan mode (cb36a8a)
99
+ - bump to 0.1.16 - agent ui (41ac1a2)
100
+ - bump to 0.1.15 - agent ui (4ebe2e1)
101
+ - bump to 0.1.14 (6c3a7fa)
102
+ - bump to 0.1.13 (8db570f)
103
+ - release 0.1.12 (c1c79e5)
104
+ - bump (11aa3b0)
105
+ - git keep and bump tui (be1678e)
106
+ - skills, index, rrf, compacted context (100k > 10k) (7b2fffd)
107
+ - cloudflare and token consumption, graphs indexing (0ad9eec)
108
+ - bump storage default (22c83ba)
109
+ - storage fix (68a22a0)
110
+ - skills update (7f56500)
111
+ - v9 bump (3864446)
112
+ - bump (c95eedc)
113
+ - rrf (dbf8c95)
114
+ - feat: warning when running predictions (95e8c52)
115
+ - feat: support for local predictions (45cf26b)
116
+ - feat: wip support for predictions + mlflow (3457052)
117
+ - add Reciprocal Rank Fusion (RRF) to knowledge and skill retrieval (61549bc)
118
+ - validate CSV headers in compute_run readiness check (a8c7a43)
119
+ - fix corrupted Iceberg metadata: probe tables + force cleanup on re-apply (50578af)
120
+ - enforce: never use foundation_apply to fix broken products (2e049bf)
121
+ - update SKILL.md with complete tool reference for knowledge retrieval (30b1924)
122
+ - add storage read, input DP table probe, and compute_run improvements (34e6c4c)
123
+ - skills update (1220385)
124
+ - skills update (bb66958)
125
+ - some tui improvement and tools apply overwrite (e90c35c)
126
+ - skills update (e9227a1)
127
+ - skills update (669c4b3)
128
+ - fix plugin pre-flight checks (f741743)
129
+ - increase agent context (6479aaa)
130
+ - skills and init sql fixes (5fce35e)
131
+ - checksum (3518b56)
132
+ - pending job limit (a139861)
133
+ - checksum (575d28c)
134
+ - bump (92049ba)
135
+ - fix bug per tab status (0a33657)
136
+ - fix bug per tab status (50457c6)
137
+ - checksumming (0ad842e)
138
+ - shot af mardkwon overlapping (51f63b9)
139
+ - add spark dockerfile for multiarch builds (95abbd1)
140
+ - fix plugin initialization (16b9782)
141
+ - split index.js (50902a2)
142
+ - cloudflare cidr (cc4e021)
143
+ - cloudflare restrictions (2f6ba2d)
144
+ - sequential start (86b496e)
145
+ - sequential start (4930fe1)
146
+ - sequential start (353f014)
147
+ - qa tests (2dc6a1a)
148
+ - bump sha for .85 (dc2edfe)
149
+ - preserve env on sudo (7831227)
150
+ - bump sha for .84 (6c052f9)
151
+ - non interactive for azure vms (0aa8a2f)
152
+ - keep .env if present (d072450)
153
+ - bump (7a8e732)
154
+ - ensure opa is on compose if not set (f4a5228)
155
+ - checksum bump (a2ccc20)
156
+ - netrc defensive checks (a0b0ccc)
157
+ - netrc defensive checks (ae37403)
158
+ - checksum (ec45d11)
159
+ - update sync and fix up (7f9af72)
160
+ - expand test for azure and add new per app tag support (388a168)
161
+ - checksum on update (44005fc)
162
+ - cleanup for later (15e5313)
163
+ - cleanup for later (11c9597)
164
+ - switch branch feature (822fecc)
165
+ - add pull (d1c19ab)
166
+ - Bump hono from 4.11.9 to 4.12.0 in /operator-cli (ad25144)
167
+ - tests (f180a9a)
168
+ - cleanup (39c49a3)
169
+ - registry (7b7126a)
170
+ - reconcile kafka (832d0db)
171
+ - gh login bug (025886c)
172
+ - cleanup (bb96cab)
173
+ - strip envs from process (2421180)
174
+ - force use of gh creds not tokens in envs var (fff7787)
175
+ - resolve import between npm installs and npm link (79522e1)
176
+ - fix gh scope and azure states (afd846c)
177
+ - refactoring (da50352)
178
+ - split fops repo (d447638)
179
+ - aks (b791f8f)
180
+ - refactor azure (67d3bad)
181
+ - wildcard (391f023)
182
+ - azure plugin (c074074)
183
+
184
+ # Changelog
185
+
186
+ All notable changes to @meshxdata/fops (Foundation Operator CLI) are documented here.
187
+
188
+ ## [0.1.60] - 2026-03-26
189
+
190
+ - fix(provision): detect and repair broken submodule mounts from Docker (13269c5)
191
+ - fix(provision): recheck Docker after install failure instead of blocking (d7bc11c)
192
+ - bump cli vm lifecycle (a3521ac)
193
+ - fix(k3s): remove minio123 fallback from sync-secrets (e185cb9)
194
+ - fix: add frontend-prod profile to fopsUpCmd, FOUNDATION_ROOT always wins in rootDir (a412709)
195
+ - bump storage (a1a5761)
196
+ - restore all missing services (pgpool, exporters, grafana, etc), add loki to k3s profile, always activate loki profile in fops up (4e2744a)
197
+ - fix: grafana alert-rules provisioning, ENVIRONMENT_NAME from --url, k3s secret sync, vm-sizes endpoint, project root resolution (9839052)
198
+ - feat(azure): add 'fops azure reconcile <name>' command for VM drift fix (79ba6e2)
199
+ - fix(otel,loki): remove duplicate spanmetrics dimensions, use .env for loki S3 creds (e3d1def)
200
+ - fix(loki): pass S3 credentials from .env so loki works without vault-init (c57906d)
201
+ - fix(azure): improve VM provisioning reliability (2ddd669)
202
+ - cluster discovery (009257d)
203
+ - feat(storage): add loki container to provisioning (898c544)
204
+ - feat(azure): add ping command to check backend health (8336825)
205
+ - operator cli bump 0.1.52 (f052cb5)
206
+ - fix(doctor): set KUBECONFIG for k3s kubectl commands (db9359b)
207
+ - fix(azure): move --landscape to test run command, not separate subcommand (4b9b089)
208
+ - feat(azure): add test integration command with landscape support (b2990a0)
209
+ - fix(fleet): skip VMs without public IPs in fleet exec (39acbaa)
210
+ - feat(azure): detect and fix External Secrets identity issues (f907d11)
211
+ - operator cli bump 0.1.51 (db55bdc)
212
+ - feat: add postgres-exporter and Azure tray menu improvements (2a337ac)
213
+ - operator cli plugin fix (4dae908)
214
+ - operator cli plugin fix (25620cc)
215
+ - operator cli test fixes (1d1c18f)
216
+ - feat(test): add setup-users command for QA test user creation (b929507)
217
+ - feat(aks): show HA standby clusters with visual grouping (8fb640c)
218
+ - refactor(provision): extract VM provisioning to dedicated module (af321a7)
219
+ - refactor(provision): extract post-start health checks to dedicated module (6ed5f2d)
220
+ - fix: ping timeout 15s, fix prometheus sed escaping (d11ac14)
221
+ - refactor(vm): extract terraform HCL generation to dedicated module (896a64b)
222
+ - refactor(keyvault): extract key operations to dedicated module (716bbe4)
223
+ - refactor(azure): extract swarm functions to azure-fleet-swarm.js (4690e34)
224
+ - refactor(azure): extract SSH/remote functions to azure-ops-ssh.js (e62b8f0)
225
+ - refactor(azure): split azure-ops.js into smaller modules (4515425)
226
+ - feat(aks): add --ha flag for full cross-region HA setup (ece68c5)
227
+ - feat(fops): inject ENVIRONMENT_NAME on VM provisioning (6ef2a27)
228
+ - fix(postgres): disable SSL mode to fix connection issues (c789ae9)
229
+ - feat(trino): add caching configuration for docker-compose (3668224)
230
+ - fix(fops-azure): run pytest directly instead of missing scripts (29f8410)
231
+ - add -d detach option for local frontend dev, remove hive cpu limits (3306667)
232
+ - release 0.1.49 (dcca32b)
233
+ - release 0.1.48 (9b195e5)
234
+ - stash on updates (2916c01)
235
+ - stash on updates (b5c14df)
236
+ - stash on updates (d0453d1)
237
+ - frontend dev fixes (0ca7b00)
238
+ - fix: update azure test commands (77c81da)
239
+ - default locust to CLI mode, add --web for UI (ca35bff)
240
+ - add locust command for load testing AKS clusters (1278722)
241
+ - update spot node pool default autoscaling to 1-20 (617c182)
242
+ - module for aks (3dd1a61)
243
+ - add hive to PG_SERVICE_DBS for fops pg-setup (afccb16)
244
+ - feat(azure): enhance aks doctor with ExternalSecrets and PGSSLMODE checks (8b14861)
245
+ - add foundation-postgres ExternalName service to reconciler (ea88e11)
246
+ - new flux templates (0e2e372)
247
+ - feat(azure): add storage-engine secrets to Key Vault (a4f488e)
248
+ - feat(azure-aks): add AUTH0_DOMAIN to template rendering variables (216c37e)
249
+ - feat(azure): add storage account creation per cluster (aa1b138)
250
+ - bump watcher (ab24473)
251
+ - fix: concurrent compute calls (#66) (03e2edf)
252
+ - bump backend version (5058ff5)
253
+ - bump fops to 0.1.44 (8c0ef5d)
254
+ - Mlflow and azure plugin fix (176881f)
255
+ - fix lifecycle (a2cb9e7)
256
+ - callback url for localhost (821fb94)
257
+ - disable 4 scaffolding plugin by default. (bfb2b76)
258
+ - jaccard improvements (b7494a0)
259
+ - refactor azure plugin (68dfef4)
260
+ - refactor azure plugin (b24a008)
261
+ - fix trino catalog missing (4928a55)
262
+ - v36 bump and changelog generation on openai (37a0440)
263
+ - v36 bump and changelog generation on openai (a3b02d9)
264
+ - bump (a990058)
265
+ - status bar fix and new plugin for ttyd (27dde1e)
266
+ - file demo and tray (1a3e704)
267
+ - electron app (59ad0bb)
268
+ - compose and fops file plugin (1cf0e81)
269
+ - bump (346ffc1)
270
+ - localhost replaced by 127.0.0.1 (82b9f30)
271
+ - .29 (587b0e1)
272
+ - improve up down and bootstrap script (b79ebaf)
273
+ - checksum (22c8086)
274
+ - checksum (96b434f)
275
+ - checksum (15ed3c0)
276
+ - checksum (8a6543a)
277
+ - bump embed trino links (8440504)
278
+ - bump data (765ffd9)
279
+ - bump (cb8b232)
280
+ - broken tests (c532229)
281
+ - release 0.1.18, preflight checks (d902249)
282
+ - fix compute display bug (d10f5d9)
283
+ - cleanup packer files (6330f18)
284
+ - plan mode (cb36a8a)
285
+ - bump to 0.1.16 - agent ui (41ac1a2)
286
+ - bump to 0.1.15 - agent ui (4ebe2e1)
287
+ - bump to 0.1.14 (6c3a7fa)
288
+ - bump to 0.1.13 (8db570f)
289
+ - release 0.1.12 (c1c79e5)
290
+ - bump (11aa3b0)
291
+ - git keep and bump tui (be1678e)
292
+ - skills, index, rrf, compacted context (100k > 10k) (7b2fffd)
293
+ - cloudflare and token consumption, graphs indexing (0ad9eec)
294
+ - bump storage default (22c83ba)
295
+ - storage fix (68a22a0)
296
+ - skills update (7f56500)
297
+ - v9 bump (3864446)
298
+ - bump (c95eedc)
299
+ - rrf (dbf8c95)
300
+ - feat: warning when running predictions (95e8c52)
301
+ - feat: support for local predictions (45cf26b)
302
+ - feat: wip support for predictions + mlflow (3457052)
303
+ - add Reciprocal Rank Fusion (RRF) to knowledge and skill retrieval (61549bc)
304
+ - validate CSV headers in compute_run readiness check (a8c7a43)
305
+ - fix corrupted Iceberg metadata: probe tables + force cleanup on re-apply (50578af)
306
+ - enforce: never use foundation_apply to fix broken products (2e049bf)
307
+ - update SKILL.md with complete tool reference for knowledge retrieval (30b1924)
308
+ - add storage read, input DP table probe, and compute_run improvements (34e6c4c)
309
+ - skills update (1220385)
310
+ - skills update (bb66958)
311
+ - some tui improvement and tools apply overwrite (e90c35c)
312
+ - skills update (e9227a1)
313
+ - skills update (669c4b3)
314
+ - fix plugin pre-flight checks (f741743)
315
+ - increase agent context (6479aaa)
316
+ - skills and init sql fixes (5fce35e)
317
+ - checksum (3518b56)
318
+ - pending job limit (a139861)
319
+ - checksum (575d28c)
320
+ - bump (92049ba)
321
+ - fix bug per tab status (0a33657)
322
+ - fix bug per tab status (50457c6)
323
+ - checksumming (0ad842e)
324
+ - shot af mardkwon overlapping (51f63b9)
325
+ - add spark dockerfile for multiarch builds (95abbd1)
326
+ - fix plugin initialization (16b9782)
327
+ - split index.js (50902a2)
328
+ - cloudflare cidr (cc4e021)
329
+ - cloudflare restrictions (2f6ba2d)
330
+ - sequential start (86b496e)
331
+ - sequential start (4930fe1)
332
+ - sequential start (353f014)
333
+ - qa tests (2dc6a1a)
334
+ - bump sha for .85 (dc2edfe)
335
+ - preserve env on sudo (7831227)
336
+ - bump sha for .84 (6c052f9)
337
+ - non interactive for azure vms (0aa8a2f)
338
+ - keep .env if present (d072450)
339
+ - bump (7a8e732)
340
+ - ensure opa is on compose if not set (f4a5228)
341
+ - checksum bump (a2ccc20)
342
+ - netrc defensive checks (a0b0ccc)
343
+ - netrc defensive checks (ae37403)
344
+ - checksum (ec45d11)
345
+ - update sync and fix up (7f9af72)
346
+ - expand test for azure and add new per app tag support (388a168)
347
+ - checksum on update (44005fc)
348
+ - cleanup for later (15e5313)
349
+ - cleanup for later (11c9597)
350
+ - switch branch feature (822fecc)
351
+ - add pull (d1c19ab)
352
+ - Bump hono from 4.11.9 to 4.12.0 in /operator-cli (ad25144)
353
+ - tests (f180a9a)
354
+ - cleanup (39c49a3)
355
+ - registry (7b7126a)
356
+ - reconcile kafka (832d0db)
357
+ - gh login bug (025886c)
358
+ - cleanup (bb96cab)
359
+ - strip envs from process (2421180)
360
+ - force use of gh creds not tokens in envs var (fff7787)
361
+ - resolve import between npm installs and npm link (79522e1)
362
+ - fix gh scope and azure states (afd846c)
363
+ - refactoring (da50352)
364
+ - split fops repo (d447638)
365
+ - aks (b791f8f)
366
+ - refactor azure (67d3bad)
367
+ - wildcard (391f023)
368
+ - azure plugin (c074074)
369
+ - zap (d7e6e7f)
370
+
1
371
  ## [0.1.59] - 2026-03-26
2
372
 
3
373
  - fix(k3s): remove minio123 fallback from sync-secrets (e185cb9)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@meshxdata/fops",
3
- "version": "0.1.59",
3
+ "version": "0.1.61",
4
4
  "description": "CLI to install and manage data mesh platforms",
5
5
  "keywords": [
6
6
  "fops",
@@ -84,7 +84,87 @@ export async function syncSecrets(root) {
84
84
  ], { timeout: 10000, reject: false });
85
85
  console.log(OK(` ✓ ${sparkSecretName} (with label)`));
86
86
 
87
- console.log(OK("\n ✓ All storage secrets synced to k3s"));
87
+ // 4. Update stale credentials in the database (public.secret table)
88
+ console.log(DIM(" Updating storage credentials in database..."));
89
+ const secretKeys = ["AWS_SECRET_ACCESS_KEY", "MY_S3_SECRET", "S3_SECRET", "S3_SECRET_KEY"];
90
+ const accessKeys = ["AWS_ACCESS_KEY_ID", "MY_S3_ACCESS", "S3_ACCESS", "S3_ACCESS_KEY"];
91
+ const updateSql = [
92
+ ...secretKeys.map(k => `UPDATE public.secret SET value = '${s3Pw.replace(/'/g, "''")}' WHERE key = '${k}' AND value != '${s3Pw.replace(/'/g, "''")}';`),
93
+ ...accessKeys.map(k => `UPDATE public.secret SET value = '${s3Id.replace(/'/g, "''")}' WHERE key = '${k}' AND value != '${s3Id.replace(/'/g, "''")}';`),
94
+ ].join("\n");
95
+ const dbResult = await execa("docker", [
96
+ "compose", "exec", "-T", "postgres",
97
+ "psql", "-U", "foundation", "-d", "foundation", "-c", updateSql,
98
+ ], { cwd: root, timeout: 15000, reject: false });
99
+ if (dbResult.exitCode === 0) {
100
+ console.log(OK(" ✓ Database secrets updated"));
101
+ } else {
102
+ console.log(WARN(" ⚠ Database secret update failed (postgres may not be running)"));
103
+ }
104
+
105
+ console.log(OK("\n ✓ All storage secrets synced"));
106
+ }
107
+
108
+ const ENDPOINT_SERVICES = [
109
+ { name: "foundation-storage-engine", namespace: "spark-jobs", container: "foundation-storage-engine", ports: [{ name: "http", port: 8080 }] },
110
+ { name: "kafka", namespace: "spark-jobs", container: "kafka", ports: [{ name: "kafka", port: 9092 }, { name: "kafka-internal", port: 29092 }] },
111
+ { name: "foundation-kafka-kafka-bootstrap", namespace: "spark-jobs", container: "kafka", ports: [{ name: "kafka", port: 9092 }, { name: "kafka-internal", port: 29092 }] },
112
+ ];
113
+
114
+ async function syncEndpoints(root) {
115
+ const { execa } = await import("execa");
116
+
117
+ console.log(DIM("\n Refreshing k3s service endpoints..."));
118
+
119
+ for (const svc of ENDPOINT_SERVICES) {
120
+ // Get current container IP
121
+ const { stdout: ip, exitCode } = await execa("docker", [
122
+ "inspect", svc.container, "--format", "{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}",
123
+ ], { timeout: 5000, reject: false });
124
+
125
+ if (exitCode !== 0 || !ip?.trim()) {
126
+ console.log(WARN(` ⚠ ${svc.name}: container "${svc.container}" not found, skipping`));
127
+ continue;
128
+ }
129
+
130
+ const currentIp = ip.trim();
131
+
132
+ // Check if endpoint already has the right IP
133
+ const { stdout: existingIp } = await execa("docker", [
134
+ ...K3S_KUBECTL, "get", "endpoints", svc.name, "-n", svc.namespace,
135
+ "-o", "jsonpath={.subsets[0].addresses[0].ip}",
136
+ ], { timeout: 10000, reject: false });
137
+
138
+ if (existingIp?.trim() === currentIp) {
139
+ console.log(OK(` ✓ ${svc.name} → ${currentIp} (unchanged)`));
140
+ continue;
141
+ }
142
+
143
+ // Build endpoint YAML
144
+ const portsYaml = svc.ports.map(p => ` - name: "${p.name}"\n port: ${p.port}\n protocol: TCP`).join("\n");
145
+ const yaml = `apiVersion: v1
146
+ kind: Endpoints
147
+ metadata:
148
+ name: ${svc.name}
149
+ namespace: ${svc.namespace}
150
+ subsets:
151
+ - addresses:
152
+ - ip: ${currentIp}
153
+ ports:
154
+ ${portsYaml}`;
155
+
156
+ const result = await execa("docker", [
157
+ ...K3S_KUBECTL_I, "apply", "-f", "-",
158
+ ], { input: yaml, timeout: 10000, reject: false });
159
+
160
+ if (result.exitCode === 0) {
161
+ console.log(OK(` ✓ ${svc.name} → ${currentIp} (updated)`));
162
+ } else {
163
+ console.log(WARN(` ⚠ ${svc.name}: failed to update endpoint`));
164
+ }
165
+ }
166
+
167
+ console.log(OK("\n ✓ All k3s endpoints refreshed"));
88
168
  }
89
169
 
90
170
  export function registerK3sCommands(program) {
@@ -92,6 +172,20 @@ export function registerK3sCommands(program) {
92
172
  .command("k3s")
93
173
  .description("Manage local k3s Kubernetes cluster");
94
174
 
175
+ k3s
176
+ .command("sync")
177
+ .description("Sync secrets and refresh service endpoints in k3s")
178
+ .action(async () => {
179
+ const root = requireRoot(program);
180
+ try {
181
+ await syncSecrets(root);
182
+ await syncEndpoints(root);
183
+ } catch (err) {
184
+ console.error(ERR(` ✗ ${err.message}`));
185
+ process.exitCode = 1;
186
+ }
187
+ });
188
+
95
189
  k3s
96
190
  .command("sync-secrets")
97
191
  .description("Sync storage secrets from .env into k3s (fixes S3 AccessDenied)")
@@ -95,8 +95,15 @@ export async function configureVm(execa, ip, user, publicUrl, { githubToken, k3s
95
95
  if (installExit === 0) {
96
96
  if (!quiet) console.log(chalk.green(" ✓ Docker installed"));
97
97
  } else {
98
- console.log(chalk.red(" ✗ Docker installation failed container operations will not work"));
99
- console.log(chalk.dim(` SSH in and check: ssh ${user}@${ip} "sudo apt-get install -y docker-ce"`));
98
+ // apt can fail due to lock conflicts with cloud-init but Docker may have installed anyway
99
+ const { exitCode: recheck } = await ssh("sudo docker info >/dev/null 2>&1");
100
+ if (recheck === 0) {
101
+ if (!quiet) console.log(chalk.yellow(" ⚠ Docker install had warnings but Docker is working"));
102
+ } else {
103
+ console.log(chalk.red(" ✗ Docker installation failed — cannot continue provisioning"));
104
+ console.log(chalk.dim(` SSH in and check: ssh ${user}@${ip} "sudo apt-get install -y docker-ce"`));
105
+ throw new Error("Docker installation failed");
106
+ }
100
107
  }
101
108
  }
102
109
 
@@ -1281,6 +1288,18 @@ async function vmReconcileRepo(ctx) {
1281
1288
 
1282
1289
  const { stdout: exists } = await ssh("[ -d /opt/foundation-compose/.git ] && echo yes || echo no");
1283
1290
  if (exists?.trim() === "yes") {
1291
+ // Fix broken submodule mounts: Docker creates empty dirs when bind-mounting missing files.
1292
+ // Detect and repair: if a submodule dir exists but is empty or has no real files, re-init it.
1293
+ const checkSubs = "cd /opt/foundation-compose && for d in foundation-backend foundation-frontend foundation-watcher foundation-processor foundation-scheduler foundation-storage-engine; do [ -d $d ] && [ ! -f $d/pyproject.toml ] && [ ! -f $d/package.json ] && [ ! -f $d/Makefile ] && echo $d; done";
1294
+ const { stdout: brokenSubs } = await ssh(checkSubs);
1295
+ const broken = (brokenSubs || "").trim().split("\n").filter(Boolean);
1296
+ if (broken.length > 0) {
1297
+ console.log(chalk.yellow(` ↻ Fixing ${broken.length} broken submodule(s): ${broken.join(", ")}`));
1298
+ for (const sub of broken) {
1299
+ await ssh(`cd /opt/foundation-compose && sudo rm -rf ${sub} && git checkout ${sub} && git submodule update --init --recursive --depth 1 ${sub}`, 60000);
1300
+ }
1301
+ await ssh("sudo chown -R azureuser:azureuser /opt/foundation-compose");
1302
+ }
1284
1303
  reconcileOk("Repository", "/opt/foundation-compose");
1285
1304
  return;
1286
1305
  }
@@ -47,6 +47,8 @@ export class AzureService {
47
47
  publicIp: vm.publicIp,
48
48
  publicUrl: vm.publicUrl,
49
49
  subscriptionId: vm.subscriptionId,
50
+ vmSize: vm.vmSize || null,
51
+ image: vm.image || null,
50
52
  active: name === activeVm,
51
53
  createdAt: vm.createdAt || vm.discoveredAt || null,
52
54
  }));
@@ -263,7 +263,7 @@ export async function azureUp(opts = {}) {
263
263
 
264
264
  // Persist IP immediately so it's never lost if later steps fail or user Ctrl+C's
265
265
  const publicUrl = opts.url || defaultUrl;
266
- writeVmState(vmName, { resourceGroup: rg, location, publicIp, publicUrl, subscriptionId: subId, createdAt: new Date().toISOString() });
266
+ writeVmState(vmName, { resourceGroup: rg, location, publicIp, publicUrl, subscriptionId: subId, vmSize, image, createdAt: new Date().toISOString() });
267
267
 
268
268
  hint("Enabling accelerated networking…");
269
269
  const nicName = `${vmName}VMNic`;