kova-node-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1562 @@
1
+ // deployment executor - runs deployments from sdl manifests
2
+ // handles multi-service deployments, persistent volumes, port exposure
3
+ import { EventEmitter } from 'events';
4
+ import { PassThrough } from 'stream';
5
+ import { logger } from '../lib/logger.js';
6
+ import Docker from 'dockerode';
7
+ import { createHash } from 'crypto';
8
+ export class DeploymentExecutor extends EventEmitter {
9
// dockerode client handle used for all docker engine operations
docker;
// deploymentId -> execution record { deploymentId, leaseId, manifest, containers, networks, volumes }
executions = new Map();
// base URL of the kova orchestrator API (file download endpoint etc.)
orchestratorUrl;
// bearer token used when talking to the orchestrator
apiKey;
13
+ constructor(config) {
14
+ super();
15
+ this.docker = new Docker();
16
+ this.orchestratorUrl = config?.orchestratorUrl || process.env.KOVA_ORCHESTRATOR_URL || 'http://localhost:3000';
17
+ this.apiKey = config?.apiKey || '';
18
+ }
19
+ // execute deployment from manifest
20
+ async executeDeployment(options) {
21
+ const { deploymentId, leaseId, manifest } = options;
22
+ logger.info({ deploymentId, leaseId }, 'executing deployment');
23
+ const execution = {
24
+ deploymentId,
25
+ leaseId,
26
+ manifest,
27
+ containers: new Map(),
28
+ networks: [],
29
+ volumes: []
30
+ };
31
+ this.executions.set(deploymentId, execution);
32
+ try {
33
+ // create isolated network for this deployment
34
+ const networkName = `kova-deploy-${deploymentId.slice(-8)}`;
35
+ let network;
36
+ try {
37
+ network = await this.docker.createNetwork({
38
+ Name: networkName,
39
+ Driver: 'bridge',
40
+ Internal: false
41
+ });
42
+ logger.info({ deploymentId, networkName }, 'created deployment network');
43
+ }
44
+ catch (err) {
45
+ if (err.statusCode === 409 || err.message?.includes('already exists')) {
46
+ // network already exists, get it
47
+ const networks = await this.docker.listNetworks({
48
+ filters: { name: [networkName] }
49
+ });
50
+ network = networks[0] ? this.docker.getNetwork(networks[0].Id) : null;
51
+ if (network) {
52
+ logger.info({ deploymentId, networkName }, 'using existing network');
53
+ }
54
+ else {
55
+ throw new Error(`network ${networkName} exists but could not be retrieved`);
56
+ }
57
+ }
58
+ else {
59
+ throw err;
60
+ }
61
+ }
62
+ execution.networks.push(network.id);
63
+ // create persistent volumes if needed
64
+ for (const [serviceName, service] of Object.entries(manifest.services)) {
65
+ if (service.params?.storage) {
66
+ for (const [volumeName, volumeConfig] of Object.entries(service.params.storage)) {
67
+ const volumeFullName = `kova-${deploymentId}-${serviceName}-${volumeName}`;
68
+ const volume = await this.docker.createVolume({
69
+ Name: volumeFullName,
70
+ Driver: 'local'
71
+ });
72
+ execution.volumes.push(volume.Name);
73
+ logger.info({ deploymentId, volumeName: volumeFullName }, 'created persistent volume');
74
+ // if source is "uploads", download and populate volume
75
+ if (volumeConfig.source === 'uploads') {
76
+ await this.populateVolumeFromUploads(deploymentId, serviceName, volumeFullName);
77
+ }
78
+ }
79
+ }
80
+ }
81
+ // sort services by depends_on so dependencies start first (topological order)
82
+ const serviceEntries = Object.entries(manifest.services);
83
+ const sorted = this.topologicalSort(serviceEntries);
84
+ // start each service in dependency order, respecting replica count
85
+ for (const [serviceName, service] of sorted) {
86
+ // extract gpu config from profiles if available
87
+ let gpu;
88
+ let replicaCount = 1;
89
+ if (manifest.profiles?.compute) {
90
+ // find matching compute profile for this service
91
+ for (const [profileName, profile] of Object.entries(manifest.profiles.compute)) {
92
+ if (profile.resources?.gpu) {
93
+ gpu = profile.resources.gpu;
94
+ break;
95
+ }
96
+ }
97
+ }
98
+ // get replica count from deployment section
99
+ const serviceDeployment = manifest.deployment?.[serviceName];
100
+ if (serviceDeployment) {
101
+ for (const [, config] of Object.entries(serviceDeployment)) {
102
+ if (config?.count && config.count > 1) {
103
+ replicaCount = Math.min(config.count, 20); // cap at 20 replicas
104
+ break;
105
+ }
106
+ }
107
+ }
108
+ if (replicaCount > 1) {
109
+ logger.info({ deploymentId, serviceName, replicaCount }, 'starting service replicas');
110
+ for (let i = 0; i < replicaCount; i++) {
111
+ const replicaName = `${serviceName}-${i}`;
112
+ await this.startService(deploymentId, replicaName, service, execution, networkName, gpu);
113
+ }
114
+ }
115
+ else {
116
+ await this.startService(deploymentId, serviceName, service, execution, networkName, gpu);
117
+ }
118
+ }
119
+ this.emit('deployment-started', { deploymentId, leaseId });
120
+ logger.info({ deploymentId, services: execution.containers.size }, 'deployment running');
121
+ }
122
+ catch (err) {
123
+ logger.error({ err, deploymentId }, 'deployment execution failed');
124
+ await this.cleanupDeployment(deploymentId, true); // cleanup all resources on failure
125
+ throw err;
126
+ }
127
+ }
128
+ // parse memory string like "512Mi", "2Gi" into bytes
129
+ parseMemoryToBytes(size) {
130
+ const units = {
131
+ 'K': 1000, 'M': 1000 ** 2, 'G': 1000 ** 3, 'T': 1000 ** 4,
132
+ 'Ki': 1024, 'Mi': 1024 ** 2, 'Gi': 1024 ** 3, 'Ti': 1024 ** 4,
133
+ };
134
+ const match = size.match(/^(\d+(?:\.\d+)?)\s*([A-Za-z]+)$/);
135
+ if (!match)
136
+ return 4 * 1024 * 1024 * 1024; // 4gb fallback
137
+ const value = parseFloat(match[1]);
138
+ const unit = match[2];
139
+ return Math.floor(value * (units[unit] || 1));
140
+ }
141
+ // start a single service
142
+ async startService(deploymentId, serviceName, service, execution, networkName, gpu) {
143
+ logger.info({ deploymentId, serviceName, image: service.image }, 'starting service');
144
+ // convert env to docker format
145
+ let env = [];
146
+ if (service.env) {
147
+ if (Array.isArray(service.env)) {
148
+ env = service.env;
149
+ }
150
+ else {
151
+ env = Object.entries(service.env).map(([k, v]) => `${k}=${v}`);
152
+ }
153
+ }
154
+ // setup volume binds and tmpfs mounts for ram class storage
155
+ const binds = [];
156
+ const tmpfs = {};
157
+ if (service.params?.storage) {
158
+ // look up storage resources from the compute profile to check classes
159
+ const serviceDeployment = execution?.manifest?.deployment?.[serviceName];
160
+ let storageResources = [];
161
+ if (serviceDeployment) {
162
+ for (const [, config] of Object.entries(serviceDeployment)) {
163
+ if (config?.profile) {
164
+ const profile = execution?.manifest?.profiles?.compute?.[config.profile];
165
+ if (profile?.resources?.storage) {
166
+ storageResources = Array.isArray(profile.resources.storage)
167
+ ? profile.resources.storage : [profile.resources.storage];
168
+ }
169
+ break;
170
+ }
171
+ }
172
+ }
173
+ for (const [volumeName, volumeConfig] of Object.entries(service.params.storage)) {
174
+ const mountPath = volumeConfig.mount;
175
+ // check if this volume's storage class is 'ram' (shared memory / tmpfs)
176
+ const matchingStorage = storageResources.find((s) => s.name === volumeName);
177
+ if (matchingStorage?.attributes?.class === 'ram') {
178
+ // create tmpfs mount instead of docker volume
179
+ const sizeBytes = this.parseMemoryToBytes(matchingStorage.size || '64Mi');
180
+ tmpfs[mountPath] = `size=${sizeBytes}`;
181
+ logger.info({ deploymentId, serviceName, volumeName, mountPath, size: matchingStorage.size }, 'using tmpfs for ram class storage');
182
+ }
183
+ else {
184
+ const volumeFullName = `kova-${deploymentId}-${serviceName}-${volumeName}`;
185
+ const mode = volumeConfig.readOnly ? 'ro' : 'rw';
186
+ binds.push(`${volumeFullName}:${mountPath}:${mode}`);
187
+ }
188
+ }
189
+ }
190
+ // setup port exposure (internal only, no host binding)
191
+ // ingress controller will proxy to these ports via docker network
192
+ const exposedPorts = {};
193
+ if (service.expose) {
194
+ for (const expose of service.expose) {
195
+ const containerPort = expose.port;
196
+ exposedPorts[`${containerPort}/tcp`] = {};
197
+ }
198
+ }
199
+ // pull image first (pass credentials for private registries)
200
+ try {
201
+ await this.pullImage(service.image, deploymentId, serviceName, service.credentials);
202
+ }
203
+ catch (err) {
204
+ logger.error({ err, image: service.image }, 'failed to pull image');
205
+ throw err;
206
+ }
207
+ // create container
208
+ const containerName = `kova-${deploymentId}-${serviceName}`;
209
+ let container;
210
+ let isExisting = false;
211
+ // check if container already exists
212
+ try {
213
+ const existing = this.docker.getContainer(containerName);
214
+ const info = await existing.inspect();
215
+ if (info.State.Running) {
216
+ // container is already running, reuse it
217
+ container = existing;
218
+ isExisting = true;
219
+ logger.info({ containerName, containerId: info.Id }, 'reusing existing running container');
220
+ }
221
+ else {
222
+ // container exists but not running, remove and recreate
223
+ await existing.remove({ force: true });
224
+ logger.info({ containerName }, 'removed stopped container');
225
+ }
226
+ }
227
+ catch (err) {
228
+ // container doesn't exist, will create new one
229
+ }
230
+ if (!isExisting) {
231
+ // figure out resource limits from the service's compute profile
232
+ let memoryLimit = 4 * 1024 * 1024 * 1024; // 4gb default
233
+ let cpuCores = 4; // 4 cores default
234
+ // look up the compute profile mapped to this specific service
235
+ const serviceDeployment = execution.manifest.deployment?.[serviceName];
236
+ let profileName = null;
237
+ if (serviceDeployment) {
238
+ // deployment section: { serviceName: { placementName: { profile: "profileName", count: N } } }
239
+ for (const [, config] of Object.entries(serviceDeployment)) {
240
+ if (config?.profile) {
241
+ profileName = config.profile;
242
+ break;
243
+ }
244
+ }
245
+ }
246
+ const profiles = execution.manifest.profiles?.compute;
247
+ if (profiles) {
248
+ // use the mapped profile for this service, or fall back to first available
249
+ const profile = profileName && profiles[profileName]
250
+ ? profiles[profileName]
251
+ : Object.values(profiles)[0];
252
+ if (profile?.resources) {
253
+ const res = profile.resources;
254
+ if (res.memory?.size) {
255
+ memoryLimit = this.parseMemoryToBytes(res.memory.size);
256
+ }
257
+ if (res.cpu?.units) {
258
+ cpuCores = parseFloat(res.cpu.units) || 4;
259
+ }
260
+ }
261
+ }
262
+ // clamp to sane limits
263
+ const maxMemory = 32 * 1024 * 1024 * 1024; // 32gb hard ceiling
264
+ memoryLimit = Math.min(memoryLimit, maxMemory);
265
+ cpuCores = Math.min(cpuCores, 32);
266
+ const containerConfig = {
267
+ name: containerName,
268
+ Image: service.image,
269
+ Env: env,
270
+ ExposedPorts: exposedPorts,
271
+ HostConfig: {
272
+ NetworkMode: networkName,
273
+ Binds: binds,
274
+ ReadonlyRootfs: false,
275
+ AutoRemove: false,
276
+ RestartPolicy: {
277
+ Name: execution.restartPolicy || 'on-failure',
278
+ MaximumRetryCount: (execution.restartPolicy || 'on-failure') === 'on-failure'
279
+ ? (execution.restartMaxRetries || 3)
280
+ : 0
281
+ },
282
+ // resource limits based on what was ordered
283
+ Memory: memoryLimit,
284
+ MemorySwap: memoryLimit,
285
+ CpuPeriod: 100000,
286
+ CpuQuota: Math.floor(cpuCores * 100000),
287
+ Privileged: false,
288
+ PidsLimit: 256,
289
+ SecurityOpt: ['no-new-privileges:true'],
290
+ CapDrop: ['ALL'],
291
+ CapAdd: ['CHOWN', 'NET_BIND_SERVICE'],
292
+ // tmpfs mounts for ram class storage (shared memory)
293
+ ...(Object.keys(tmpfs).length > 0 ? { Tmpfs: tmpfs } : {})
294
+ },
295
+ Labels: {
296
+ 'kova.deployment': deploymentId,
297
+ 'kova.service': serviceName,
298
+ 'kova.lease': execution.leaseId
299
+ }
300
+ };
301
+ // add gpu device request if specified
302
+ if (gpu && gpu.units > 0) {
303
+ containerConfig.HostConfig.DeviceRequests = [{
304
+ Driver: '',
305
+ Count: gpu.units,
306
+ DeviceIDs: [],
307
+ Capabilities: [['gpu']],
308
+ Options: {}
309
+ }];
310
+ logger.info({ deploymentId, serviceName, gpuUnits: gpu.units }, 'requesting gpu access');
311
+ }
312
+ // add command override if specified (docker CMD)
313
+ if (service.command && service.command.length > 0) {
314
+ containerConfig.Cmd = service.command;
315
+ logger.info({ deploymentId, serviceName, command: service.command }, 'using custom command');
316
+ }
317
+ // add entrypoint args if specified
318
+ if (service.args && service.args.length > 0) {
319
+ containerConfig.Entrypoint = service.args;
320
+ logger.info({ deploymentId, serviceName, args: service.args }, 'using custom entrypoint');
321
+ }
322
+ container = await this.docker.createContainer(containerConfig);
323
+ // start container
324
+ await container.start();
325
+ logger.info({ deploymentId, serviceName, containerId: container.id }, 'service started');
326
+ }
327
+ execution.containers.set(serviceName, container.id);
328
+ // start streaming logs
329
+ this.streamLogs(container, deploymentId, serviceName);
330
+ }
331
+ // pull docker image with progress, optionally using private registry credentials
332
+ async pullImage(image, deploymentId, serviceName, credentials) {
333
+ const pullOptions = {};
334
+ if (credentials) {
335
+ pullOptions.authconfig = {
336
+ username: credentials.username,
337
+ password: credentials.password,
338
+ serveraddress: credentials.host,
339
+ ...(credentials.email ? { email: credentials.email } : {})
340
+ };
341
+ logger.info({ deploymentId, serviceName, registry: credentials.host }, 'using private registry credentials');
342
+ }
343
+ return new Promise((resolve, reject) => {
344
+ this.docker.pull(image, pullOptions, (err, stream) => {
345
+ if (err) {
346
+ return reject(err);
347
+ }
348
+ this.docker.modem.followProgress(stream, (err) => {
349
+ if (err) {
350
+ this.emitLog(deploymentId, serviceName, `failed to pull ${image}: ${err.message}`, 'stderr');
351
+ return reject(err);
352
+ }
353
+ this.emitLog(deploymentId, serviceName, `pulled ${image}`, 'stdout');
354
+ resolve();
355
+ }, (event) => {
356
+ if (event.status) {
357
+ this.emitLog(deploymentId, serviceName, `[pull] ${event.status}`, 'stdout');
358
+ }
359
+ });
360
+ });
361
+ });
362
+ }
363
+ // stream container logs
364
+ streamLogs(container, deploymentId, serviceName) {
365
+ container.logs({
366
+ follow: true,
367
+ stdout: true,
368
+ stderr: true,
369
+ timestamps: false
370
+ }, (err, stream) => {
371
+ if (err) {
372
+ logger.error({ err }, 'failed to attach to container logs');
373
+ return;
374
+ }
375
+ // docker multiplexes stdout/stderr streams, need to demux
376
+ const stdout = new PassThrough();
377
+ const stderr = new PassThrough();
378
+ container.modem.demuxStream(stream, stdout, stderr);
379
+ stdout.on('data', (chunk) => {
380
+ const logLine = chunk.toString('utf8').trim();
381
+ if (logLine) {
382
+ this.emitLog(deploymentId, serviceName, logLine, 'stdout');
383
+ }
384
+ });
385
+ stderr.on('data', (chunk) => {
386
+ const logLine = chunk.toString('utf8').trim();
387
+ if (logLine) {
388
+ this.emitLog(deploymentId, serviceName, logLine, 'stderr');
389
+ }
390
+ });
391
+ stream.on('end', () => {
392
+ logger.info({ deploymentId, serviceName }, 'log stream ended');
393
+ });
394
+ stream.on('error', (err) => {
395
+ logger.error({ err, deploymentId, serviceName }, 'log stream error');
396
+ });
397
+ });
398
+ }
399
+ // emit log entry
400
+ emitLog(deploymentId, serviceName, logLine, stream) {
401
+ this.emit('log', {
402
+ deploymentId,
403
+ serviceName,
404
+ logLine,
405
+ stream,
406
+ timestamp: new Date()
407
+ });
408
+ }
409
+ // stop deployment (preserves persistent volumes for restart)
410
+ async stopDeployment(deploymentId) {
411
+ const execution = this.executions.get(deploymentId);
412
+ if (!execution) {
413
+ logger.warn({ deploymentId }, 'deployment not found');
414
+ return;
415
+ }
416
+ await this.cleanupDeployment(deploymentId, false);
417
+ logger.info({ deploymentId }, 'deployment stopped');
418
+ }
419
+ // close deployment permanently (deletes all resources including persistent volumes)
420
+ async closeDeployment(deploymentId) {
421
+ const execution = this.executions.get(deploymentId);
422
+ if (!execution) {
423
+ // try to find and clean up volumes anyway
424
+ await this.cleanupVolumes(deploymentId);
425
+ logger.warn({ deploymentId }, 'deployment not in memory, cleaned up volumes');
426
+ return;
427
+ }
428
+ await this.cleanupDeployment(deploymentId, true);
429
+ logger.info({ deploymentId }, 'deployment closed permanently');
430
+ }
431
+ // cleanup deployment resources
432
+ async cleanupDeployment(deploymentId, deleteVolumes) {
433
+ const execution = this.executions.get(deploymentId);
434
+ if (!execution)
435
+ return;
436
+ // stop and remove containers
437
+ for (const [serviceName, containerId] of execution.containers.entries()) {
438
+ try {
439
+ const container = this.docker.getContainer(containerId);
440
+ await container.stop({ t: 10 });
441
+ await container.remove();
442
+ logger.info({ deploymentId, serviceName }, 'container removed');
443
+ }
444
+ catch (err) {
445
+ logger.debug({ err, containerId }, 'failed to remove container');
446
+ }
447
+ }
448
+ // remove networks
449
+ for (const networkId of execution.networks) {
450
+ try {
451
+ const network = this.docker.getNetwork(networkId);
452
+ await network.remove();
453
+ logger.info({ deploymentId, networkId }, 'network removed');
454
+ }
455
+ catch (err) {
456
+ logger.debug({ err, networkId }, 'failed to remove network');
457
+ }
458
+ }
459
+ // remove volumes if requested (deployment closed permanently)
460
+ if (deleteVolumes) {
461
+ await this.cleanupVolumes(deploymentId);
462
+ }
463
+ else {
464
+ logger.info({ deploymentId, volumeCount: execution.volumes.length }, 'preserving persistent volumes');
465
+ }
466
+ this.executions.delete(deploymentId);
467
+ }
468
+ // cleanup volumes for a deployment
469
+ async cleanupVolumes(deploymentId) {
470
+ try {
471
+ const volumes = await this.docker.listVolumes({
472
+ filters: {
473
+ name: [`kova-${deploymentId}`]
474
+ }
475
+ });
476
+ for (const vol of volumes.Volumes || []) {
477
+ try {
478
+ const volume = this.docker.getVolume(vol.Name);
479
+ await volume.remove();
480
+ logger.info({ volumeName: vol.Name }, 'volume removed');
481
+ }
482
+ catch (err) {
483
+ logger.debug({ err, volumeName: vol.Name }, 'failed to remove volume');
484
+ }
485
+ }
486
+ }
487
+ catch (err) {
488
+ logger.error({ err, deploymentId }, 'failed to cleanup volumes');
489
+ }
490
+ }
491
+ // topological sort of services by depends_on (dependencies start first)
492
+ topologicalSort(services) {
493
+ const serviceMap = new Map(services);
494
+ const sorted = [];
495
+ const visited = new Set();
496
+ const visiting = new Set(); // cycle detection
497
+ const visit = (name) => {
498
+ if (visited.has(name))
499
+ return;
500
+ if (visiting.has(name)) {
501
+ logger.warn({ service: name }, 'circular dependency detected, breaking cycle');
502
+ return;
503
+ }
504
+ visiting.add(name);
505
+ const service = serviceMap.get(name);
506
+ if (service?.depends_on) {
507
+ for (const dep of service.depends_on) {
508
+ if (serviceMap.has(dep)) {
509
+ visit(dep);
510
+ }
511
+ else {
512
+ logger.warn({ service: name, dependency: dep }, 'depends_on references unknown service, ignoring');
513
+ }
514
+ }
515
+ }
516
+ visiting.delete(name);
517
+ visited.add(name);
518
+ if (service) {
519
+ sorted.push([name, service]);
520
+ }
521
+ };
522
+ for (const [name] of services) {
523
+ visit(name);
524
+ }
525
+ return sorted;
526
+ }
527
+ // get docker events for containers in a deployment
528
+ async getContainerEvents(deploymentId) {
529
+ const execution = this.executions.get(deploymentId);
530
+ if (!execution) {
531
+ return { error: 'deployment not found', events: [] };
532
+ }
533
+ const containerIds = Array.from(execution.containers.values());
534
+ if (containerIds.length === 0) {
535
+ return { deploymentId, events: [] };
536
+ }
537
+ const events = [];
538
+ for (const [serviceName, containerId] of execution.containers.entries()) {
539
+ try {
540
+ const container = this.docker.getContainer(containerId);
541
+ const info = await container.inspect();
542
+ // synthesize events from container state
543
+ events.push({
544
+ type: 'container',
545
+ action: 'create',
546
+ service: serviceName,
547
+ containerId: containerId.slice(0, 12),
548
+ image: info.Config.Image,
549
+ time: new Date(info.Created).toISOString()
550
+ });
551
+ if (info.State.StartedAt && info.State.StartedAt !== '0001-01-01T00:00:00Z') {
552
+ events.push({
553
+ type: 'container',
554
+ action: 'start',
555
+ service: serviceName,
556
+ containerId: containerId.slice(0, 12),
557
+ image: info.Config.Image,
558
+ time: info.State.StartedAt
559
+ });
560
+ }
561
+ if (info.State.FinishedAt && info.State.FinishedAt !== '0001-01-01T00:00:00Z' && !info.State.Running) {
562
+ events.push({
563
+ type: 'container',
564
+ action: 'stop',
565
+ service: serviceName,
566
+ containerId: containerId.slice(0, 12),
567
+ exitCode: info.State.ExitCode,
568
+ time: info.State.FinishedAt
569
+ });
570
+ }
571
+ // check health status if configured
572
+ if (info.State.Health) {
573
+ const health = info.State.Health;
574
+ events.push({
575
+ type: 'health',
576
+ action: health.Status, // healthy, unhealthy, starting
577
+ service: serviceName,
578
+ containerId: containerId.slice(0, 12),
579
+ failingStreak: health.FailingStreak,
580
+ time: health.Log?.length > 0
581
+ ? health.Log[health.Log.length - 1].End
582
+ : new Date().toISOString()
583
+ });
584
+ }
585
+ }
586
+ catch (err) {
587
+ events.push({
588
+ type: 'error',
589
+ action: 'inspect_failed',
590
+ service: serviceName,
591
+ containerId: containerId.slice(0, 12),
592
+ error: err.message,
593
+ time: new Date().toISOString()
594
+ });
595
+ }
596
+ }
597
+ // sort events by time
598
+ events.sort((a, b) => new Date(a.time).getTime() - new Date(b.time).getTime());
599
+ return { deploymentId, timestamp: Date.now(), events };
600
+ }
601
+ // get running deployments
602
+ getRunningDeployments() {
603
+ return Array.from(this.executions.keys());
604
+ }
605
+ // get deployment info
606
+ getDeployment(deploymentId) {
607
+ return this.executions.get(deploymentId);
608
+ }
609
+ // get real-time docker stats for all containers in a deployment
610
+ async getDeploymentStats(deploymentId) {
611
+ const execution = this.executions.get(deploymentId);
612
+ if (!execution) {
613
+ return { error: 'deployment not found', services: {} };
614
+ }
615
+ const services = {};
616
+ for (const [serviceName, containerId] of execution.containers.entries()) {
617
+ try {
618
+ const container = this.docker.getContainer(containerId);
619
+ // one-shot stats (stream: false) to avoid hanging
620
+ const stats = await container.stats({ stream: false });
621
+ // calculate cpu usage percentage
622
+ const cpuDelta = stats.cpu_stats.cpu_usage.total_usage - (stats.precpu_stats?.cpu_usage?.total_usage || 0);
623
+ const systemDelta = stats.cpu_stats.system_cpu_usage - (stats.precpu_stats?.system_cpu_usage || 0);
624
+ const numCpus = stats.cpu_stats.online_cpus || stats.cpu_stats.cpu_usage?.percpu_usage?.length || 1;
625
+ const cpuPercent = systemDelta > 0 ? (cpuDelta / systemDelta) * numCpus * 100 : 0;
626
+ // memory
627
+ const memUsage = stats.memory_stats.usage || 0;
628
+ const memLimit = stats.memory_stats.limit || 0;
629
+ const memCache = stats.memory_stats.stats?.cache || 0;
630
+ const memActual = memUsage - memCache;
631
+ const memPercent = memLimit > 0 ? (memActual / memLimit) * 100 : 0;
632
+ // network i/o
633
+ let netRx = 0, netTx = 0;
634
+ if (stats.networks) {
635
+ for (const iface of Object.values(stats.networks)) {
636
+ netRx += iface.rx_bytes || 0;
637
+ netTx += iface.tx_bytes || 0;
638
+ }
639
+ }
640
+ // block i/o
641
+ let blockRead = 0, blockWrite = 0;
642
+ if (stats.blkio_stats?.io_service_bytes_recursive) {
643
+ for (const entry of stats.blkio_stats.io_service_bytes_recursive) {
644
+ if (entry.op === 'read' || entry.op === 'Read')
645
+ blockRead += entry.value;
646
+ if (entry.op === 'write' || entry.op === 'Write')
647
+ blockWrite += entry.value;
648
+ }
649
+ }
650
+ services[serviceName] = {
651
+ containerId: containerId.slice(0, 12),
652
+ cpu: { percent: Math.round(cpuPercent * 100) / 100, cores: numCpus },
653
+ memory: {
654
+ used: memActual,
655
+ limit: memLimit,
656
+ percent: Math.round(memPercent * 100) / 100,
657
+ usedFormatted: this.formatBytes(memActual),
658
+ limitFormatted: this.formatBytes(memLimit)
659
+ },
660
+ network: {
661
+ rx: netRx,
662
+ tx: netTx,
663
+ rxFormatted: this.formatBytes(netRx),
664
+ txFormatted: this.formatBytes(netTx)
665
+ },
666
+ blockIo: {
667
+ read: blockRead,
668
+ write: blockWrite,
669
+ readFormatted: this.formatBytes(blockRead),
670
+ writeFormatted: this.formatBytes(blockWrite)
671
+ },
672
+ pids: stats.pids_stats?.current || 0
673
+ };
674
+ }
675
+ catch (err) {
676
+ services[serviceName] = { error: err.message, containerId: containerId.slice(0, 12) };
677
+ }
678
+ }
679
+ return { deploymentId, timestamp: Date.now(), services };
680
+ }
681
+ // format bytes to human readable
682
+ formatBytes(bytes) {
683
+ if (bytes === 0)
684
+ return '0 B';
685
+ const units = ['B', 'KB', 'MB', 'GB', 'TB'];
686
+ const i = Math.floor(Math.log(bytes) / Math.log(1024));
687
+ return `${(bytes / Math.pow(1024, i)).toFixed(1)} ${units[i]}`;
688
+ }
689
+ // get container running status for all services in a deployment
690
+ async getDeploymentStatus(deploymentId) {
691
+ const execution = this.executions.get(deploymentId);
692
+ if (!execution) {
693
+ return { error: 'deployment not found', services: {} };
694
+ }
695
+ const services = {};
696
+ for (const [serviceName, containerId] of execution.containers.entries()) {
697
+ try {
698
+ const container = this.docker.getContainer(containerId);
699
+ const info = await container.inspect();
700
+ services[serviceName] = {
701
+ containerId: containerId.slice(0, 12),
702
+ running: info.State.Running,
703
+ status: info.State.Status, // running, exited, paused, restarting, dead
704
+ startedAt: info.State.StartedAt,
705
+ finishedAt: info.State.FinishedAt,
706
+ exitCode: info.State.ExitCode,
707
+ restartCount: info.RestartCount,
708
+ image: info.Config.Image,
709
+ ports: Object.keys(info.Config.ExposedPorts || {}).map(p => {
710
+ const [port, proto] = p.split('/');
711
+ return { port: parseInt(port), protocol: proto || 'tcp' };
712
+ })
713
+ };
714
+ }
715
+ catch (err) {
716
+ services[serviceName] = { error: err.message, containerId: containerId.slice(0, 12) };
717
+ }
718
+ }
719
+ return { deploymentId, timestamp: Date.now(), services };
720
+ }
721
+ // discover existing deployments on startup
722
+ async discoverExistingDeployments() {
723
+ logger.info('discovering existing kova deployments...');
724
+ try {
725
+ // find all containers with kova.deployment label
726
+ const containers = await this.docker.listContainers({
727
+ filters: { label: ['kova.deployment'] }
728
+ });
729
+ for (const containerInfo of containers) {
730
+ const deploymentId = containerInfo.Labels['kova.deployment'];
731
+ const serviceName = containerInfo.Labels['kova.service'] || 'web';
732
+ if (!deploymentId)
733
+ continue;
734
+ // skip if already tracked
735
+ if (this.executions.has(deploymentId))
736
+ continue;
737
+ logger.info({ deploymentId, serviceName, containerId: containerInfo.Id }, 'discovered existing deployment');
738
+ // get full container details
739
+ const container = this.docker.getContainer(containerInfo.Id);
740
+ const inspect = await container.inspect();
741
+ // extract volumes from mounts
742
+ const volumes = [];
743
+ for (const mount of inspect.Mounts || []) {
744
+ if (mount.Type === 'volume' && mount.Name) {
745
+ volumes.push(mount.Name);
746
+ }
747
+ }
748
+ // extract network
749
+ const networks = Object.keys(inspect.NetworkSettings.Networks || {});
750
+ const networkId = networks.length > 0 ? inspect.NetworkSettings.Networks[networks[0]].NetworkID : '';
751
+ // create execution record
752
+ const execution = {
753
+ deploymentId,
754
+ leaseId: containerInfo.Labels['kova.lease'] || '',
755
+ manifest: {
756
+ version: '2.0',
757
+ services: {},
758
+ profiles: {},
759
+ deployment: {}
760
+ },
761
+ containers: new Map([[serviceName, containerInfo.Id]]),
762
+ volumes,
763
+ networks: networkId ? [networkId] : []
764
+ };
765
+ this.executions.set(deploymentId, execution);
766
+ // start streaming logs from discovered container
767
+ try {
768
+ const container = this.docker.getContainer(containerInfo.Id);
769
+ this.streamLogs(container, deploymentId, serviceName);
770
+ logger.info({ deploymentId, serviceName }, 'log streaming attached to discovered container');
771
+ }
772
+ catch (err) {
773
+ logger.warn({ err, deploymentId }, 'failed to attach log streaming to discovered container');
774
+ }
775
+ logger.info({ deploymentId, volumes: volumes.length }, 'deployment state restored');
776
+ }
777
+ logger.info({ count: this.executions.size }, 'deployment discovery complete');
778
+ }
779
+ catch (err) {
780
+ logger.error({ err }, 'failed to discover existing deployments');
781
+ }
782
+ }
783
+ // download and populate volume with uploaded files from orchestrator
784
+ async populateVolumeFromUploads(deploymentId, serviceName, volumeName) {
785
+ const https = await import('https');
786
+ const http = await import('http');
787
+ const fs = await import('fs');
788
+ const tar = await import('tar');
789
+ const path = await import('path');
790
+ const os = await import('os');
791
+ logger.info({ deploymentId, serviceName, volumeName }, 'downloading files from orchestrator');
792
+ const orchestratorUrl = this.orchestratorUrl;
793
+ const downloadUrl = `${orchestratorUrl}/api/v1/deployments/${deploymentId}/services/${serviceName}/files/download`;
794
+ // use api key for auth
795
+ const authToken = this.apiKey || process.env.PROVIDER_TOKEN || '';
796
+ // max download size (100mb to prevent disk exhaustion)
797
+ const maxDownloadSize = 100 * 1024 * 1024;
798
+ try {
799
+ // download tarball to temp file
800
+ const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'kova-download-'));
801
+ const tarballPath = path.join(tempDir, 'files.tar.gz');
802
+ const downloadResult = await new Promise((resolve, reject) => {
803
+ const proto = orchestratorUrl.startsWith('https') ? https : http;
804
+ const hash = createHash('sha256');
805
+ let downloadedSize = 0;
806
+ const req = proto.get(downloadUrl, {
807
+ headers: {
808
+ 'Authorization': `Bearer ${authToken}`
809
+ }
810
+ }, (res) => {
811
+ if (res.statusCode === 404) {
812
+ logger.info({ deploymentId, serviceName }, 'no uploaded files found, skipping');
813
+ resolve({});
814
+ return;
815
+ }
816
+ if (res.statusCode !== 200) {
817
+ reject(new Error(`failed to download files: ${res.statusCode} ${res.statusMessage}`));
818
+ return;
819
+ }
820
+ // get expected checksum from header if provided
821
+ const expectedChecksum = res.headers['x-checksum'];
822
+ const fileStream = fs.createWriteStream(tarballPath);
823
+ res.on('data', (chunk) => {
824
+ downloadedSize += chunk.length;
825
+ // check size limit
826
+ if (downloadedSize > maxDownloadSize) {
827
+ req.destroy();
828
+ fileStream.destroy();
829
+ fs.rmSync(tempDir, { recursive: true, force: true });
830
+ reject(new Error(`download exceeds size limit of ${maxDownloadSize} bytes`));
831
+ return;
832
+ }
833
+ hash.update(chunk);
834
+ });
835
+ res.pipe(fileStream);
836
+ fileStream.on('finish', () => {
837
+ fileStream.close();
838
+ const actualChecksum = hash.digest('hex');
839
+ // verify checksum if provided
840
+ if (expectedChecksum && actualChecksum !== expectedChecksum) {
841
+ fs.rmSync(tempDir, { recursive: true, force: true });
842
+ reject(new Error(`checksum mismatch: expected ${expectedChecksum}, got ${actualChecksum}`));
843
+ return;
844
+ }
845
+ logger.info({ deploymentId, size: downloadedSize, checksum: actualChecksum }, 'file download verified');
846
+ resolve({ checksum: actualChecksum });
847
+ });
848
+ fileStream.on('error', reject);
849
+ });
850
+ req.on('error', reject);
851
+ req.end();
852
+ });
853
+ // check if tarball was downloaded
854
+ if (!fs.existsSync(tarballPath)) {
855
+ logger.info({ deploymentId, serviceName }, 'no files to populate volume');
856
+ fs.rmSync(tempDir, { recursive: true, force: true });
857
+ return;
858
+ }
859
+ // extract tarball to temp directory
860
+ const extractDir = path.join(tempDir, 'extracted');
861
+ fs.mkdirSync(extractDir, { recursive: true });
862
+ await tar.extract({
863
+ file: tarballPath,
864
+ cwd: extractDir,
865
+ // prevent zip-slip: strip leading slashes and block path traversal
866
+ strip: 0,
867
+ filter: (path) => {
868
+ if (path.includes('..')) {
869
+ logger.warn({ path }, 'blocked path traversal attempt in tar');
870
+ return false;
871
+ }
872
+ return true;
873
+ }
874
+ });
875
+ // copy files to volume using a temporary container
876
+ // mount volume and copy files from temp directory
877
+ const containerName = `kova-temp-copy-${Date.now()}`;
878
+ await this.docker.run('alpine:latest', ['sh', '-c', `cp -r /source/. /dest/`], process.stdout, {
879
+ name: containerName,
880
+ HostConfig: {
881
+ Binds: [
882
+ `${volumeName}:/dest`,
883
+ `${extractDir}:/source:ro`
884
+ ],
885
+ AutoRemove: true
886
+ }
887
+ });
888
+ logger.info({ deploymentId, serviceName, volumeName }, 'files populated to volume');
889
+ // cleanup temp directory
890
+ fs.rmSync(tempDir, { recursive: true, force: true });
891
+ }
892
+ catch (err) {
893
+ logger.error({ err, deploymentId, serviceName }, 'failed to populate volume from uploads');
894
+ throw err;
895
+ }
896
+ }
897
+ // update files in existing deployment volume and restart containers
898
+ async updateDeploymentFiles(deploymentId, serviceName) {
899
+ const execution = this.executions.get(deploymentId);
900
+ if (!execution) {
901
+ throw new Error('deployment not found');
902
+ }
903
+ logger.info({ deploymentId, serviceName }, 'updating deployment files');
904
+ // find volume for this service
905
+ const volumePrefix = `kova-${deploymentId}-${serviceName}-`;
906
+ let volumeName = execution.volumes.find(v => v && v.startsWith(volumePrefix));
907
+ // check docker if not in memory
908
+ if (!volumeName) {
909
+ logger.info({ deploymentId, serviceName, volumePrefix }, 'volume not in memory, querying docker');
910
+ try {
911
+ const volumes = await this.docker.listVolumes();
912
+ const matchingVolume = volumes.Volumes?.find(v => v.Name?.startsWith(volumePrefix));
913
+ if (matchingVolume) {
914
+ volumeName = matchingVolume.Name;
915
+ execution.volumes.push(volumeName);
916
+ logger.info({ deploymentId, serviceName, volumeName }, 'found existing volume in docker');
917
+ }
918
+ }
919
+ catch (err) {
920
+ logger.error({ err, deploymentId, serviceName }, 'failed to query docker volumes');
921
+ }
922
+ }
923
+ if (!volumeName) {
924
+ throw new Error(`no volume found for service ${serviceName} (expected prefix: ${volumePrefix})`);
925
+ }
926
+ logger.info({ deploymentId, serviceName, volumeName }, 'found volume for update');
927
+ // stop containers for this service
928
+ const containerId = execution.containers.get(serviceName);
929
+ if (containerId) {
930
+ try {
931
+ const container = this.docker.getContainer(containerId);
932
+ await container.stop({ t: 10 });
933
+ logger.info({ deploymentId, serviceName, containerId }, 'container stopped for file update');
934
+ }
935
+ catch (err) {
936
+ logger.warn({ err, containerId }, 'failed to stop container');
937
+ }
938
+ }
939
+ // backup volume to temp before clearing, so we can restore on failure
940
+ const backupVolumeName = `${volumeName}-backup-${Date.now()}`;
941
+ try {
942
+ await this.docker.createVolume({ Name: backupVolumeName });
943
+ await this.docker.run('alpine:latest', ['sh', '-c', 'cp -a /source/. /backup/'], process.stdout, {
944
+ HostConfig: {
945
+ Binds: [`${volumeName}:/source:ro`, `${backupVolumeName}:/backup`],
946
+ AutoRemove: true
947
+ }
948
+ });
949
+ logger.info({ deploymentId, serviceName, backupVolumeName }, 'volume backed up');
950
+ }
951
+ catch (err) {
952
+ logger.warn({ err, deploymentId }, 'volume backup failed, proceeding without safety net');
953
+ }
954
+ try {
955
+ // clear volume contents
956
+ await this.docker.run('alpine:latest', ['sh', '-c', 'rm -rf /dest/*'], process.stdout, {
957
+ HostConfig: {
958
+ Binds: [`${volumeName}:/dest`],
959
+ AutoRemove: true
960
+ }
961
+ });
962
+ logger.info({ deploymentId, serviceName, volumeName }, 'volume contents cleared');
963
+ // re-download and populate volume
964
+ await this.populateVolumeFromUploads(deploymentId, serviceName, volumeName);
965
+ // restart container
966
+ if (containerId) {
967
+ try {
968
+ const container = this.docker.getContainer(containerId);
969
+ await container.start();
970
+ logger.info({ deploymentId, serviceName, containerId }, 'container restarted after file update');
971
+ }
972
+ catch (err) {
973
+ logger.error({ err, containerId }, 'failed to restart container after file update');
974
+ throw err;
975
+ }
976
+ }
977
+ logger.info({ deploymentId, serviceName }, 'deployment files updated successfully');
978
+ }
979
+ catch (err) {
980
+ // restore from backup if download failed
981
+ try {
982
+ await this.docker.run('alpine:latest', ['sh', '-c', 'cp -a /backup/. /dest/'], process.stdout, {
983
+ HostConfig: {
984
+ Binds: [`${backupVolumeName}:/backup:ro`, `${volumeName}:/dest`],
985
+ AutoRemove: true
986
+ }
987
+ });
988
+ logger.info({ deploymentId, serviceName }, 'restored volume from backup after failed update');
989
+ }
990
+ catch (restoreErr) {
991
+ logger.error({ err: restoreErr }, 'failed to restore volume from backup');
992
+ }
993
+ // restart container even if update failed
994
+ if (containerId) {
995
+ try {
996
+ const container = this.docker.getContainer(containerId);
997
+ await container.start();
998
+ }
999
+ catch (restartErr) {
1000
+ logger.error({ err: restartErr }, 'failed to restart container after failed update');
1001
+ }
1002
+ }
1003
+ throw err;
1004
+ }
1005
+ finally {
1006
+ // clean up backup volume
1007
+ try {
1008
+ const backup = this.docker.getVolume(backupVolumeName);
1009
+ await backup.remove();
1010
+ }
1011
+ catch {
1012
+ // ignore cleanup errors
1013
+ }
1014
+ }
1015
+ }
1016
    // browse files inside a running container by executing `ls -laF` and
    // parsing its output. supports both GNU coreutils (ISO dates) and busybox
    // (`Mon DD HH:MM` dates) listing formats.
    // @param {string} deploymentId - deployment to browse
    // @param {string} serviceName - preferred service; falls back to the first container
    // @param {string} dirPath - directory inside the container (default '/')
    // @returns {{ path?: string, files: Array, error?: string }} on success `files`
    //          is sorted directories-first then alphabetically; on failure `error`
    //          is set and `files` is empty
    async browseFiles(deploymentId, serviceName, dirPath = '/') {
        const execution = this.executions.get(deploymentId);
        if (!execution) {
            return { error: 'deployment not found', files: [] };
        }
        // resolve container: requested service first, else first known container
        let containerId = execution.containers.get(serviceName);
        if (!containerId && execution.containers.size > 0) {
            containerId = execution.containers.entries().next().value[1];
        }
        if (!containerId) {
            return { error: 'no containers found', files: [] };
        }
        try {
            const container = this.docker.getContainer(containerId);
            // try gnu ls first, fall back to plain ls -la for busybox
            // NOTE(review): dirPath is passed as an argv element (no shell), so no
            // injection risk here, but it is not validated against traversal
            let lsCmd = ['ls', '-laF', dirPath];
            const exec = await container.exec({
                Cmd: lsCmd,
                AttachStdout: true,
                AttachStderr: true,
            });
            const stream = await exec.start({ hijack: true, stdin: false });
            const output = await this.collectExecOutput(stream);
            const files = [];
            // drop blank lines and the leading "total N" summary line
            const lines = output.stdout.split('\n').filter((l) => l.trim() && !l.startsWith('total'));
            for (const line of lines) {
                // try iso format first: -rw-r--r-- 1 root root 123 2026-02-11 09:54 file.txt
                let match = line.match(/^([drwxlstSTrw\-\.]+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s+(.+)$/);
                let dateStr = '';
                if (match) {
                    dateStr = `${match[6]} ${match[7]}`;
                }
                else {
                    // busybox format: -rw-r--r-- 1 root root 123 Feb 11 09:54 file.txt
                    match = line.match(/^([drwxlstSTrw\-\.]+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\w{3}\s+\d{1,2}\s+[\d:]+)\s+(.+)$/);
                    if (match) {
                        dateStr = match[6];
                        // shift: busybox match has 7 groups (date is one field, name is match[7]);
                        // rebuild a 9-element array so match[8] is the name in both branches
                        match = [match[0], match[1], match[2], match[3], match[4], match[5], match[6], '', match[7]];
                    }
                }
                // unparseable line (e.g. error text) — skip it
                if (!match)
                    continue;
                const permissions = match[1];
                const owner = match[3];
                const group = match[4];
                const size = match[5];
                const rawName = match[8];
                // first permission char encodes the entry type
                const isDir = permissions.startsWith('d');
                const isLink = permissions.startsWith('l');
                let name = rawName;
                let linkTarget = '';
                // remove trailing / or @ or * from name (added by -F flag)
                if (isDir && name.endsWith('/'))
                    name = name.slice(0, -1);
                if (name.endsWith('*'))
                    name = name.slice(0, -1);
                if (name.endsWith('@'))
                    name = name.slice(0, -1);
                // handle symlinks: name -> target
                if (isLink && name.includes(' -> ')) {
                    const parts = name.split(' -> ');
                    name = parts[0];
                    linkTarget = parts[1];
                }
                // skip . and ..
                if (name === '.' || name === '..')
                    continue;
                files.push({
                    name,
                    path: dirPath === '/' ? `/${name}` : `${dirPath}/${name}`,
                    type: isDir ? 'directory' : isLink ? 'link' : 'file',
                    size: parseInt(size),
                    permissions,
                    owner,
                    group,
                    modified: dateStr,
                    linkTarget: linkTarget || undefined
                });
            }
            // sort: directories first, then alphabetical
            files.sort((a, b) => {
                if (a.type === 'directory' && b.type !== 'directory')
                    return -1;
                if (a.type !== 'directory' && b.type === 'directory')
                    return 1;
                return a.name.localeCompare(b.name);
            });
            return { path: dirPath, files };
        }
        catch (err) {
            logger.error({ err, deploymentId, dirPath }, 'failed to browse files');
            return { error: err.message, files: [] };
        }
    }
1112
+ // read a file from inside a container
1113
+ async readContainerFile(deploymentId, serviceName, filePath) {
1114
+ const execution = this.executions.get(deploymentId);
1115
+ if (!execution) {
1116
+ return { error: 'deployment not found' };
1117
+ }
1118
+ let containerId = execution.containers.get(serviceName);
1119
+ if (!containerId && execution.containers.size > 0) {
1120
+ containerId = execution.containers.entries().next().value[1];
1121
+ }
1122
+ if (!containerId) {
1123
+ return { error: 'no containers found' };
1124
+ }
1125
+ try {
1126
+ const container = this.docker.getContainer(containerId);
1127
+ // check file size first (reject files > 5MB)
1128
+ const statExec = await container.exec({
1129
+ Cmd: ['stat', '-c', '%s', filePath],
1130
+ AttachStdout: true,
1131
+ AttachStderr: true,
1132
+ });
1133
+ const statStream = await statExec.start({ hijack: true, stdin: false });
1134
+ const statOutput = await this.collectExecOutput(statStream);
1135
+ if (statOutput.stderr.includes('No such file')) {
1136
+ return { error: 'file not found' };
1137
+ }
1138
+ const fileSize = parseInt(statOutput.stdout.trim());
1139
+ if (fileSize > 5 * 1024 * 1024) {
1140
+ return { error: 'file too large (max 5MB)', size: fileSize };
1141
+ }
1142
+ // read the file content using base64 to handle binary safely
1143
+ const exec = await container.exec({
1144
+ Cmd: ['base64', filePath],
1145
+ AttachStdout: true,
1146
+ AttachStderr: true,
1147
+ });
1148
+ const stream = await exec.start({ hijack: true, stdin: false });
1149
+ const output = await this.collectExecOutput(stream);
1150
+ if (output.stderr && output.stderr.includes('No such file')) {
1151
+ return { error: 'file not found' };
1152
+ }
1153
+ const content = Buffer.from(output.stdout.replace(/\s/g, ''), 'base64').toString('utf8');
1154
+ // detect if binary (has null bytes or high ratio of non-printable chars)
1155
+ const nonPrintable = content.split('').filter(c => {
1156
+ const code = c.charCodeAt(0);
1157
+ return code < 32 && code !== 9 && code !== 10 && code !== 13;
1158
+ }).length;
1159
+ const isBinary = nonPrintable > content.length * 0.1;
1160
+ return {
1161
+ path: filePath,
1162
+ size: fileSize,
1163
+ content: isBinary ? undefined : content,
1164
+ binary: isBinary,
1165
+ encoding: isBinary ? 'base64' : 'utf8',
1166
+ rawBase64: isBinary ? output.stdout.replace(/\s/g, '') : undefined
1167
+ };
1168
+ }
1169
+ catch (err) {
1170
+ logger.error({ err, deploymentId, filePath }, 'failed to read container file');
1171
+ return { error: err.message };
1172
+ }
1173
+ }
1174
+ // upload a file into a running container
1175
+ async uploadFileToContainer(deploymentId, serviceName, filePath, content, encoding = 'utf8') {
1176
+ const execution = this.executions.get(deploymentId);
1177
+ if (!execution) {
1178
+ return { success: false, error: 'deployment not found' };
1179
+ }
1180
+ let containerId = execution.containers.get(serviceName);
1181
+ if (!containerId && execution.containers.size > 0) {
1182
+ containerId = execution.containers.entries().next().value[1];
1183
+ }
1184
+ if (!containerId) {
1185
+ return { success: false, error: 'no containers found' };
1186
+ }
1187
+ // validate path (prevent path traversal)
1188
+ if (filePath.includes('..') || !filePath.startsWith('/')) {
1189
+ return { success: false, error: 'invalid file path' };
1190
+ }
1191
+ try {
1192
+ const container = this.docker.getContainer(containerId);
1193
+ // ensure parent directory exists
1194
+ const parentDir = filePath.substring(0, filePath.lastIndexOf('/')) || '/';
1195
+ const mkdirExec = await container.exec({
1196
+ Cmd: ['mkdir', '-p', parentDir],
1197
+ AttachStdout: true,
1198
+ AttachStderr: true,
1199
+ });
1200
+ const mkdirStream = await mkdirExec.start({ hijack: true, stdin: false });
1201
+ await this.collectExecOutput(mkdirStream);
1202
+ // write file using base64 decode via shell
1203
+ const b64Content = encoding === 'base64' ? content : Buffer.from(content, 'utf8').toString('base64');
1204
+ const exec = await container.exec({
1205
+ Cmd: ['sh', '-c', `echo '${b64Content}' | base64 -d > ${filePath}`],
1206
+ AttachStdout: true,
1207
+ AttachStderr: true,
1208
+ });
1209
+ const stream = await exec.start({ hijack: true, stdin: false });
1210
+ const output = await this.collectExecOutput(stream);
1211
+ if (output.stderr && !output.stderr.includes('warning')) {
1212
+ return { success: false, error: output.stderr.trim() };
1213
+ }
1214
+ logger.info({ deploymentId, filePath }, 'file uploaded to container');
1215
+ return { success: true, path: filePath };
1216
+ }
1217
+ catch (err) {
1218
+ logger.error({ err, deploymentId, filePath }, 'failed to upload file to container');
1219
+ return { success: false, error: err.message };
1220
+ }
1221
+ }
1222
+ // collect output from a docker exec stream
1223
+ collectExecOutput(stream) {
1224
+ return new Promise((resolve) => {
1225
+ let stdout = '';
1226
+ let stderr = '';
1227
+ const timeout = setTimeout(() => {
1228
+ resolve({ stdout, stderr: stderr || 'command timed out' });
1229
+ }, 10000);
1230
+ stream.on('data', (chunk) => {
1231
+ // docker multiplexes: first 8 bytes are header
1232
+ // byte 0: stream type (1=stdout, 2=stderr)
1233
+ // bytes 4-7: payload size
1234
+ const data = chunk.toString('utf8');
1235
+ stdout += data;
1236
+ });
1237
+ stream.on('end', () => {
1238
+ clearTimeout(timeout);
1239
+ // strip docker header bytes if present
1240
+ const clean = stdout.replace(/[\x00-\x08]/g, '');
1241
+ resolve({ stdout: clean, stderr });
1242
+ });
1243
+ stream.on('error', (err) => {
1244
+ clearTimeout(timeout);
1245
+ resolve({ stdout, stderr: err.message });
1246
+ });
1247
+ });
1248
+ }
1249
    // active interactive shell sessions, keyed by sessionId ->
    // { exec, stream, deploymentId, serviceName, debugContainer, debugImageTag }
    // (shape set in startShellSession; cleaned up in cleanupShellSession)
    shellSessions = new Map();
1251
    // start interactive shell session in container.
    // strategy: exec /bin/sh directly into the running container. if the
    // container is stopped, try to start it; if it exits again within ~1.5s
    // (short-lived images), commit its filesystem to a kova-debug image and
    // run a keep-alive debug container from that snapshot instead.
    // returns { success: true } or { success: false, error: string }
    // @param {string} sessionId - caller-chosen session key (sanitized for container names)
    // @param {string} deploymentId
    // @param {string} serviceName - preferred service; falls back to the first container
    // @param {(output: string) => void} onOutput - called with raw shell output chunks
    async startShellSession(sessionId, deploymentId, serviceName, onOutput) {
        const execution = this.executions.get(deploymentId);
        if (!execution) {
            logger.warn({ deploymentId, sessionId }, 'shell: deployment not found');
            return { success: false, error: 'deployment not found on this provider' };
        }
        // try requested service first, then fall back to first available service
        let containerId = execution.containers.get(serviceName);
        let actualServiceName = serviceName;
        if (!containerId && execution.containers.size > 0) {
            // fall back to first available service
            const firstEntry = execution.containers.entries().next().value;
            if (firstEntry) {
                actualServiceName = firstEntry[0];
                containerId = firstEntry[1];
                logger.info({ deploymentId, requestedService: serviceName, actualService: actualServiceName }, 'shell: using fallback service');
            }
        }
        if (!containerId) {
            logger.warn({ deploymentId, serviceName, sessionId, availableServices: Array.from(execution.containers.keys()) }, 'shell: no services found');
            return { success: false, error: 'no containers found for this service' };
        }
        try {
            const container = this.docker.getContainer(containerId);
            const info = await container.inspect();
            // execContainer is the container we will exec into; may be swapped
            // for a debug container below
            let execContainer = container;
            let debugContainer = null;
            if (info.State.Running) {
                // container is running, exec directly into it
                logger.info({ deploymentId, containerId }, 'shell: container running, attaching');
            }
            else {
                // container is stopped - try to start it
                logger.info({ deploymentId, containerId, state: info.State.Status }, 'shell: container not running, starting it');
                try {
                    await container.start();
                }
                catch (startErr) {
                    // ignore "already started" race
                    if (!startErr.message?.includes('already started')) {
                        logger.warn({ err: startErr, containerId }, 'shell: failed to start container');
                    }
                }
                // give the container a moment to either settle or exit
                await new Promise(r => setTimeout(r, 1500));
                // check if it actually stayed running
                const recheck = await container.inspect();
                if (!recheck.State.Running) {
                    // container exits immediately (e.g. node:20-alpine with no long-running process)
                    // commit the stopped container to preserve its filesystem, then run with sleep
                    logger.info({ deploymentId, containerId, image: info.Config.Image }, 'shell: container exits immediately, creating debug container from snapshot');
                    const debugTag = `kova-debug:${containerId.slice(0, 12)}`;
                    // container names only allow [a-zA-Z0-9_.-]; sanitize the session id
                    const debugName = `kova-debug-${sessionId.replace(/[^a-zA-Z0-9-]/g, '-').slice(0, 60)}`;
                    // snapshot the stopped container's filesystem
                    const commitResult = await container.commit({
                        repo: 'kova-debug',
                        tag: containerId.slice(0, 12),
                        comment: 'debug shell snapshot'
                    });
                    logger.info({ debugTag, imageId: commitResult.Id }, 'shell: committed container snapshot');
                    // keep-alive command so the debug container stays up until
                    // the session ends; exits cleanly on TERM/INT
                    debugContainer = await this.docker.createContainer({
                        name: debugName,
                        Image: debugTag,
                        Cmd: ['sh', '-c', 'trap "exit 0" TERM INT; while true; do sleep 1; done'],
                        Tty: true,
                        OpenStdin: true,
                        WorkingDir: info.Config.WorkingDir || '/',
                        Env: info.Config.Env || [],
                        HostConfig: {
                            // mirror the original container's network and bind mounts
                            NetworkMode: info.HostConfig.NetworkMode || 'bridge',
                            Binds: info.HostConfig.Binds || [],
                            AutoRemove: true
                        },
                        // NOTE(review): labels use the requested serviceName, not
                        // actualServiceName after fallback — confirm intended
                        Labels: {
                            'kova.deployment': deploymentId,
                            'kova.service': serviceName,
                            'kova.debug-shell': 'true'
                        }
                    });
                    await debugContainer.start();
                    execContainer = debugContainer;
                    logger.info({ deploymentId, debugName }, 'shell: debug container started');
                }
            }
            // create exec instance for interactive shell
            const exec = await execContainer.exec({
                Cmd: ['/bin/sh'],
                AttachStdin: true,
                AttachStdout: true,
                AttachStderr: true,
                Tty: true
            });
            // start the exec and get a bidirectional stream
            const stream = await exec.start({
                hijack: true,
                stdin: true,
                Tty: true
            });
            // store session (including debug container + image ref for cleanup)
            this.shellSessions.set(sessionId, {
                exec,
                stream,
                deploymentId,
                serviceName,
                debugContainer,
                debugImageTag: debugContainer ? `kova-debug:${containerId.slice(0, 12)}` : undefined
            });
            // forward output to callback (tty stream is raw, no demuxing needed)
            stream.on('data', (chunk) => {
                const output = chunk.toString('utf8');
                onOutput(output);
            });
            stream.on('end', () => {
                logger.info({ sessionId }, 'shell session stream ended');
                this.cleanupShellSession(sessionId);
                this.emit('shell-closed', { sessionId });
            });
            stream.on('error', (err) => {
                logger.error({ err, sessionId }, 'shell session stream error');
                this.cleanupShellSession(sessionId);
            });
            logger.info({ sessionId, deploymentId, serviceName, containerId, debug: !!debugContainer }, 'shell session started');
            return { success: true };
        }
        catch (err) {
            logger.error({ err, sessionId, deploymentId }, 'failed to start shell session');
            // map common docker error strings to friendlier messages
            const msg = err.message || 'failed to start shell';
            if (msg.includes('is not running')) {
                return { success: false, error: 'container is not running - it may have crashed' };
            }
            if (msg.includes('No such image')) {
                return { success: false, error: 'container image not available locally' };
            }
            return { success: false, error: msg };
        }
    }
1388
+ // send input to shell session
1389
+ sendShellInput(sessionId, input) {
1390
+ const session = this.shellSessions.get(sessionId);
1391
+ if (!session) {
1392
+ logger.warn({ sessionId }, 'shell input: session not found');
1393
+ return false;
1394
+ }
1395
+ try {
1396
+ session.stream.write(input);
1397
+ return true;
1398
+ }
1399
+ catch (err) {
1400
+ logger.error({ err, sessionId }, 'failed to send shell input');
1401
+ return false;
1402
+ }
1403
+ }
1404
+ // resize shell terminal
1405
+ resizeShell(sessionId, cols, rows) {
1406
+ const session = this.shellSessions.get(sessionId);
1407
+ if (!session) {
1408
+ return false;
1409
+ }
1410
+ try {
1411
+ // resize the tty
1412
+ session.exec.resize({ h: rows, w: cols });
1413
+ return true;
1414
+ }
1415
+ catch (err) {
1416
+ logger.debug({ err, sessionId }, 'failed to resize shell');
1417
+ return false;
1418
+ }
1419
+ }
1420
+ // clean up a shell session and its debug container/image if any
1421
+ cleanupShellSession(sessionId) {
1422
+ const session = this.shellSessions.get(sessionId);
1423
+ if (!session)
1424
+ return;
1425
+ // stop debug container (AutoRemove will delete it)
1426
+ if (session.debugContainer) {
1427
+ session.debugContainer.stop({ t: 2 }).catch(() => {
1428
+ // ignore - may already be stopped
1429
+ });
1430
+ }
1431
+ // remove the committed snapshot image
1432
+ if (session.debugImageTag) {
1433
+ const img = this.docker.getImage(session.debugImageTag);
1434
+ img.remove({ force: true }).catch(() => {
1435
+ // ignore - best effort cleanup
1436
+ });
1437
+ }
1438
+ this.shellSessions.delete(sessionId);
1439
+ }
1440
+ // restart all containers in a deployment (stop then start)
1441
+ async restartDeployment(deploymentId) {
1442
+ const execution = this.executions.get(deploymentId);
1443
+ if (!execution) {
1444
+ throw new Error('deployment not found');
1445
+ }
1446
+ const restarted = [];
1447
+ for (const [serviceName, containerId] of execution.containers.entries()) {
1448
+ try {
1449
+ const container = this.docker.getContainer(containerId);
1450
+ await container.stop({ t: 10 });
1451
+ await container.start();
1452
+ restarted.push(serviceName);
1453
+ logger.info({ deploymentId, serviceName, containerId }, 'container restarted');
1454
+ }
1455
+ catch (err) {
1456
+ logger.error({ err, deploymentId, serviceName, containerId }, 'failed to restart container');
1457
+ }
1458
+ }
1459
+ return restarted;
1460
+ }
1461
+ // create a snapshot of a service's volume
1462
+ async createVolumeSnapshot(deploymentId, serviceName, snapshotId) {
1463
+ const execution = this.executions.get(deploymentId);
1464
+ if (!execution) {
1465
+ throw new Error('deployment not found');
1466
+ }
1467
+ // find the volume for this service
1468
+ const volumePrefix = `kova-${deploymentId}-${serviceName}-`;
1469
+ let volumeName = execution.volumes.find(v => v.startsWith(volumePrefix));
1470
+ if (!volumeName) {
1471
+ // check docker directly
1472
+ const volumes = await this.docker.listVolumes();
1473
+ const match = volumes.Volumes?.find(v => v.Name?.startsWith(volumePrefix));
1474
+ if (match) {
1475
+ volumeName = match.Name;
1476
+ }
1477
+ }
1478
+ if (!volumeName) {
1479
+ throw new Error(`no volume found for service ${serviceName}`);
1480
+ }
1481
+ const snapshotDir = '/var/kova/snapshots';
1482
+ const snapshotKey = `${snapshotId}.tar.gz`;
1483
+ // ensure snapshot directory exists on host
1484
+ const fs = await import('fs');
1485
+ if (!fs.existsSync(snapshotDir)) {
1486
+ fs.mkdirSync(snapshotDir, { recursive: true });
1487
+ }
1488
+ // create snapshot using a temporary alpine container
1489
+ await this.docker.run('alpine:latest', ['tar', 'czf', `/snapshots/${snapshotKey}`, '-C', '/data', '.'], process.stdout, {
1490
+ HostConfig: {
1491
+ Binds: [
1492
+ `${volumeName}:/data:ro`,
1493
+ `${snapshotDir}:/snapshots`
1494
+ ],
1495
+ AutoRemove: true
1496
+ }
1497
+ });
1498
+ // get snapshot file size
1499
+ const snapshotPath = `${snapshotDir}/${snapshotKey}`;
1500
+ const stat = fs.statSync(snapshotPath);
1501
+ logger.info({ deploymentId, serviceName, volumeName, snapshotId, sizeBytes: stat.size }, 'volume snapshot created');
1502
+ return {
1503
+ volumeName,
1504
+ sizeBytes: stat.size,
1505
+ snapshotKey
1506
+ };
1507
+ }
1508
+ // restore a service's volume from a snapshot
1509
+ async restoreVolumeSnapshot(deploymentId, serviceName, snapshotKey) {
1510
+ const execution = this.executions.get(deploymentId);
1511
+ if (!execution) {
1512
+ throw new Error('deployment not found');
1513
+ }
1514
+ // find the volume for this service
1515
+ const volumePrefix = `kova-${deploymentId}-${serviceName}-`;
1516
+ let volumeName = execution.volumes.find(v => v.startsWith(volumePrefix));
1517
+ if (!volumeName) {
1518
+ const volumes = await this.docker.listVolumes();
1519
+ const match = volumes.Volumes?.find(v => v.Name?.startsWith(volumePrefix));
1520
+ if (match) {
1521
+ volumeName = match.Name;
1522
+ }
1523
+ }
1524
+ if (!volumeName) {
1525
+ throw new Error(`no volume found for service ${serviceName}`);
1526
+ }
1527
+ const snapshotDir = '/var/kova/snapshots';
1528
+ // clear existing volume data
1529
+ await this.docker.run('alpine:latest', ['sh', '-c', 'rm -rf /data/*'], process.stdout, {
1530
+ HostConfig: {
1531
+ Binds: [`${volumeName}:/data`],
1532
+ AutoRemove: true
1533
+ }
1534
+ });
1535
+ // restore from snapshot
1536
+ await this.docker.run('alpine:latest', ['tar', 'xzf', `/snapshots/${snapshotKey}`, '-C', '/data'], process.stdout, {
1537
+ HostConfig: {
1538
+ Binds: [
1539
+ `${volumeName}:/data`,
1540
+ `${snapshotDir}:/snapshots:ro`
1541
+ ],
1542
+ AutoRemove: true
1543
+ }
1544
+ });
1545
+ logger.info({ deploymentId, serviceName, volumeName, snapshotKey }, 'volume snapshot restored');
1546
+ }
1547
+ // close shell session
1548
+ closeShellSession(sessionId) {
1549
+ const session = this.shellSessions.get(sessionId);
1550
+ if (!session) {
1551
+ return;
1552
+ }
1553
+ try {
1554
+ session.stream.end();
1555
+ }
1556
+ catch (err) {
1557
+ // ignore
1558
+ }
1559
+ this.cleanupShellSession(sessionId);
1560
+ logger.info({ sessionId }, 'shell session closed');
1561
+ }
1562
+ }