kailash 0.8.4__py3-none-any.whl → 0.8.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. kailash/__init__.py +1 -7
  2. kailash/cli/__init__.py +11 -1
  3. kailash/cli/validation_audit.py +570 -0
  4. kailash/core/actors/supervisor.py +1 -1
  5. kailash/core/resilience/circuit_breaker.py +71 -1
  6. kailash/core/resilience/health_monitor.py +172 -0
  7. kailash/edge/compliance.py +33 -0
  8. kailash/edge/consistency.py +609 -0
  9. kailash/edge/coordination/__init__.py +30 -0
  10. kailash/edge/coordination/global_ordering.py +355 -0
  11. kailash/edge/coordination/leader_election.py +217 -0
  12. kailash/edge/coordination/partition_detector.py +296 -0
  13. kailash/edge/coordination/raft.py +485 -0
  14. kailash/edge/discovery.py +63 -1
  15. kailash/edge/migration/__init__.py +19 -0
  16. kailash/edge/migration/edge_migrator.py +832 -0
  17. kailash/edge/monitoring/__init__.py +21 -0
  18. kailash/edge/monitoring/edge_monitor.py +736 -0
  19. kailash/edge/prediction/__init__.py +10 -0
  20. kailash/edge/prediction/predictive_warmer.py +591 -0
  21. kailash/edge/resource/__init__.py +102 -0
  22. kailash/edge/resource/cloud_integration.py +796 -0
  23. kailash/edge/resource/cost_optimizer.py +949 -0
  24. kailash/edge/resource/docker_integration.py +919 -0
  25. kailash/edge/resource/kubernetes_integration.py +893 -0
  26. kailash/edge/resource/platform_integration.py +913 -0
  27. kailash/edge/resource/predictive_scaler.py +959 -0
  28. kailash/edge/resource/resource_analyzer.py +824 -0
  29. kailash/edge/resource/resource_pools.py +610 -0
  30. kailash/integrations/dataflow_edge.py +261 -0
  31. kailash/mcp_server/registry_integration.py +1 -1
  32. kailash/monitoring/__init__.py +18 -0
  33. kailash/monitoring/alerts.py +646 -0
  34. kailash/monitoring/metrics.py +677 -0
  35. kailash/nodes/__init__.py +2 -0
  36. kailash/nodes/ai/semantic_memory.py +2 -2
  37. kailash/nodes/base.py +545 -0
  38. kailash/nodes/edge/__init__.py +36 -0
  39. kailash/nodes/edge/base.py +240 -0
  40. kailash/nodes/edge/cloud_node.py +710 -0
  41. kailash/nodes/edge/coordination.py +239 -0
  42. kailash/nodes/edge/docker_node.py +825 -0
  43. kailash/nodes/edge/edge_data.py +582 -0
  44. kailash/nodes/edge/edge_migration_node.py +392 -0
  45. kailash/nodes/edge/edge_monitoring_node.py +421 -0
  46. kailash/nodes/edge/edge_state.py +673 -0
  47. kailash/nodes/edge/edge_warming_node.py +393 -0
  48. kailash/nodes/edge/kubernetes_node.py +652 -0
  49. kailash/nodes/edge/platform_node.py +766 -0
  50. kailash/nodes/edge/resource_analyzer_node.py +378 -0
  51. kailash/nodes/edge/resource_optimizer_node.py +501 -0
  52. kailash/nodes/edge/resource_scaler_node.py +397 -0
  53. kailash/nodes/ports.py +676 -0
  54. kailash/runtime/local.py +344 -1
  55. kailash/runtime/validation/__init__.py +20 -0
  56. kailash/runtime/validation/connection_context.py +119 -0
  57. kailash/runtime/validation/enhanced_error_formatter.py +202 -0
  58. kailash/runtime/validation/error_categorizer.py +164 -0
  59. kailash/runtime/validation/metrics.py +380 -0
  60. kailash/runtime/validation/performance.py +615 -0
  61. kailash/runtime/validation/suggestion_engine.py +212 -0
  62. kailash/testing/fixtures.py +2 -2
  63. kailash/workflow/builder.py +230 -4
  64. kailash/workflow/contracts.py +418 -0
  65. kailash/workflow/edge_infrastructure.py +369 -0
  66. kailash/workflow/migration.py +3 -3
  67. kailash/workflow/type_inference.py +669 -0
  68. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/METADATA +43 -27
  69. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/RECORD +73 -27
  70. kailash/nexus/__init__.py +0 -21
  71. kailash/nexus/cli/__init__.py +0 -5
  72. kailash/nexus/cli/__main__.py +0 -6
  73. kailash/nexus/cli/main.py +0 -176
  74. kailash/nexus/factory.py +0 -413
  75. kailash/nexus/gateway.py +0 -545
  76. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/WHEEL +0 -0
  77. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/entry_points.txt +0 -0
  78. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/licenses/LICENSE +0 -0
  79. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/top_level.txt +0 -0
kailash/edge/resource/docker_integration.py (new file)
@@ -0,0 +1,919 @@
+"""Docker integration for edge resource management."""
+
+import asyncio
+import base64
+import json
+from dataclasses import asdict, dataclass
+from datetime import datetime, timedelta
+from enum import Enum
+from typing import Any, Dict, List, Optional, Union
+
+try:
+    from docker.types import EndpointSpec, LogConfig, RestartPolicy, UpdateConfig
+
+    import docker
+
+    DOCKER_AVAILABLE = True
+except ImportError:
+    DOCKER_AVAILABLE = False
+
+
+class ContainerState(Enum):
+    """Container states."""
+
+    CREATED = "created"
+    RUNNING = "running"
+    PAUSED = "paused"
+    RESTARTING = "restarting"
+    REMOVING = "removing"
+    EXITED = "exited"
+    DEAD = "dead"
+
+
+class RestartPolicyType(Enum):
+    """Container restart policies."""
+
+    NONE = "no"
+    ALWAYS = "always"
+    UNLESS_STOPPED = "unless-stopped"
+    ON_FAILURE = "on-failure"
+
+
+class NetworkMode(Enum):
+    """Docker network modes."""
+
+    BRIDGE = "bridge"
+    HOST = "host"
+    NONE = "none"
+    CONTAINER = "container"
+    CUSTOM = "custom"
+
+
+@dataclass
+class ContainerSpec:
+    """Docker container specification."""
+
+    name: str
+    image: str
+    command: Optional[List[str]] = None
+    environment: Optional[Dict[str, str]] = None
+    ports: Optional[Dict[str, int]] = None  # container_port -> host_port
+    volumes: Optional[Dict[str, str]] = None  # host_path -> container_path
+    restart_policy: RestartPolicyType = RestartPolicyType.UNLESS_STOPPED
+    memory_limit: Optional[str] = None  # e.g., "512m", "1g"
+    cpu_limit: Optional[float] = None  # CPU cores
+    network_mode: NetworkMode = NetworkMode.BRIDGE
+    labels: Optional[Dict[str, str]] = None
+    edge_node: Optional[str] = None
+    healthcheck: Optional[Dict[str, Any]] = None
+
+    def __post_init__(self):
+        if self.environment is None:
+            self.environment = {}
+        if self.ports is None:
+            self.ports = {}
+        if self.volumes is None:
+            self.volumes = {}
+        if self.labels is None:
+            self.labels = {}
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary."""
+        data = asdict(self)
+        data["restart_policy"] = self.restart_policy.value
+        data["network_mode"] = self.network_mode.value
+        return data
+
+    def to_docker_config(self) -> Dict[str, Any]:
+        """Convert to Docker API configuration."""
+        config = {
+            "image": self.image,
+            "name": self.name,
+            "environment": list(f"{k}={v}" for k, v in self.environment.items()),
+            "labels": self.labels.copy(),
+        }
+
+        # Add edge node label if specified
+        if self.edge_node:
+            config["labels"]["edge-node"] = self.edge_node
+
+        # Command
+        if self.command:
+            config["command"] = self.command
+
+        # Port bindings
+        if self.ports:
+            config["ports"] = self.ports
+            config["host_config"] = config.get("host_config", {})
+            config["host_config"]["port_bindings"] = {
+                f"{container_port}/tcp": host_port
+                for container_port, host_port in self.ports.items()
+            }
+
+        # Volume bindings
+        if self.volumes:
+            config["host_config"] = config.get("host_config", {})
+            config["host_config"]["binds"] = [
+                f"{host_path}:{container_path}"
+                for host_path, container_path in self.volumes.items()
+            ]
+
+        # Restart policy
+        if self.restart_policy != RestartPolicyType.NONE:
+            config["host_config"] = config.get("host_config", {})
+            config["host_config"]["restart_policy"] = {
+                "Name": self.restart_policy.value
+            }
+
+        # Resource limits
+        if self.memory_limit or self.cpu_limit:
+            config["host_config"] = config.get("host_config", {})
+            if self.memory_limit:
+                config["host_config"]["mem_limit"] = self.memory_limit
+            if self.cpu_limit:
+                config["host_config"]["nano_cpus"] = int(self.cpu_limit * 1e9)
+
+        # Network mode
+        if self.network_mode != NetworkMode.BRIDGE:
+            config["host_config"] = config.get("host_config", {})
+            config["host_config"]["network_mode"] = self.network_mode.value
+
+        # Health check
+        if self.healthcheck:
+            config["healthcheck"] = self.healthcheck
+
+        return config
+
+
+@dataclass
+class ServiceSpec:
+    """Docker Swarm service specification."""
+
+    name: str
+    image: str
+    replicas: int = 1
+    command: Optional[List[str]] = None
+    environment: Optional[Dict[str, str]] = None
+    ports: Optional[List[Dict[str, Any]]] = None
+    volumes: Optional[List[Dict[str, str]]] = None
+    constraints: Optional[List[str]] = None
+    placement_preferences: Optional[List[Dict[str, Any]]] = None
+    restart_policy: Optional[Dict[str, Any]] = None
+    update_config: Optional[Dict[str, Any]] = None
+    rollback_config: Optional[Dict[str, Any]] = None
+    labels: Optional[Dict[str, str]] = None
+    edge_node: Optional[str] = None
+
+    def __post_init__(self):
+        if self.environment is None:
+            self.environment = {}
+        if self.ports is None:
+            self.ports = []
+        if self.volumes is None:
+            self.volumes = []
+        if self.constraints is None:
+            self.constraints = []
+        if self.placement_preferences is None:
+            self.placement_preferences = []
+        if self.labels is None:
+            self.labels = {}
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary."""
+        return asdict(self)
+
+    def to_docker_service_spec(self) -> Dict[str, Any]:
+        """Convert to Docker service specification."""
+        task_template = {
+            "ContainerSpec": {
+                "Image": self.image,
+                "Env": [f"{k}={v}" for k, v in self.environment.items()],
+                "Labels": self.labels.copy(),
+            },
+            "Placement": {
+                "Constraints": self.constraints.copy(),
+                "Preferences": self.placement_preferences.copy(),
+            },
+        }
+
+        # Add edge node constraint if specified
+        if self.edge_node:
+            task_template["Placement"]["Constraints"].append(
+                f"node.labels.edge-node=={self.edge_node}"
+            )
+
+        # Command
+        if self.command:
+            task_template["ContainerSpec"]["Command"] = self.command
+
+        # Restart policy
+        if self.restart_policy:
+            task_template["RestartPolicy"] = self.restart_policy
+
+        spec = {
+            "Name": self.name,
+            "TaskTemplate": task_template,
+            "Mode": {"Replicated": {"Replicas": self.replicas}},
+            "Labels": self.labels.copy(),
+        }
+
+        # Update configuration
+        if self.update_config:
+            spec["UpdateConfig"] = self.update_config
+
+        # Rollback configuration
+        if self.rollback_config:
+            spec["RollbackConfig"] = self.rollback_config
+
+        # Endpoint spec for ports
+        if self.ports:
+            spec["EndpointSpec"] = {"Ports": self.ports}
+
+        return spec
+
+
+@dataclass
+class ContainerMetrics:
+    """Container resource metrics."""
+
+    container_id: str
+    container_name: str
+    timestamp: datetime
+    cpu_usage_percent: float
+    memory_usage_bytes: int
+    memory_limit_bytes: int
+    network_rx_bytes: int
+    network_tx_bytes: int
+    block_read_bytes: int
+    block_write_bytes: int
+
+    @property
+    def memory_usage_percent(self) -> float:
+        """Calculate memory usage percentage."""
+        if self.memory_limit_bytes > 0:
+            return (self.memory_usage_bytes / self.memory_limit_bytes) * 100
+        return 0.0
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary."""
+        data = asdict(self)
+        data["timestamp"] = self.timestamp.isoformat()
+        data["memory_usage_percent"] = self.memory_usage_percent
+        return data
+
+
+class DockerIntegration:
+    """Docker integration for edge resource management."""
+
+    def __init__(
+        self,
+        docker_host: Optional[str] = None,
+        api_version: str = "auto",
+        timeout: int = 60,
+    ):
+        """Initialize Docker integration.
+
+        Args:
+            docker_host: Docker daemon socket (default: system default)
+            api_version: Docker API version
+            timeout: API timeout in seconds
+        """
+        if not DOCKER_AVAILABLE:
+            raise ImportError(
+                "Docker client not available. Install with: pip install docker"
+            )
+
+        self.docker_host = docker_host
+        self.api_version = api_version
+        self.timeout = timeout
+
+        # Docker clients
+        self.docker_client: Optional[docker.DockerClient] = None
+        self.swarm_enabled = False
+
+        # Container tracking
+        self.containers: Dict[str, ContainerSpec] = {}
+        self.services: Dict[str, ServiceSpec] = {}
+        self.container_metrics: Dict[str, ContainerMetrics] = {}
+
+        # Background tasks
+        self._monitoring_task: Optional[asyncio.Task] = None
+        self._metrics_task: Optional[asyncio.Task] = None
+
+        # Configuration
+        self.monitoring_interval = 30  # seconds
+        self.metrics_interval = 10  # seconds
+        self.auto_pull_images = True
+
+    async def initialize(self) -> None:
+        """Initialize Docker client."""
+        try:
+            if self.docker_host:
+                self.docker_client = docker.DockerClient(
+                    base_url=self.docker_host,
+                    version=self.api_version,
+                    timeout=self.timeout,
+                )
+            else:
+                self.docker_client = docker.from_env(
+                    version=self.api_version, timeout=self.timeout
+                )
+
+            # Test connection
+            await asyncio.to_thread(self.docker_client.ping)
+
+            # Check if Swarm is enabled
+            try:
+                swarm_info = await asyncio.to_thread(self.docker_client.swarm.attrs)
+                self.swarm_enabled = True
+            except:
+                self.swarm_enabled = False
+
+        except Exception as e:
+            raise RuntimeError(f"Failed to initialize Docker client: {e}")
+
+    async def create_container(self, container_spec: ContainerSpec) -> Dict[str, Any]:
+        """Create Docker container.
+
+        Args:
+            container_spec: Container specification
+
+        Returns:
+            Creation result
+        """
+        if not self.docker_client:
+            await self.initialize()
+
+        try:
+            # Pull image if auto-pull is enabled
+            if self.auto_pull_images:
+                try:
+                    await asyncio.to_thread(
+                        self.docker_client.images.pull, container_spec.image
+                    )
+                except Exception as e:
+                    # Continue if image already exists locally
+                    pass
+
+            # Create container
+            docker_config = container_spec.to_docker_config()
+            container = await asyncio.to_thread(
+                self.docker_client.containers.create, **docker_config
+            )
+
+            # Store container spec
+            self.containers[container.id] = container_spec
+
+            return {
+                "status": "created",
+                "container_id": container.id,
+                "container_name": container_spec.name,
+                "image": container_spec.image,
+                "created_at": datetime.now().isoformat(),
+            }
+
+        except Exception as e:
+            return {"status": "error", "error": f"Failed to create container: {e}"}
+
+    async def start_container(self, container_id: str) -> Dict[str, Any]:
+        """Start Docker container.
+
+        Args:
+            container_id: Container ID or name
+
+        Returns:
+            Start result
+        """
+        if not self.docker_client:
+            await self.initialize()
+
+        try:
+            container = await asyncio.to_thread(
+                self.docker_client.containers.get, container_id
+            )
+            await asyncio.to_thread(container.start)
+
+            return {
+                "status": "started",
+                "container_id": container.id,
+                "container_name": container.name,
+                "started_at": datetime.now().isoformat(),
+            }
+
+        except Exception as e:
+            return {"status": "error", "error": f"Failed to start container: {e}"}
+
+    async def stop_container(
+        self, container_id: str, timeout: int = 10
+    ) -> Dict[str, Any]:
+        """Stop Docker container.
+
+        Args:
+            container_id: Container ID or name
+            timeout: Stop timeout in seconds
+
+        Returns:
+            Stop result
+        """
+        if not self.docker_client:
+            await self.initialize()
+
+        try:
+            container = await asyncio.to_thread(
+                self.docker_client.containers.get, container_id
+            )
+            await asyncio.to_thread(container.stop, timeout=timeout)
+
+            return {
+                "status": "stopped",
+                "container_id": container.id,
+                "container_name": container.name,
+                "stopped_at": datetime.now().isoformat(),
+            }
+
+        except Exception as e:
+            return {"status": "error", "error": f"Failed to stop container: {e}"}
+
+    async def remove_container(
+        self, container_id: str, force: bool = False
+    ) -> Dict[str, Any]:
+        """Remove Docker container.
+
+        Args:
+            container_id: Container ID or name
+            force: Force removal
+
+        Returns:
+            Removal result
+        """
+        if not self.docker_client:
+            await self.initialize()
+
+        try:
+            container = await asyncio.to_thread(
+                self.docker_client.containers.get, container_id
+            )
+            await asyncio.to_thread(container.remove, force=force)
+
+            # Remove from tracking
+            self.containers.pop(container.id, None)
+            self.container_metrics.pop(container.id, None)
+
+            return {
+                "status": "removed",
+                "container_id": container.id,
+                "container_name": container.name,
+                "removed_at": datetime.now().isoformat(),
+            }
+
+        except Exception as e:
+            return {"status": "error", "error": f"Failed to remove container: {e}"}
+
+    async def get_container_status(self, container_id: str) -> Dict[str, Any]:
+        """Get container status.
+
+        Args:
+            container_id: Container ID or name
+
+        Returns:
+            Container status
+        """
+        if not self.docker_client:
+            await self.initialize()
+
+        try:
+            container = await asyncio.to_thread(
+                self.docker_client.containers.get, container_id
+            )
+            await asyncio.to_thread(container.reload)
+
+            return {
+                "container_id": container.id,
+                "container_name": container.name,
+                "status": container.status,
+                "state": container.attrs["State"],
+                "image": (
+                    container.image.tags[0]
+                    if container.image.tags
+                    else container.image.id
+                ),
+                "created_at": container.attrs["Created"],
+                "started_at": container.attrs["State"].get("StartedAt"),
+                "finished_at": container.attrs["State"].get("FinishedAt"),
+                "ports": container.ports,
+                "labels": container.labels,
+                "mounts": [
+                    {
+                        "source": mount["Source"],
+                        "destination": mount["Destination"],
+                        "mode": mount["Mode"],
+                        "type": mount["Type"],
+                    }
+                    for mount in container.attrs.get("Mounts", [])
+                ],
+            }
+
+        except Exception as e:
+            return {"status": "error", "error": f"Failed to get container status: {e}"}
+
+    async def list_containers(
+        self, all_containers: bool = False, filters: Optional[Dict[str, Any]] = None
+    ) -> List[Dict[str, Any]]:
+        """List Docker containers.
+
+        Args:
+            all_containers: Include stopped containers
+            filters: Container filters
+
+        Returns:
+            List of containers
+        """
+        if not self.docker_client:
+            await self.initialize()
+
+        try:
+            containers = await asyncio.to_thread(
+                self.docker_client.containers.list,
+                all=all_containers,
+                filters=filters or {},
+            )
+
+            container_list = []
+            for container in containers:
+                container_info = {
+                    "container_id": container.id,
+                    "container_name": container.name,
+                    "status": container.status,
+                    "image": (
+                        container.image.tags[0]
+                        if container.image.tags
+                        else container.image.id
+                    ),
+                    "created_at": container.attrs["Created"],
+                    "labels": container.labels,
+                    "ports": container.ports,
+                }
+                container_list.append(container_info)
+
+            return container_list
+
+        except Exception as e:
+            raise RuntimeError(f"Failed to list containers: {e}")
+
+    async def create_service(self, service_spec: ServiceSpec) -> Dict[str, Any]:
+        """Create Docker Swarm service.
+
+        Args:
+            service_spec: Service specification
+
+        Returns:
+            Creation result
+        """
+        if not self.docker_client:
+            await self.initialize()
+
+        if not self.swarm_enabled:
+            return {"status": "error", "error": "Docker Swarm is not enabled"}
+
+        try:
+            # Pull image if auto-pull is enabled
+            if self.auto_pull_images:
+                try:
+                    await asyncio.to_thread(
+                        self.docker_client.images.pull, service_spec.image
+                    )
+                except Exception:
+                    pass
+
+            # Create service
+            docker_spec = service_spec.to_docker_service_spec()
+            service = await asyncio.to_thread(
+                self.docker_client.services.create, **docker_spec
+            )
+
+            # Store service spec
+            self.services[service.id] = service_spec
+
+            return {
+                "status": "created",
+                "service_id": service.id,
+                "service_name": service_spec.name,
+                "image": service_spec.image,
+                "replicas": service_spec.replicas,
+                "created_at": datetime.now().isoformat(),
+            }
+
+        except Exception as e:
+            return {"status": "error", "error": f"Failed to create service: {e}"}
+
+    async def update_service(
+        self, service_id: str, service_spec: ServiceSpec
+    ) -> Dict[str, Any]:
+        """Update Docker Swarm service.
+
+        Args:
+            service_id: Service ID or name
+            service_spec: Updated service specification
+
+        Returns:
+            Update result
+        """
+        if not self.docker_client:
+            await self.initialize()
+
+        if not self.swarm_enabled:
+            return {"status": "error", "error": "Docker Swarm is not enabled"}
+
+        try:
+            service = await asyncio.to_thread(
+                self.docker_client.services.get, service_id
+            )
+            docker_spec = service_spec.to_docker_service_spec()
+
+            await asyncio.to_thread(service.update, **docker_spec)
+
+            # Update stored spec
+            self.services[service.id] = service_spec
+
+            return {
+                "status": "updated",
+                "service_id": service.id,
+                "service_name": service_spec.name,
+                "updated_at": datetime.now().isoformat(),
+            }
+
+        except Exception as e:
+            return {"status": "error", "error": f"Failed to update service: {e}"}
+
+    async def scale_service(self, service_id: str, replicas: int) -> Dict[str, Any]:
+        """Scale Docker Swarm service.
+
+        Args:
+            service_id: Service ID or name
+            replicas: Target replica count
+
+        Returns:
+            Scaling result
+        """
+        if not self.docker_client:
+            await self.initialize()
+
+        if not self.swarm_enabled:
+            return {"status": "error", "error": "Docker Swarm is not enabled"}
+
+        try:
+            service = await asyncio.to_thread(
+                self.docker_client.services.get, service_id
+            )
+            await asyncio.to_thread(service.scale, replicas)
+
+            return {
+                "status": "scaled",
+                "service_id": service.id,
+                "service_name": service.name,
+                "target_replicas": replicas,
+                "scaled_at": datetime.now().isoformat(),
+            }
+
+        except Exception as e:
+            return {"status": "error", "error": f"Failed to scale service: {e}"}
+
+    async def get_service_status(self, service_id: str) -> Dict[str, Any]:
+        """Get service status.
+
+        Args:
+            service_id: Service ID or name
+
+        Returns:
+            Service status
+        """
+        if not self.docker_client:
+            await self.initialize()
+
+        if not self.swarm_enabled:
+            return {"status": "error", "error": "Docker Swarm is not enabled"}
+
+        try:
+            service = await asyncio.to_thread(
+                self.docker_client.services.get, service_id
+            )
+            tasks = await asyncio.to_thread(service.tasks)
+
+            running_tasks = sum(
+                1 for task in tasks if task.get("Status", {}).get("State") == "running"
+            )
+            total_tasks = len(tasks)
+
+            return {
+                "service_id": service.id,
+                "service_name": service.name,
+                "mode": service.attrs["Spec"]["Mode"],
+                "replicas": service.attrs["Spec"]["Mode"]
+                .get("Replicated", {})
+                .get("Replicas", 0),
+                "running_tasks": running_tasks,
+                "total_tasks": total_tasks,
+                "image": service.attrs["Spec"]["TaskTemplate"]["ContainerSpec"][
+                    "Image"
+                ],
+                "created_at": service.attrs["CreatedAt"],
+                "updated_at": service.attrs["UpdatedAt"],
+                "labels": service.attrs["Spec"].get("Labels", {}),
+                "tasks": [
+                    {
+                        "id": task["ID"],
+                        "state": task.get("Status", {}).get("State"),
+                        "desired_state": task.get("DesiredState"),
+                        "node_id": task.get("NodeID"),
+                        "timestamp": task.get("Status", {}).get("Timestamp"),
+                    }
+                    for task in tasks
+                ],
+            }
+
+        except Exception as e:
+            return {"status": "error", "error": f"Failed to get service status: {e}"}
+
+    async def collect_container_metrics(
+        self, container_id: str
+    ) -> Optional[ContainerMetrics]:
+        """Collect container resource metrics.
+
+        Args:
+            container_id: Container ID
+
+        Returns:
+            Container metrics or None if failed
+        """
+        if not self.docker_client:
+            await self.initialize()
+
+        try:
+            container = await asyncio.to_thread(
+                self.docker_client.containers.get, container_id
+            )
+            stats = await asyncio.to_thread(container.stats, stream=False)
+
+            # Calculate CPU usage percentage
+            cpu_delta = (
+                stats["cpu_stats"]["cpu_usage"]["total_usage"]
+                - stats["precpu_stats"]["cpu_usage"]["total_usage"]
+            )
+            system_delta = (
+                stats["cpu_stats"]["system_cpu_usage"]
+                - stats["precpu_stats"]["system_cpu_usage"]
+            )
+
+            cpu_usage_percent = 0.0
+            if system_delta > 0:
+                cpu_usage_percent = (
+                    (cpu_delta / system_delta)
+                    * len(stats["cpu_stats"]["cpu_usage"].get("percpu_usage", [1]))
+                    * 100
+                )
+
+            # Memory usage
+            memory_usage = stats["memory_stats"]["usage"]
+            memory_limit = stats["memory_stats"]["limit"]
+
+            # Network I/O
+            networks = stats.get("networks", {})
+            network_rx = sum(net["rx_bytes"] for net in networks.values())
+            network_tx = sum(net["tx_bytes"] for net in networks.values())
+
+            # Block I/O
+            blkio_stats = stats.get("blkio_stats", {}).get(
+                "io_service_bytes_recursive", []
+            )
+            block_read = sum(
+                entry["value"] for entry in blkio_stats if entry["op"] == "Read"
+            )
+            block_write = sum(
+                entry["value"] for entry in blkio_stats if entry["op"] == "Write"
+            )
+
+            metrics = ContainerMetrics(
+                container_id=container.id,
+                container_name=container.name,
+                timestamp=datetime.now(),
+                cpu_usage_percent=cpu_usage_percent,
+                memory_usage_bytes=memory_usage,
+                memory_limit_bytes=memory_limit,
+                network_rx_bytes=network_rx,
+                network_tx_bytes=network_tx,
+                block_read_bytes=block_read,
+                block_write_bytes=block_write,
+            )
+
+            # Store metrics
+            self.container_metrics[container_id] = metrics
+
+            return metrics
+
+        except Exception:
+            return None
+
+    async def get_system_info(self) -> Dict[str, Any]:
+        """Get Docker system information.
+
+        Returns:
+            System information
+        """
+        if not self.docker_client:
+            await self.initialize()
+
+        try:
+            info = await asyncio.to_thread(self.docker_client.info)
+            version = await asyncio.to_thread(self.docker_client.version)
+
+            return {
+                "system_info": {
+                    "containers": info.get("Containers", 0),
+                    "containers_running": info.get("ContainersRunning", 0),
+                    "containers_paused": info.get("ContainersPaused", 0),
+                    "containers_stopped": info.get("ContainersStopped", 0),
+                    "images": info.get("Images", 0),
+                    "driver": info.get("Driver"),
+                    "memory_limit": info.get("MemoryLimit"),
+                    "swap_limit": info.get("SwapLimit"),
+                    "cpus": info.get("NCPU", 0),
+                    "memory": info.get("MemTotal", 0),
+                    "docker_root_dir": info.get("DockerRootDir"),
+                    "swarm": info.get("Swarm", {}),
+                },
+                "version_info": version,
+                "swarm_enabled": self.swarm_enabled,
+            }
+
+        except Exception as e:
+            return {"status": "error", "error": f"Failed to get system info: {e}"}
+
+    async def start_monitoring(self) -> None:
+        """Start container monitoring."""
+        if self._monitoring_task and not self._monitoring_task.done():
+            return
+
+        self._monitoring_task = asyncio.create_task(self._monitor_containers())
+        self._metrics_task = asyncio.create_task(self._collect_metrics())
+
+    async def stop_monitoring(self) -> None:
+        """Stop container monitoring."""
+        if self._monitoring_task and not self._monitoring_task.done():
+            self._monitoring_task.cancel()
+            try:
+                await self._monitoring_task
+            except asyncio.CancelledError:
+                pass
+
+        if self._metrics_task and not self._metrics_task.done():
+            self._metrics_task.cancel()
+            try:
+                await self._metrics_task
+            except asyncio.CancelledError:
+                pass
+
+    async def _monitor_containers(self) -> None:
+        """Monitor containers continuously."""
+        while True:
+            try:
+                # Get list of running containers
+                containers = await self.list_containers(all_containers=False)
+
+                # Update container status for tracked containers
+                for container_id in list(self.containers.keys()):
+                    try:
+                        status = await self.get_container_status(container_id)
+                        # Update internal tracking based on status
+                    except Exception:
+                        # Container might have been removed
+                        self.containers.pop(container_id, None)
+
+                await asyncio.sleep(self.monitoring_interval)
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                # Log error and continue monitoring
+                print(f"Container monitoring error: {e}")
+                await asyncio.sleep(self.monitoring_interval)
+
+    async def _collect_metrics(self) -> None:
+        """Collect container metrics continuously."""
+        while True:
+            try:
+                # Collect metrics for all running containers
+                containers = await self.list_containers(all_containers=False)
+
+                for container_info in containers:
+                    container_id = container_info["container_id"]
+                    if container_info["status"] == "running":
+                        await self.collect_container_metrics(container_id)
+
+                await asyncio.sleep(self.metrics_interval)
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                # Log error and continue collecting
+                print(f"Metrics collection error: {e}")
+                await asyncio.sleep(self.metrics_interval)
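For readers evaluating the new edge resource APIs, the following is a minimal usage sketch based only on the code shown in the hunk above; it is not taken from the package's documentation. The import path follows the new file's location (kailash/edge/resource/docker_integration.py); the image name, port mapping, and edge node label are placeholder values, and running it requires the optional docker dependency and a reachable Docker daemon.

# Illustrative sketch only -- exercises the DockerIntegration API added in 0.8.5.
import asyncio

from kailash.edge.resource.docker_integration import (
    ContainerSpec,
    DockerIntegration,
    RestartPolicyType,
)


async def main() -> None:
    integration = DockerIntegration()  # needs `pip install docker` and a running daemon
    await integration.initialize()

    # Placeholder image, port mapping, and edge node label.
    spec = ContainerSpec(
        name="edge-cache",
        image="redis:7-alpine",
        ports={"6379": 6379},  # container_port -> host_port
        memory_limit="256m",
        restart_policy=RestartPolicyType.ON_FAILURE,
        edge_node="edge-west-1",
    )

    created = await integration.create_container(spec)
    if created["status"] == "created":
        await integration.start_container(created["container_id"])
        metrics = await integration.collect_container_metrics(created["container_id"])
        if metrics:
            print(
                f"cpu={metrics.cpu_usage_percent:.1f}% "
                f"memory={metrics.memory_usage_percent:.1f}%"
            )


asyncio.run(main())

Instead of calling collect_container_metrics() directly, the module's own start_monitoring()/stop_monitoring() tasks can drive periodic status and metrics collection in the background.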