kailash 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. kailash/__init__.py +1 -1
  2. kailash/access_control/__init__.py +1 -1
  3. kailash/client/__init__.py +12 -0
  4. kailash/client/enhanced_client.py +306 -0
  5. kailash/core/actors/__init__.py +16 -0
  6. kailash/core/actors/adaptive_pool_controller.py +630 -0
  7. kailash/core/actors/connection_actor.py +566 -0
  8. kailash/core/actors/supervisor.py +364 -0
  9. kailash/core/ml/__init__.py +1 -0
  10. kailash/core/ml/query_patterns.py +544 -0
  11. kailash/core/monitoring/__init__.py +19 -0
  12. kailash/core/monitoring/connection_metrics.py +488 -0
  13. kailash/core/optimization/__init__.py +1 -0
  14. kailash/core/resilience/__init__.py +17 -0
  15. kailash/core/resilience/circuit_breaker.py +382 -0
  16. kailash/edge/__init__.py +16 -0
  17. kailash/edge/compliance.py +834 -0
  18. kailash/edge/discovery.py +659 -0
  19. kailash/edge/location.py +582 -0
  20. kailash/gateway/__init__.py +33 -0
  21. kailash/gateway/api.py +289 -0
  22. kailash/gateway/enhanced_gateway.py +357 -0
  23. kailash/gateway/resource_resolver.py +217 -0
  24. kailash/gateway/security.py +227 -0
  25. kailash/middleware/auth/access_control.py +6 -6
  26. kailash/middleware/auth/models.py +2 -2
  27. kailash/middleware/communication/ai_chat.py +7 -7
  28. kailash/middleware/communication/api_gateway.py +5 -15
  29. kailash/middleware/database/base_models.py +1 -7
  30. kailash/middleware/gateway/__init__.py +22 -0
  31. kailash/middleware/gateway/checkpoint_manager.py +398 -0
  32. kailash/middleware/gateway/deduplicator.py +382 -0
  33. kailash/middleware/gateway/durable_gateway.py +417 -0
  34. kailash/middleware/gateway/durable_request.py +498 -0
  35. kailash/middleware/gateway/event_store.py +499 -0
  36. kailash/middleware/mcp/enhanced_server.py +2 -2
  37. kailash/nodes/admin/permission_check.py +817 -33
  38. kailash/nodes/admin/role_management.py +1242 -108
  39. kailash/nodes/admin/schema_manager.py +438 -0
  40. kailash/nodes/admin/user_management.py +1124 -1582
  41. kailash/nodes/code/__init__.py +8 -1
  42. kailash/nodes/code/async_python.py +1035 -0
  43. kailash/nodes/code/python.py +1 -0
  44. kailash/nodes/data/async_sql.py +9 -3
  45. kailash/nodes/data/query_pipeline.py +641 -0
  46. kailash/nodes/data/query_router.py +895 -0
  47. kailash/nodes/data/sql.py +20 -11
  48. kailash/nodes/data/workflow_connection_pool.py +1071 -0
  49. kailash/nodes/monitoring/__init__.py +3 -5
  50. kailash/nodes/monitoring/connection_dashboard.py +822 -0
  51. kailash/nodes/rag/__init__.py +2 -7
  52. kailash/resources/__init__.py +40 -0
  53. kailash/resources/factory.py +533 -0
  54. kailash/resources/health.py +319 -0
  55. kailash/resources/reference.py +288 -0
  56. kailash/resources/registry.py +392 -0
  57. kailash/runtime/async_local.py +711 -302
  58. kailash/testing/__init__.py +34 -0
  59. kailash/testing/async_test_case.py +353 -0
  60. kailash/testing/async_utils.py +345 -0
  61. kailash/testing/fixtures.py +458 -0
  62. kailash/testing/mock_registry.py +495 -0
  63. kailash/workflow/__init__.py +8 -0
  64. kailash/workflow/async_builder.py +621 -0
  65. kailash/workflow/async_patterns.py +766 -0
  66. kailash/workflow/cyclic_runner.py +107 -16
  67. kailash/workflow/graph.py +7 -2
  68. kailash/workflow/resilience.py +11 -1
  69. {kailash-0.5.0.dist-info → kailash-0.6.1.dist-info}/METADATA +19 -4
  70. {kailash-0.5.0.dist-info → kailash-0.6.1.dist-info}/RECORD +74 -28
  71. {kailash-0.5.0.dist-info → kailash-0.6.1.dist-info}/WHEEL +0 -0
  72. {kailash-0.5.0.dist-info → kailash-0.6.1.dist-info}/entry_points.txt +0 -0
  73. {kailash-0.5.0.dist-info → kailash-0.6.1.dist-info}/licenses/LICENSE +0 -0
  74. {kailash-0.5.0.dist-info → kailash-0.6.1.dist-info}/top_level.txt +0 -0
kailash/nodes/data/workflow_connection_pool.py
@@ -0,0 +1,1071 @@
+ """Workflow-scoped connection pool for production-grade database management.
+
+ This module implements a connection pool that is scoped to workflow lifecycle,
+ providing better resource management and isolation compared to global pools.
+ """
+
+ import asyncio
+ import logging
+ import time
+ import uuid
+ from collections import defaultdict
+ from datetime import datetime
+ from typing import Any, Dict, List, Optional, Set
+
+ from kailash.core.actors import (
+     ActorConnection,
+     ActorSupervisor,
+     ConnectionActor,
+     ConnectionState,
+     SupervisionStrategy,
+ )
+ from kailash.core.actors.adaptive_pool_controller import AdaptivePoolController
+ from kailash.core.ml.query_patterns import QueryPatternTracker
+ from kailash.core.monitoring.connection_metrics import (
+     ConnectionMetricsCollector,
+     ErrorCategory,
+ )
+ from kailash.core.resilience.circuit_breaker import (
+     CircuitBreakerConfig,
+     CircuitBreakerError,
+     ConnectionCircuitBreaker,
+ )
+ from kailash.nodes.base import NodeParameter, register_node
+ from kailash.nodes.base_async import AsyncNode
+ from kailash.sdk_exceptions import NodeExecutionError
+
+ logger = logging.getLogger(__name__)
+
+
+ class ConnectionPoolMetrics:
+     """Metrics collector for connection pool monitoring."""
+
+     def __init__(self, pool_name: str):
+         self.pool_name = pool_name
+         self.connections_created = 0
+         self.connections_recycled = 0
+         self.connections_failed = 0
+         self.queries_executed = 0
+         self.query_errors = 0
+         self.acquisition_wait_times: List[float] = []
+         self.health_check_results: List[bool] = []
+         self.start_time = time.time()
+
+     def record_acquisition_time(self, wait_time: float):
+         """Record time waited to acquire connection."""
+         self.acquisition_wait_times.append(wait_time)
+         # Keep only last 1000 measurements
+         if len(self.acquisition_wait_times) > 1000:
+             self.acquisition_wait_times = self.acquisition_wait_times[-1000:]
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get comprehensive pool statistics."""
+         uptime = time.time() - self.start_time
+
+         # Calculate averages
+         avg_wait_time = (
+             sum(self.acquisition_wait_times) / len(self.acquisition_wait_times)
+             if self.acquisition_wait_times
+             else 0.0
+         )
+
+         health_success_rate = (
+             sum(1 for h in self.health_check_results if h)
+             / len(self.health_check_results)
+             if self.health_check_results
+             else 1.0
+         )
+
+         return {
+             "pool_name": self.pool_name,
+             "uptime_seconds": uptime,
+             "connections": {
+                 "created": self.connections_created,
+                 "recycled": self.connections_recycled,
+                 "failed": self.connections_failed,
+             },
+             "queries": {
+                 "executed": self.queries_executed,
+                 "errors": self.query_errors,
+                 "error_rate": (
+                     self.query_errors / self.queries_executed
+                     if self.queries_executed > 0
+                     else 0
+                 ),
+             },
+             "performance": {
+                 "avg_acquisition_time_ms": avg_wait_time * 1000,
+                 "p99_acquisition_time_ms": (
+                     sorted(self.acquisition_wait_times)[
+                         int(len(self.acquisition_wait_times) * 0.99)
+                     ]
+                     * 1000
+                     if self.acquisition_wait_times
+                     else 0
+                 ),
+             },
+             "health": {
+                 "success_rate": health_success_rate,
+                 "checks_performed": len(self.health_check_results),
+             },
+         }
+
+
+ class WorkflowPatternAnalyzer:
+     """Analyzes workflow patterns for optimization."""
+
+     def __init__(self):
+         self.workflow_patterns: Dict[str, Dict[str, Any]] = {}
+         self.connection_usage: Dict[str, List[float]] = defaultdict(list)
+
+     def record_workflow_start(self, workflow_id: str, workflow_type: str):
+         """Record workflow start for pattern analysis."""
+         self.workflow_patterns[workflow_id] = {
+             "type": workflow_type,
+             "start_time": time.time(),
+             "connections_used": 0,
+             "peak_connections": 0,
+         }
+
+     def record_connection_usage(self, workflow_id: str, active_connections: int):
+         """Record connection usage for workflow."""
+         if workflow_id in self.workflow_patterns:
+             pattern = self.workflow_patterns[workflow_id]
+             pattern["connections_used"] = max(
+                 pattern["connections_used"], active_connections
+             )
+             self.connection_usage[workflow_id].append(active_connections)
+
+     def get_expected_connections(self, workflow_type: str) -> int:
+         """Get expected connection count for workflow type."""
+         # Analyze historical data for this workflow type
+         similar_workflows = [
+             p
+             for p in self.workflow_patterns.values()
+             if p["type"] == workflow_type and "connections_used" in p
+         ]
+
+         if not similar_workflows:
+             return 2  # Default
+
+         # Return 90th percentile of historical usage
+         usage_values = sorted([w["connections_used"] for w in similar_workflows])
+         percentile_index = int(len(usage_values) * 0.9)
+         return usage_values[percentile_index] if usage_values else 2
+
+
+ @register_node()
+ class WorkflowConnectionPool(AsyncNode):
+     """
+     Workflow-scoped connection pool with production-grade features.
+
+     This node provides:
+     - Connections scoped to workflow lifecycle
+     - Actor-based isolation for each connection
+     - Automatic health monitoring and recycling
+     - Pattern-based pre-warming
+     - Comprehensive metrics and monitoring
+
+     Example:
+         >>> pool = WorkflowConnectionPool(
+         ...     name="workflow_db_pool",
+         ...     database_type="postgresql",
+         ...     host="localhost",
+         ...     database="myapp",
+         ...     user="dbuser",
+         ...     password="dbpass",
+         ...     min_connections=2,
+         ...     max_connections=10
+         ... )
+         >>>
+         >>> # Get connection
+         >>> result = await pool.process({"operation": "acquire"})
+         >>> conn_id = result["connection_id"]
+         >>>
+         >>> # Execute query
+         >>> query_result = await pool.process({
+         ...     "operation": "execute",
+         ...     "connection_id": conn_id,
+         ...     "query": "SELECT * FROM users WHERE active = true",
+         ... })
+     """
+
+     def __init__(self, **config):
+         super().__init__(**config)
+
+         # Pool configuration
+         self.min_connections = config.get("min_connections", 2)
+         self.max_connections = config.get("max_connections", 10)
+         self.health_threshold = config.get("health_threshold", 50)
+         self.pre_warm_enabled = config.get("pre_warm", True)
+         self.adaptive_sizing_enabled = config.get("adaptive_sizing", False)
+         self.enable_query_routing = config.get("enable_query_routing", False)
+
+         # Database configuration
+         self.db_config = {
+             "type": config.get("database_type", "postgresql"),
+             "host": config.get("host"),
+             "port": config.get("port"),
+             "database": config.get("database"),
+             "user": config.get("user"),
+             "password": config.get("password"),
+             "connection_string": config.get("connection_string"),
+         }
+
+         # Actor supervision
+         self.supervisor = ActorSupervisor(
+             name=f"{self.metadata.name}_supervisor",
+             strategy=SupervisionStrategy.ONE_FOR_ONE,
+             max_restarts=3,
+             restart_window=60.0,
+         )
+
+         # Connection tracking
+         self.available_connections: asyncio.Queue = asyncio.Queue()
+         self.active_connections: Dict[str, ConnectionActor] = {}
+         self.all_connections: Dict[str, ConnectionActor] = {}
+
+         # Workflow integration
+         self.workflow_id: Optional[str] = None
+         self.pattern_analyzer = WorkflowPatternAnalyzer()
+
+         # Metrics
+         self.metrics = ConnectionPoolMetrics(self.metadata.name)
+
+         # State
+         self._initialized = False
+         self._closing = False
+
+         # Phase 2 components
+         self.query_pattern_tracker = None
+         self.adaptive_controller = None
+
+         if self.enable_query_routing:
+             self.query_pattern_tracker = QueryPatternTracker()
+
+         if self.adaptive_sizing_enabled:
+             self.adaptive_controller = AdaptivePoolController(
+                 min_size=self.min_connections, max_size=self.max_connections
+             )
+
+         # Phase 3 components
+         # Circuit breaker for connection failures
+         self.circuit_breaker_config = CircuitBreakerConfig(
+             failure_threshold=config.get("circuit_breaker_failure_threshold", 5),
+             recovery_timeout=config.get("circuit_breaker_recovery_timeout", 60),
+             error_rate_threshold=config.get("circuit_breaker_error_rate", 0.5),
+         )
+         self.circuit_breaker = ConnectionCircuitBreaker(self.circuit_breaker_config)
+
+         # Comprehensive metrics collector
+         self.metrics_collector = ConnectionMetricsCollector(
+             pool_name=self.metadata.name,
+             retention_minutes=config.get("metrics_retention_minutes", 60),
+         )
+
+         # Enable query pipelining support
+         self.enable_pipelining = config.get("enable_pipelining", False)
+         self.pipeline_batch_size = config.get("pipeline_batch_size", 100)
+
+         # Monitoring dashboard integration
+         self.enable_monitoring = config.get("enable_monitoring", False)
+         self.monitoring_port = config.get("monitoring_port", 8080)
+
+     def get_parameters(self) -> Dict[str, NodeParameter]:
+         """Define node parameters."""
+         params = [
+             # Database connection parameters
+             NodeParameter(
+                 name="database_type",
+                 type=str,
+                 required=True,
+                 default="postgresql",
+                 description="Database type: postgresql, mysql, or sqlite",
+             ),
+             NodeParameter(
+                 name="connection_string",
+                 type=str,
+                 required=False,
+                 description="Full connection string (overrides individual params)",
+             ),
+             NodeParameter(
+                 name="host", type=str, required=False, description="Database host"
+             ),
+             NodeParameter(
+                 name="port", type=int, required=False, description="Database port"
+             ),
+             NodeParameter(
+                 name="database", type=str, required=False, description="Database name"
+             ),
+             NodeParameter(
+                 name="user", type=str, required=False, description="Database user"
+             ),
+             NodeParameter(
+                 name="password",
+                 type=str,
+                 required=False,
+                 description="Database password",
+             ),
+             # Pool configuration
+             NodeParameter(
+                 name="min_connections",
+                 type=int,
+                 required=False,
+                 default=2,
+                 description="Minimum pool connections",
+             ),
+             NodeParameter(
+                 name="max_connections",
+                 type=int,
+                 required=False,
+                 default=10,
+                 description="Maximum pool connections",
+             ),
+             NodeParameter(
+                 name="health_threshold",
+                 type=int,
+                 required=False,
+                 default=50,
+                 description="Minimum health score to keep connection",
+             ),
+             NodeParameter(
+                 name="pre_warm",
+                 type=bool,
+                 required=False,
+                 default=True,
+                 description="Enable pattern-based pre-warming",
+             ),
+             NodeParameter(
+                 name="adaptive_sizing",
+                 type=bool,
+                 required=False,
+                 default=False,
+                 description="Enable adaptive pool sizing based on workload",
+             ),
+             NodeParameter(
+                 name="enable_query_routing",
+                 type=bool,
+                 required=False,
+                 default=False,
+                 description="Enable query pattern tracking for routing optimization",
+             ),
+             # Phase 3 parameters
+             NodeParameter(
+                 name="circuit_breaker_failure_threshold",
+                 type=int,
+                 required=False,
+                 default=5,
+                 description="Failures before circuit breaker opens",
+             ),
+             NodeParameter(
+                 name="circuit_breaker_recovery_timeout",
+                 type=int,
+                 required=False,
+                 default=60,
+                 description="Seconds before circuit breaker tries recovery",
+             ),
+             NodeParameter(
+                 name="circuit_breaker_error_rate",
+                 type=float,
+                 required=False,
+                 default=0.5,
+                 description="Error rate threshold to open circuit",
+             ),
+             NodeParameter(
+                 name="metrics_retention_minutes",
+                 type=int,
+                 required=False,
+                 default=60,
+                 description="How long to retain detailed metrics",
+             ),
+             NodeParameter(
+                 name="enable_pipelining",
+                 type=bool,
+                 required=False,
+                 default=False,
+                 description="Enable query pipelining for batch operations",
+             ),
+             NodeParameter(
+                 name="pipeline_batch_size",
+                 type=int,
+                 required=False,
+                 default=100,
+                 description="Maximum queries per pipeline batch",
+             ),
+             NodeParameter(
+                 name="enable_monitoring",
+                 type=bool,
+                 required=False,
+                 default=False,
+                 description="Enable monitoring dashboard",
+             ),
+             NodeParameter(
+                 name="monitoring_port",
+                 type=int,
+                 required=False,
+                 default=8080,
+                 description="Port for monitoring dashboard",
+             ),
+             # Operation parameters
+             NodeParameter(
+                 name="operation",
+                 type=str,
+                 required=True,
+                 description="Operation: initialize, acquire, release, execute, stats",
+             ),
+             NodeParameter(
+                 name="connection_id",
+                 type=str,
+                 required=False,
+                 description="Connection ID for operations",
+             ),
+             NodeParameter(
+                 name="query",
+                 type=str,
+                 required=False,
+                 description="SQL query to execute",
+             ),
+             NodeParameter(
+                 name="params", type=Any, required=False, description="Query parameters"
+             ),
+             NodeParameter(
+                 name="fetch_mode",
+                 type=str,
+                 required=False,
+                 default="all",
+                 description="Fetch mode: one, all, many",
+             ),
+         ]
+
+         # Convert list to dict as required by base class
+         return {param.name: param for param in params}
+
+     async def on_workflow_start(
+         self, workflow_id: str, workflow_type: Optional[str] = None
+     ):
+         """Called when workflow starts - pre-warm connections."""
+         self.workflow_id = workflow_id
+         self.pattern_analyzer.record_workflow_start(
+             workflow_id, workflow_type or "unknown"
+         )
+
+         if self.pre_warm_enabled and workflow_type:
+             expected_connections = self.pattern_analyzer.get_expected_connections(
+                 workflow_type
+             )
+             await self._pre_warm_connections(expected_connections)
+
+     async def on_workflow_complete(self, workflow_id: str):
+         """Called when workflow completes - clean up resources."""
+         if workflow_id == self.workflow_id:
+             await self._cleanup()
+
+     async def async_run(self, **inputs) -> Dict[str, Any]:
+         """Process connection pool operations."""
+         operation = inputs.get("operation")
+
+         if operation == "initialize":
+             return await self._initialize()
+         elif operation == "acquire":
+             return await self._acquire_connection()
+         elif operation == "release":
+             return await self._release_connection(inputs.get("connection_id"))
+         elif operation == "execute":
+             return await self._execute_query(inputs)
+         elif operation == "stats":
+             return await self._get_stats()
+         elif operation == "get_status":
+             return await self._get_pool_status()
+         elif operation == "adjust_pool_size":
+             return await self.adjust_pool_size(inputs.get("new_size"))
+         elif operation == "get_pool_statistics":
+             return await self.get_pool_statistics()
+         elif operation == "get_comprehensive_status":
+             return await self.get_comprehensive_status()
+         elif operation == "start_monitoring":
+             return await self._start_monitoring_dashboard()
+         elif operation == "stop_monitoring":
+             return await self._stop_monitoring_dashboard()
+         elif operation == "export_metrics":
+             return {"prometheus_metrics": self.metrics_collector.export_prometheus()}
+         else:
+             raise NodeExecutionError(f"Unknown operation: {operation}")
+
+     async def _initialize(self) -> Dict[str, Any]:
+         """Initialize the connection pool."""
+         if self._initialized:
+             return {"status": "already_initialized"}
+
+         try:
+             # Start supervisor
+             await self.supervisor.start()
+
+             # Set up callbacks
+             self.supervisor.on_actor_failure = self._on_connection_failure
+             self.supervisor.on_actor_restart = self._on_connection_restart
+
+             # Create minimum connections
+             await self._ensure_min_connections()
+
+             # Start adaptive controller if enabled
+             if self.adaptive_controller:
+                 await self.adaptive_controller.start(
+                     pool_ref=self, pattern_tracker=self.query_pattern_tracker
+                 )
+
+             self._initialized = True
+
+             return {
+                 "status": "initialized",
+                 "min_connections": self.min_connections,
+                 "max_connections": self.max_connections,
+                 "adaptive_sizing": self.adaptive_sizing_enabled,
+                 "query_routing": self.enable_query_routing,
+             }
+
+         except Exception as e:
+             logger.error(f"Failed to initialize pool: {e}")
+             raise NodeExecutionError(f"Pool initialization failed: {e}")
+
+     async def _acquire_connection(self) -> Dict[str, Any]:
+         """Acquire a connection from the pool."""
+         if not self._initialized:
+             await self._initialize()
+
+         start_time = time.time()
+
+         try:
+             # Use circuit breaker to protect connection acquisition
+             async def acquire_with_circuit_breaker():
+                 # Try to get available connection
+                 connection = None
+
+                 # Fast path: try to get immediately available connection
+                 try:
+                     connection = await asyncio.wait_for(
+                         self.available_connections.get(), timeout=0.1
+                     )
+                 except asyncio.TimeoutError:
+                     # Need to create new connection or wait
+                     if len(self.all_connections) < self.max_connections:
+                         # Create new connection
+                         connection = await self._create_connection()
+                         # Don't put it in available queue - we'll use it directly
+                     else:
+                         # Wait for available connection
+                         connection = await self.available_connections.get()
+
+                 return connection
+
+             # Execute with circuit breaker protection
+             connection = await self.circuit_breaker.call(acquire_with_circuit_breaker)
+
+             # Record acquisition time
+             wait_time = time.time() - start_time
+             self.metrics.record_acquisition_time(wait_time)
+
+             # Track in comprehensive metrics
+             with self.metrics_collector.track_acquisition() as timer:
+                 pass  # Already acquired, just recording time
+
+             # Move to active
+             self.active_connections[connection.id] = connection
+
+             # Update pattern analyzer
+             if self.workflow_id:
+                 self.pattern_analyzer.record_connection_usage(
+                     self.workflow_id, len(self.active_connections)
+                 )
+
+             return {
+                 "connection_id": connection.id,
+                 "health_score": connection.health_score,
+                 "acquisition_time_ms": wait_time * 1000,
+             }
+
+         except CircuitBreakerError as e:
+             # Circuit is open - pool is experiencing failures
+             self.metrics_collector.track_pool_exhaustion()
+             logger.error(f"Circuit breaker open: {e}")
+             raise NodeExecutionError(f"Connection pool circuit breaker open: {e}")
+         except Exception as e:
+             logger.error(f"Failed to acquire connection: {e}")
+             self.metrics_collector.track_query_error("ACQUIRE", e)
+             raise NodeExecutionError(f"Connection acquisition failed: {e}")
+
+     async def _release_connection(self, connection_id: Optional[str]) -> Dict[str, Any]:
+         """Release a connection back to the pool."""
+         if not connection_id:
+             raise NodeExecutionError("connection_id required for release")
+
+         if connection_id not in self.active_connections:
+             raise NodeExecutionError(f"Connection {connection_id} not active")
+
+         connection = self.active_connections.pop(connection_id)
+
+         # Check if connection should be recycled
+         if connection.health_score < self.health_threshold:
+             await self._recycle_connection(connection)
+             return {"status": "recycled", "connection_id": connection_id}
+         else:
+             # Return to available pool
+             await self.available_connections.put(connection)
+             return {"status": "released", "connection_id": connection_id}
+
+     async def _execute_query(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+         """Execute a query on a specific connection."""
+         connection_id = inputs.get("connection_id")
+         if not connection_id or connection_id not in self.active_connections:
+             raise NodeExecutionError(f"Invalid connection_id: {connection_id}")
+
+         connection = self.active_connections[connection_id]
+
+         # Determine query type for metrics
+         query = inputs.get("query", "").strip().upper()
+         query_type = "UNKNOWN"
+         if query.startswith("SELECT"):
+             query_type = "SELECT"
+         elif query.startswith("INSERT"):
+             query_type = "INSERT"
+         elif query.startswith("UPDATE"):
+             query_type = "UPDATE"
+         elif query.startswith("DELETE"):
+             query_type = "DELETE"
+
+         try:
+             # Execute query with comprehensive metrics tracking
+             with self.metrics_collector.track_query(query_type) as timer:
+                 result = await connection.execute(
+                     query=inputs.get("query"),
+                     params=inputs.get("params"),
+                     fetch_mode=inputs.get("fetch_mode", "all"),
+                 )
+
+             # Update metrics
+             self.metrics.queries_executed += 1
+             if not result.success:
+                 self.metrics.query_errors += 1
+                 self.metrics_collector.track_query_error(
+                     query_type, Exception(result.error)
+                 )
+
+             # Track query pattern if enabled
+             if self.query_pattern_tracker and inputs.get("query"):
+                 self.query_pattern_tracker.record_execution(
+                     fingerprint=inputs.get("query_fingerprint", inputs.get("query")),
+                     execution_time_ms=result.execution_time * 1000,
+                     connection_id=connection_id,
+                     parameters=inputs.get("params", {}),
+                     success=result.success,
+                     result_size=len(result.data) if result.data else 0,
+                 )
+
+             return {
+                 "success": result.success,
+                 "data": result.data,
+                 "error": result.error,
+                 "execution_time_ms": result.execution_time * 1000,
+                 "connection_id": connection_id,
+             }
+
+         except Exception as e:
+             self.metrics.query_errors += 1
+             logger.error(f"Query execution failed: {e}")
+             raise NodeExecutionError(f"Query execution failed: {e}")
+
+     async def _get_stats(self) -> Dict[str, Any]:
+         """Get comprehensive pool statistics."""
+         pool_stats = self.metrics.get_stats()
+         supervisor_stats = self.supervisor.get_stats()
+
+         # Add current pool state
+         pool_stats["current_state"] = {
+             "total_connections": len(self.all_connections),
+             "active_connections": len(self.active_connections),
+             "available_connections": self.available_connections.qsize(),
+             "health_scores": {
+                 conn_id: conn.health_score
+                 for conn_id, conn in self.all_connections.items()
+             },
+         }
+
+         pool_stats["supervisor"] = supervisor_stats
+
+         return pool_stats
+
+     async def _create_connection(self) -> ConnectionActor:
+         """Create a new connection actor."""
+         conn_id = f"conn_{uuid.uuid4().hex[:8]}"
+
+         # Create actor connection
+         actor_conn = ActorConnection(
+             connection_id=conn_id,
+             db_config=self.db_config,
+             health_check_interval=30.0,
+             max_lifetime=3600.0,
+             max_idle_time=600.0,
+         )
+
+         # Add to supervisor
+         self.supervisor.add_actor(actor_conn)
+
+         # Create high-level interface
+         connection = ConnectionActor(actor_conn)
+
+         # Track connection
+         self.all_connections[conn_id] = connection
+         self.metrics.connections_created += 1
+
+         logger.info(f"Created connection {conn_id} for pool {self.metadata.name}")
+
+         return connection
+
+     async def _ensure_min_connections(self):
+         """Ensure minimum connections are available."""
+         current_count = len(self.all_connections)
+
+         for _ in range(self.min_connections - current_count):
+             connection = await self._create_connection()
+             await self.available_connections.put(connection)
+
+     async def _pre_warm_connections(self, target_count: int):
+         """Pre-warm connections based on expected usage."""
+         current_count = len(self.all_connections)
+         to_create = min(
+             target_count - current_count, self.max_connections - current_count
+         )
+
+         if to_create > 0:
+             logger.info(
+                 f"Pre-warming {to_create} connections for pool {self.metadata.name}"
+             )
+
+             # Create connections in parallel
+             tasks = [self._create_connection() for _ in range(to_create)]
+             connections = await asyncio.gather(*tasks)
+
+             # Add to available pool
+             for conn in connections:
+                 await self.available_connections.put(conn)
+
+     async def _recycle_connection(self, connection: ConnectionActor):
+         """Recycle a connection."""
+         logger.info(
+             f"Recycling connection {connection.id} (health: {connection.health_score})"
+         )
+
+         # Remove from all connections
+         if connection.id in self.all_connections:
+             del self.all_connections[connection.id]
+
+         # Request recycling
+         await connection.recycle()
+
+         # Update metrics
+         self.metrics.connections_recycled += 1
+
+         # Ensure minimum connections
+         await self._ensure_min_connections()
+
+     async def _cleanup(self):
+         """Clean up all connections and resources."""
+         if self._closing:
+             return
+
+         self._closing = True
+         logger.info(f"Cleaning up pool {self.metadata.name}")
+
+         # Stop accepting new connections
+         self._initialized = False
+
+         # Stop adaptive controller if running
+         if self.adaptive_controller:
+             await self.adaptive_controller.stop()
+
+         # Stop all connection actors gracefully
+         actors_to_stop = list(self.all_connections.values())
+         for actor in actors_to_stop:
+             try:
+                 await actor.stop()
+             except Exception as e:
+                 logger.warning(f"Error stopping actor {actor.id}: {e}")
+
+         # Stop supervisor
+         try:
+             await self.supervisor.stop()
+         except Exception as e:
+             logger.warning(f"Error stopping supervisor: {e}")
+
+         # Clear connection tracking
+         self.available_connections = asyncio.Queue()
+         self.active_connections.clear()
+         self.all_connections.clear()
+
+         logger.info(f"Pool {self.metadata.name} cleaned up")
+
+     def _on_connection_failure(self, actor_id: str, error: Exception):
+         """Handle connection failure."""
+         logger.error(f"Connection {actor_id} failed: {error}")
+         self.metrics.connections_failed += 1
+
+         # Remove from tracking
+         if actor_id in self.all_connections:
+             del self.all_connections[actor_id]
+         if actor_id in self.active_connections:
+             del self.active_connections[actor_id]
+
+     def _on_connection_restart(self, actor_id: str, restart_count: int):
+         """Handle connection restart."""
+         logger.info(f"Connection {actor_id} restarted (count: {restart_count})")
+
+     async def process(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+         """Async process method for middleware compatibility."""
+         return await self.async_run(**inputs)
+
+     async def __aenter__(self):
+         """Context manager entry."""
+         await self._initialize()
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         """Context manager exit."""
+         await self._cleanup()
+
+     async def _get_pool_status(self) -> Dict[str, Any]:
+         """Get pool status for query router."""
+         connections = {}
+
+         for conn_id, conn in self.all_connections.items():
+             connections[conn_id] = {
+                 "health_score": conn.health_score,
+                 "active_queries": 1 if conn_id in self.active_connections else 0,
+                 "capabilities": [
+                     "read",
+                     "write",
+                 ],  # TODO: Add actual capability detection
+                 "avg_latency_ms": 0.0,  # TODO: Track actual latency
+                 "last_used": datetime.now().isoformat(),
+             }
+
+         return {
+             "connections": connections,
+             "pool_size": len(self.all_connections),
+             "active_count": len(self.active_connections),
+             "available_count": self.available_connections.qsize(),
+         }
+
+     async def adjust_pool_size(self, new_size: int) -> Dict[str, Any]:
+         """Dynamically adjust pool size."""
+         if new_size < self.min_connections or new_size > self.max_connections:
+             return {
+                 "success": False,
+                 "reason": f"Size must be between {self.min_connections} and {self.max_connections}",
+             }
+
+         current_size = len(self.all_connections)
+
+         if new_size > current_size:
+             # Scale up
+             connections_to_add = new_size - current_size
+             for _ in range(connections_to_add):
+                 try:
+                     await self._create_connection()
+                 except Exception as e:
+                     logger.error(f"Failed to create connection during scale up: {e}")
+
+         elif new_size < current_size:
+             # Scale down - remove idle connections first
+             connections_to_remove = current_size - new_size
+             removed = 0
+
+             # Try to remove idle connections
+             while (
+                 removed < connections_to_remove
+                 and not self.available_connections.empty()
+             ):
+                 try:
+                     conn = await asyncio.wait_for(
+                         self.available_connections.get(), timeout=0.1
+                     )
+                     await self._recycle_connection(conn)
+                     removed += 1
+                 except asyncio.TimeoutError:
+                     break
+
+         return {
+             "success": True,
+             "previous_size": current_size,
+             "new_size": len(self.all_connections),
+         }
+
+     async def get_pool_statistics(self) -> Dict[str, Any]:
+         """Get detailed pool statistics for adaptive sizing."""
+         total_connections = len(self.all_connections)
+         active_connections = len(self.active_connections)
+         idle_connections = self.available_connections.qsize()
+
+         # Calculate metrics
+         utilization_rate = (
+             active_connections / total_connections if total_connections > 0 else 0
+         )
+
+         # Get average health score
+         health_scores = [conn.health_score for conn in self.all_connections.values()]
+         avg_health_score = (
+             sum(health_scores) / len(health_scores) if health_scores else 100
+         )
+
+         # Queue depth (approximate based on waiters)
+         queue_depth = 0  # TODO: Track actual queue depth
+
+         # Get timing metrics from pool metrics
+         stats = self.metrics.get_stats()
+
+         return {
+             "total_connections": total_connections,
+             "active_connections": active_connections,
+             "idle_connections": idle_connections,
+             "queue_depth": queue_depth,
+             "utilization_rate": utilization_rate,
+             "avg_health_score": avg_health_score,
+             "avg_acquisition_time_ms": stats["performance"]["avg_acquisition_time_ms"],
+             "avg_query_time_ms": 50.0,  # TODO: Track actual query time
+             "queries_per_second": (
+                 stats["queries"]["executed"] / stats["uptime_seconds"]
+                 if stats["uptime_seconds"] > 0
+                 else 0
+             ),
+             # Phase 3 additions
+             "circuit_breaker_status": self.circuit_breaker.get_status(),
+             "comprehensive_metrics": self.metrics_collector.get_all_metrics(),
+             "error_rate": self.metrics_collector.get_error_summary()["error_rate"],
+             "health_score": avg_health_score,
+             "pool_name": self.metadata.name,
+         }
+
+     async def get_comprehensive_status(self) -> Dict[str, Any]:
+         """Get comprehensive status including all Phase 3 features."""
+         base_stats = await self.get_pool_statistics()
+
+         # Add circuit breaker details
+         cb_status = self.circuit_breaker.get_status()
+
+         # Add comprehensive metrics
+         metrics = self.metrics_collector.get_all_metrics()
+
+         # Add pattern learning insights if enabled
+         pattern_insights = {}
+         if self.query_pattern_tracker:
+             patterns = self.query_pattern_tracker.get_all_patterns()
+             pattern_insights = {
+                 "detected_patterns": len(patterns),
+                 "workload_forecast": self.query_pattern_tracker.get_workload_forecast(
+                     15
+                 ),
+             }
+
+         # Add adaptive controller status if enabled
+         adaptive_status = {}
+         if self.adaptive_controller:
+             adaptive_status = {
+                 "current_size": len(self.all_connections),
+                 "recommended_size": self.adaptive_controller.get_recommended_size(),
+                 "last_adjustment": self.adaptive_controller.get_last_adjustment(),
+             }
+
+         return {
+             **base_stats,
+             "circuit_breaker": {
+                 "state": cb_status["state"],
+                 "metrics": cb_status["metrics"],
+                 "time_until_recovery": cb_status.get("time_until_recovery"),
+             },
+             "detailed_metrics": {
+                 "counters": metrics["counters"],
+                 "gauges": metrics["gauges"],
+                 "histograms": metrics["histograms"],
+                 "errors": metrics["errors"],
+                 "query_summary": metrics["queries"],
+             },
+             "pattern_insights": pattern_insights,
+             "adaptive_control": adaptive_status,
+             "monitoring": {
+                 "dashboard_enabled": self.enable_monitoring,
+                 "dashboard_url": (
+                     f"http://localhost:{self.monitoring_port}"
+                     if self.enable_monitoring
+                     else None
+                 ),
+             },
+         }
+
+     async def _start_monitoring_dashboard(self) -> Dict[str, Any]:
+         """Start the monitoring dashboard if enabled."""
+         if not self.enable_monitoring:
+             return {"error": "Monitoring not enabled in configuration"}
+
+         try:
+             # Register this pool with the global metrics aggregator
+             if hasattr(self.runtime, "metrics_aggregator"):
+                 self.runtime.metrics_aggregator.register_collector(
+                     self.metrics_collector
+                 )
+
+             # Start monitoring dashboard if not already running
+             if not hasattr(self.runtime, "monitoring_dashboard"):
+                 from kailash.nodes.monitoring.connection_dashboard import (
+                     ConnectionDashboardNode,
+                 )
+
+                 dashboard = ConnectionDashboardNode(
+                     name="global_dashboard",
+                     port=self.monitoring_port,
+                     update_interval=1.0,
+                 )
+
+                 # Store dashboard in runtime for sharing
+                 self.runtime.monitoring_dashboard = dashboard
+                 await dashboard.start()
+
+                 return {
+                     "status": "started",
+                     "dashboard_url": f"http://localhost:{self.monitoring_port}",
+                 }
+             else:
+                 return {
+                     "status": "already_running",
+                     "dashboard_url": f"http://localhost:{self.monitoring_port}",
+                 }
+
+         except Exception as e:
+             logger.error(f"Failed to start monitoring dashboard: {e}")
+             return {"error": str(e)}
+
+     async def _stop_monitoring_dashboard(self) -> Dict[str, Any]:
+         """Stop the monitoring dashboard."""
+         try:
+             if hasattr(self.runtime, "monitoring_dashboard"):
+                 await self.runtime.monitoring_dashboard.stop()
+                 del self.runtime.monitoring_dashboard
+                 return {"status": "stopped"}
+             else:
+                 return {"status": "not_running"}
+         except Exception as e:
+             logger.error(f"Failed to stop monitoring dashboard: {e}")
+             return {"error": str(e)}
+
+     def _update_pool_metrics(self):
+         """Update pool metrics for monitoring."""
+         total = len(self.all_connections)
+         active = len(self.active_connections)
+         idle = self.available_connections.qsize()
+
+         # Update comprehensive metrics
+         self.metrics_collector.update_pool_stats(active, idle, total)
+
+         # Track health checks
+         for conn in self.all_connections.values():
+             self.metrics_collector.track_health_check(
+                 success=conn.health_score > self.health_threshold,
+                 duration_ms=5.0,  # Placeholder - real implementation would track actual time
+             )
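
The diff above defines the node's full operation surface: initialize, acquire, execute, release, stats, plus the monitoring and statistics operations dispatched in async_run. As a quick orientation, here is a minimal usage sketch; it is not part of the package diff, the connection settings are placeholder values, and it assumes a reachable PostgreSQL database.

import asyncio

from kailash.nodes.data.workflow_connection_pool import WorkflowConnectionPool


async def main():
    # Placeholder connection settings for illustration only.
    pool = WorkflowConnectionPool(
        name="example_pool",
        database_type="postgresql",
        host="localhost",
        database="myapp",
        user="dbuser",
        password="dbpass",
        min_connections=2,
        max_connections=10,
    )

    # Explicit initialization is optional; "acquire" initializes lazily.
    await pool.process({"operation": "initialize"})

    # Acquire a connection, run a query on it, then release it.
    acquired = await pool.process({"operation": "acquire"})
    conn_id = acquired["connection_id"]
    result = await pool.process({
        "operation": "execute",
        "connection_id": conn_id,
        "query": "SELECT 1",
        "fetch_mode": "one",
    })
    print(result["data"], result["execution_time_ms"])
    await pool.process({"operation": "release", "connection_id": conn_id})

    # Pool-level statistics, including acquisition timing and health scores.
    stats = await pool.process({"operation": "stats"})
    print(stats["current_state"])


asyncio.run(main())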