kailash 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +1 -1
- kailash/access_control/__init__.py +1 -1
- kailash/client/__init__.py +12 -0
- kailash/client/enhanced_client.py +306 -0
- kailash/core/actors/__init__.py +16 -0
- kailash/core/actors/adaptive_pool_controller.py +630 -0
- kailash/core/actors/connection_actor.py +566 -0
- kailash/core/actors/supervisor.py +364 -0
- kailash/core/ml/__init__.py +1 -0
- kailash/core/ml/query_patterns.py +544 -0
- kailash/core/monitoring/__init__.py +19 -0
- kailash/core/monitoring/connection_metrics.py +488 -0
- kailash/core/optimization/__init__.py +1 -0
- kailash/core/resilience/__init__.py +17 -0
- kailash/core/resilience/circuit_breaker.py +382 -0
- kailash/edge/__init__.py +16 -0
- kailash/edge/compliance.py +834 -0
- kailash/edge/discovery.py +659 -0
- kailash/edge/location.py +582 -0
- kailash/gateway/__init__.py +33 -0
- kailash/gateway/api.py +289 -0
- kailash/gateway/enhanced_gateway.py +357 -0
- kailash/gateway/resource_resolver.py +217 -0
- kailash/gateway/security.py +227 -0
- kailash/middleware/auth/access_control.py +6 -6
- kailash/middleware/auth/models.py +2 -2
- kailash/middleware/communication/ai_chat.py +7 -7
- kailash/middleware/communication/api_gateway.py +5 -15
- kailash/middleware/database/base_models.py +1 -7
- kailash/middleware/gateway/__init__.py +22 -0
- kailash/middleware/gateway/checkpoint_manager.py +398 -0
- kailash/middleware/gateway/deduplicator.py +382 -0
- kailash/middleware/gateway/durable_gateway.py +417 -0
- kailash/middleware/gateway/durable_request.py +498 -0
- kailash/middleware/gateway/event_store.py +499 -0
- kailash/middleware/mcp/enhanced_server.py +2 -2
- kailash/nodes/admin/permission_check.py +817 -33
- kailash/nodes/admin/role_management.py +1242 -108
- kailash/nodes/admin/schema_manager.py +438 -0
- kailash/nodes/admin/user_management.py +1124 -1582
- kailash/nodes/code/__init__.py +8 -1
- kailash/nodes/code/async_python.py +1035 -0
- kailash/nodes/code/python.py +1 -0
- kailash/nodes/data/async_sql.py +9 -3
- kailash/nodes/data/query_pipeline.py +641 -0
- kailash/nodes/data/query_router.py +895 -0
- kailash/nodes/data/sql.py +20 -11
- kailash/nodes/data/workflow_connection_pool.py +1071 -0
- kailash/nodes/monitoring/__init__.py +3 -5
- kailash/nodes/monitoring/connection_dashboard.py +822 -0
- kailash/nodes/rag/__init__.py +2 -7
- kailash/resources/__init__.py +40 -0
- kailash/resources/factory.py +533 -0
- kailash/resources/health.py +319 -0
- kailash/resources/reference.py +288 -0
- kailash/resources/registry.py +392 -0
- kailash/runtime/async_local.py +711 -302
- kailash/testing/__init__.py +34 -0
- kailash/testing/async_test_case.py +353 -0
- kailash/testing/async_utils.py +345 -0
- kailash/testing/fixtures.py +458 -0
- kailash/testing/mock_registry.py +495 -0
- kailash/workflow/__init__.py +8 -0
- kailash/workflow/async_builder.py +621 -0
- kailash/workflow/async_patterns.py +766 -0
- kailash/workflow/cyclic_runner.py +107 -16
- kailash/workflow/graph.py +7 -2
- kailash/workflow/resilience.py +11 -1
- {kailash-0.5.0.dist-info → kailash-0.6.1.dist-info}/METADATA +19 -4
- {kailash-0.5.0.dist-info → kailash-0.6.1.dist-info}/RECORD +74 -28
- {kailash-0.5.0.dist-info → kailash-0.6.1.dist-info}/WHEEL +0 -0
- {kailash-0.5.0.dist-info → kailash-0.6.1.dist-info}/entry_points.txt +0 -0
- {kailash-0.5.0.dist-info → kailash-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.5.0.dist-info → kailash-0.6.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1071 @@
|
|
1
|
+
"""Workflow-scoped connection pool for production-grade database management.
|
2
|
+
|
3
|
+
This module implements a connection pool that is scoped to workflow lifecycle,
|
4
|
+
providing better resource management and isolation compared to global pools.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import asyncio
|
8
|
+
import logging
|
9
|
+
import time
|
10
|
+
import uuid
|
11
|
+
from collections import defaultdict
|
12
|
+
from datetime import datetime
|
13
|
+
from typing import Any, Dict, List, Optional, Set
|
14
|
+
|
15
|
+
from kailash.core.actors import (
|
16
|
+
ActorConnection,
|
17
|
+
ActorSupervisor,
|
18
|
+
ConnectionActor,
|
19
|
+
ConnectionState,
|
20
|
+
SupervisionStrategy,
|
21
|
+
)
|
22
|
+
from kailash.core.actors.adaptive_pool_controller import AdaptivePoolController
|
23
|
+
from kailash.core.ml.query_patterns import QueryPatternTracker
|
24
|
+
from kailash.core.monitoring.connection_metrics import (
|
25
|
+
ConnectionMetricsCollector,
|
26
|
+
ErrorCategory,
|
27
|
+
)
|
28
|
+
from kailash.core.resilience.circuit_breaker import (
|
29
|
+
CircuitBreakerConfig,
|
30
|
+
CircuitBreakerError,
|
31
|
+
ConnectionCircuitBreaker,
|
32
|
+
)
|
33
|
+
from kailash.nodes.base import NodeParameter, register_node
|
34
|
+
from kailash.nodes.base_async import AsyncNode
|
35
|
+
from kailash.sdk_exceptions import NodeExecutionError
|
36
|
+
|
37
|
+
logger = logging.getLogger(__name__)
|
38
|
+
|
39
|
+
|
40
|
+
class ConnectionPoolMetrics:
    """In-memory counters and timing samples for a single connection pool.

    The counters are plain integers that callers increment directly; this
    class only owns the bookkeeping for acquisition wait-time samples and the
    aggregation performed by :meth:`get_stats`.
    """

    def __init__(self, pool_name: str):
        self.pool_name = pool_name

        # Connection lifecycle counters.
        self.connections_created = 0
        self.connections_recycled = 0
        self.connections_failed = 0

        # Query counters.
        self.queries_executed = 0
        self.query_errors = 0

        # Raw samples aggregated by get_stats().
        self.acquisition_wait_times: List[float] = []
        self.health_check_results: List[bool] = []

        self.start_time = time.time()

    def record_acquisition_time(self, wait_time: float):
        """Store one acquisition wait (seconds), keeping the newest 1000."""
        samples = self.acquisition_wait_times
        samples.append(wait_time)
        if len(samples) > 1000:
            # Rebind to a trimmed copy so only the most recent window is kept.
            self.acquisition_wait_times = samples[-1000:]

    def get_stats(self) -> Dict[str, Any]:
        """Return a nested dict summarising counters, timings and health.

        Wait times are stored in seconds and reported in milliseconds. With
        no samples the timing figures are zero; with no health checks the
        success rate is reported as 1.0.
        """
        elapsed = time.time() - self.start_time

        waits = self.acquisition_wait_times
        if waits:
            mean_wait = sum(waits) / len(waits)
            # int(len * 0.99) is always a valid index because it is < len.
            p99_ms = sorted(waits)[int(len(waits) * 0.99)] * 1000
        else:
            mean_wait = 0.0
            p99_ms = 0

        checks = self.health_check_results
        if checks:
            healthy_ratio = sum(1 for passed in checks if passed) / len(checks)
        else:
            healthy_ratio = 1.0

        if self.queries_executed > 0:
            error_ratio = self.query_errors / self.queries_executed
        else:
            error_ratio = 0

        connection_section = {
            "created": self.connections_created,
            "recycled": self.connections_recycled,
            "failed": self.connections_failed,
        }
        query_section = {
            "executed": self.queries_executed,
            "errors": self.query_errors,
            "error_rate": error_ratio,
        }
        performance_section = {
            "avg_acquisition_time_ms": mean_wait * 1000,
            "p99_acquisition_time_ms": p99_ms,
        }
        health_section = {
            "success_rate": healthy_ratio,
            "checks_performed": len(checks),
        }

        return {
            "pool_name": self.pool_name,
            "uptime_seconds": elapsed,
            "connections": connection_section,
            "queries": query_section,
            "performance": performance_section,
            "health": health_section,
        }
|
112
|
+
|
113
|
+
|
114
|
+
class WorkflowPatternAnalyzer:
    """Records per-workflow connection usage and estimates future demand.

    ``workflow_patterns`` maps a workflow id to a small summary dict
    (``type``, ``start_time``, ``connections_used``, ``peak_connections``);
    ``connection_usage`` keeps every raw sample reported for that workflow.
    """

    def __init__(self):
        self.workflow_patterns: Dict[str, Dict[str, Any]] = {}
        self.connection_usage: Dict[str, List[float]] = defaultdict(list)

    def record_workflow_start(self, workflow_id: str, workflow_type: str):
        """Register a workflow; replaces any existing entry for the same id."""
        self.workflow_patterns[workflow_id] = {
            "type": workflow_type,
            "start_time": time.time(),
            "connections_used": 0,
            "peak_connections": 0,
        }

    def record_connection_usage(self, workflow_id: str, active_connections: int):
        """Record a sample of concurrently active connections for a workflow.

        Samples for workflows that were never registered through
        :meth:`record_workflow_start` are ignored.
        """
        pattern = self.workflow_patterns.get(workflow_id)
        if pattern is None:
            return

        # "connections_used" has always held the running maximum; keep that
        # meaning so get_expected_connections() is unaffected.
        pattern["connections_used"] = max(
            pattern["connections_used"], active_connections
        )
        # Previously this field was initialised to 0 and never written again.
        pattern["peak_connections"] = max(
            pattern.get("peak_connections", 0), active_connections
        )
        self.connection_usage[workflow_id].append(active_connections)

    def get_expected_connections(self, workflow_type: str) -> int:
        """Estimate how many connections a workflow of this type will need.

        Returns the 90th-percentile ``connections_used`` across recorded
        workflows of the same type, or 2 when there is no history for it.
        """
        usage_values = sorted(
            pattern["connections_used"]
            for pattern in self.workflow_patterns.values()
            if pattern["type"] == workflow_type and "connections_used" in pattern
        )

        if not usage_values:
            return 2  # Default

        # int(len * 0.9) is strictly less than len, so the index is valid.
        return usage_values[int(len(usage_values) * 0.9)]
|
155
|
+
|
156
|
+
|
157
|
+
@register_node()
|
158
|
+
class WorkflowConnectionPool(AsyncNode):
|
159
|
+
"""
|
160
|
+
Workflow-scoped connection pool with production-grade features.
|
161
|
+
|
162
|
+
This node provides:
|
163
|
+
- Connections scoped to workflow lifecycle
|
164
|
+
- Actor-based isolation for each connection
|
165
|
+
- Automatic health monitoring and recycling
|
166
|
+
- Pattern-based pre-warming
|
167
|
+
- Comprehensive metrics and monitoring
|
168
|
+
|
169
|
+
Example:
|
170
|
+
>>> pool = WorkflowConnectionPool(
|
171
|
+
... name="workflow_db_pool",
|
172
|
+
... database_type="postgresql",
|
173
|
+
... host="localhost",
|
174
|
+
... database="myapp",
|
175
|
+
... user="dbuser",
|
176
|
+
... password="dbpass",
|
177
|
+
... min_connections=2,
|
178
|
+
... max_connections=10
|
179
|
+
... )
|
180
|
+
>>>
|
181
|
+
>>> # Get connection
|
182
|
+
>>> result = await pool.process({"operation": "acquire"})
|
183
|
+
>>> conn_id = result["connection_id"]
|
184
|
+
>>>
|
185
|
+
>>> # Execute query
|
186
|
+
>>> query_result = await pool.process({
|
187
|
+
... "operation": "execute",
|
188
|
+
... "connection_id": conn_id,
|
189
|
+
... "query": "SELECT * FROM users WHERE active = true",
|
190
|
+
... })
|
191
|
+
"""
|
192
|
+
|
193
|
+
def __init__(self, **config):
    """Build the pool's collaborators from keyword configuration.

    All options are read from ``config`` with the defaults shown below.
    This constructor does not call ``_create_connection``; connections are
    created later by ``_initialize`` / ``_acquire_connection``.
    """
    super().__init__(**config)

    option = config.get
    pool_name = self.metadata.name

    # --- Pool sizing and feature switches ---------------------------------
    self.min_connections = option("min_connections", 2)
    self.max_connections = option("max_connections", 10)
    self.health_threshold = option("health_threshold", 50)
    self.pre_warm_enabled = option("pre_warm", True)
    self.adaptive_sizing_enabled = option("adaptive_sizing", False)
    self.enable_query_routing = option("enable_query_routing", False)

    # --- Database settings handed to every ActorConnection ----------------
    self.db_config = {"type": option("database_type", "postgresql")}
    for key in (
        "host",
        "port",
        "database",
        "user",
        "password",
        "connection_string",
    ):
        self.db_config[key] = option(key)

    # --- Supervision of the connection actors -----------------------------
    self.supervisor = ActorSupervisor(
        name=f"{pool_name}_supervisor",
        strategy=SupervisionStrategy.ONE_FOR_ONE,
        max_restarts=3,
        restart_window=60.0,
    )

    # --- Connection bookkeeping -------------------------------------------
    # available: idle connections; active: checked out, keyed by id;
    # all: every connection tracked by this pool, keyed by id.
    self.available_connections: asyncio.Queue = asyncio.Queue()
    self.active_connections: Dict[str, ConnectionActor] = {}
    self.all_connections: Dict[str, ConnectionActor] = {}

    # --- Workflow integration ---------------------------------------------
    self.workflow_id: Optional[str] = None
    self.pattern_analyzer = WorkflowPatternAnalyzer()

    # --- Basic metrics -----------------------------------------------------
    self.metrics = ConnectionPoolMetrics(pool_name)

    # --- Lifecycle flags ---------------------------------------------------
    self._initialized = False
    self._closing = False

    # --- Phase 2: optional query tracking and adaptive sizing -------------
    self.query_pattern_tracker = (
        QueryPatternTracker() if self.enable_query_routing else None
    )
    self.adaptive_controller = (
        AdaptivePoolController(
            min_size=self.min_connections, max_size=self.max_connections
        )
        if self.adaptive_sizing_enabled
        else None
    )

    # --- Phase 3: circuit breaker around connection acquisition -----------
    self.circuit_breaker_config = CircuitBreakerConfig(
        failure_threshold=option("circuit_breaker_failure_threshold", 5),
        recovery_timeout=option("circuit_breaker_recovery_timeout", 60),
        error_rate_threshold=option("circuit_breaker_error_rate", 0.5),
    )
    self.circuit_breaker = ConnectionCircuitBreaker(self.circuit_breaker_config)

    # --- Phase 3: detailed metrics collector ------------------------------
    self.metrics_collector = ConnectionMetricsCollector(
        pool_name=pool_name,
        retention_minutes=option("metrics_retention_minutes", 60),
    )

    # --- Query pipelining options -----------------------------------------
    self.enable_pipelining = option("enable_pipelining", False)
    self.pipeline_batch_size = option("pipeline_batch_size", 100)

    # --- Monitoring dashboard options -------------------------------------
    self.enable_monitoring = option("enable_monitoring", False)
    self.monitoring_port = option("monitoring_port", 8080)
|
273
|
+
|
274
|
+
def get_parameters(self) -> Dict[str, NodeParameter]:
    """Describe the node's parameters, keyed by parameter name.

    Covers database connection settings, pool tuning, circuit-breaker /
    metrics / pipelining / monitoring options, and the per-call operation
    inputs (``operation``, ``connection_id``, ``query``, ``params``,
    ``fetch_mode``).
    """

    def spec(name, type_, description, required=False, **extra):
        # ``default`` is forwarded only when a caller supplies it, so
        # parameters declared without a default stay that way.
        return NodeParameter(
            name=name,
            type=type_,
            required=required,
            description=description,
            **extra,
        )

    declared = (
        # Database connection parameters
        spec(
            "database_type",
            str,
            "Database type: postgresql, mysql, or sqlite",
            required=True,
            default="postgresql",
        ),
        spec(
            "connection_string",
            str,
            "Full connection string (overrides individual params)",
        ),
        spec("host", str, "Database host"),
        spec("port", int, "Database port"),
        spec("database", str, "Database name"),
        spec("user", str, "Database user"),
        spec("password", str, "Database password"),
        # Pool configuration
        spec("min_connections", int, "Minimum pool connections", default=2),
        spec("max_connections", int, "Maximum pool connections", default=10),
        spec(
            "health_threshold",
            int,
            "Minimum health score to keep connection",
            default=50,
        ),
        spec("pre_warm", bool, "Enable pattern-based pre-warming", default=True),
        spec(
            "adaptive_sizing",
            bool,
            "Enable adaptive pool sizing based on workload",
            default=False,
        ),
        spec(
            "enable_query_routing",
            bool,
            "Enable query pattern tracking for routing optimization",
            default=False,
        ),
        # Phase 3 parameters
        spec(
            "circuit_breaker_failure_threshold",
            int,
            "Failures before circuit breaker opens",
            default=5,
        ),
        spec(
            "circuit_breaker_recovery_timeout",
            int,
            "Seconds before circuit breaker tries recovery",
            default=60,
        ),
        spec(
            "circuit_breaker_error_rate",
            float,
            "Error rate threshold to open circuit",
            default=0.5,
        ),
        spec(
            "metrics_retention_minutes",
            int,
            "How long to retain detailed metrics",
            default=60,
        ),
        spec(
            "enable_pipelining",
            bool,
            "Enable query pipelining for batch operations",
            default=False,
        ),
        spec(
            "pipeline_batch_size",
            int,
            "Maximum queries per pipeline batch",
            default=100,
        ),
        spec(
            "enable_monitoring",
            bool,
            "Enable monitoring dashboard",
            default=False,
        ),
        spec(
            "monitoring_port",
            int,
            "Port for monitoring dashboard",
            default=8080,
        ),
        # Operation parameters
        spec(
            "operation",
            str,
            "Operation: initialize, acquire, release, execute, stats",
            required=True,
        ),
        spec("connection_id", str, "Connection ID for operations"),
        spec("query", str, "SQL query to execute"),
        spec("params", Any, "Query parameters"),
        spec("fetch_mode", str, "Fetch mode: one, all, many", default="all"),
    )

    # The base class expects a name -> parameter mapping.
    by_name: Dict[str, NodeParameter] = {}
    for parameter in declared:
        by_name[parameter.name] = parameter
    return by_name
|
442
|
+
|
443
|
+
async def on_workflow_start(
    self, workflow_id: str, workflow_type: Optional[str] = None
):
    """Bind the pool to a workflow and optionally pre-warm connections.

    Args:
        workflow_id: Identifier of the workflow that is starting.
        workflow_type: Optional category used to look up historical usage;
            recorded as ``"unknown"`` when omitted. Pre-warming only happens
            when this is provided and ``pre_warm`` is enabled.
    """
    self.workflow_id = workflow_id

    # Take the estimate BEFORE registering this workflow. A freshly
    # registered workflow has connections_used == 0, so registering first
    # would add a zero sample to the history it is estimated from (and make
    # the analyzer's "no history" default unreachable from here).
    expected_connections: Optional[int] = None
    if self.pre_warm_enabled and workflow_type:
        expected_connections = self.pattern_analyzer.get_expected_connections(
            workflow_type
        )

    self.pattern_analyzer.record_workflow_start(
        workflow_id, workflow_type or "unknown"
    )

    if expected_connections is not None:
        await self._pre_warm_connections(expected_connections)
|
457
|
+
|
458
|
+
async def on_workflow_complete(self, workflow_id: str):
    """Release pool resources when the workflow bound to this pool finishes.

    Completion notices for any other workflow id are ignored.
    """
    if workflow_id != self.workflow_id:
        return
    await self._cleanup()
|
462
|
+
|
463
|
+
async def async_run(self, **inputs) -> Dict[str, Any]:
    """Dispatch one pool operation selected by ``inputs["operation"]``.

    Supported operations: initialize, acquire, release, execute, stats,
    get_status, adjust_pool_size, get_pool_statistics,
    get_comprehensive_status, start_monitoring, stop_monitoring and
    export_metrics.

    Raises:
        NodeExecutionError: If the operation name is not recognised.
    """
    operation = inputs.get("operation")

    # Core connection lifecycle.
    if operation == "initialize":
        return await self._initialize()
    if operation == "acquire":
        return await self._acquire_connection()
    if operation == "release":
        return await self._release_connection(inputs.get("connection_id"))
    if operation == "execute":
        return await self._execute_query(inputs)

    # Introspection and sizing.
    if operation == "stats":
        return await self._get_stats()
    if operation == "get_status":
        return await self._get_pool_status()
    if operation == "adjust_pool_size":
        return await self.adjust_pool_size(inputs.get("new_size"))
    if operation == "get_pool_statistics":
        return await self.get_pool_statistics()
    if operation == "get_comprehensive_status":
        return await self.get_comprehensive_status()

    # Monitoring.
    if operation == "start_monitoring":
        return await self._start_monitoring_dashboard()
    if operation == "stop_monitoring":
        return await self._stop_monitoring_dashboard()
    if operation == "export_metrics":
        return {"prometheus_metrics": self.metrics_collector.export_prometheus()}

    raise NodeExecutionError(f"Unknown operation: {operation}")
|
493
|
+
|
494
|
+
async def _initialize(self) -> Dict[str, Any]:
    """Start the supervisor and bring the pool up to its minimum size.

    Returns:
        ``{"status": "already_initialized"}`` when called again after a
        successful start, otherwise a summary of the effective settings.

    Raises:
        NodeExecutionError: If any start-up step fails.
    """
    if self._initialized:
        return {"status": "already_initialized"}

    try:
        supervisor = self.supervisor
        await supervisor.start()

        # Route actor failure/restart notifications back to this pool.
        supervisor.on_actor_failure = self._on_connection_failure
        supervisor.on_actor_restart = self._on_connection_restart

        await self._ensure_min_connections()

        # The adaptive controller exists only when adaptive sizing is on.
        controller = self.adaptive_controller
        if controller:
            await controller.start(
                pool_ref=self, pattern_tracker=self.query_pattern_tracker
            )

        self._initialized = True

        summary = {
            "status": "initialized",
            "min_connections": self.min_connections,
            "max_connections": self.max_connections,
            "adaptive_sizing": self.adaptive_sizing_enabled,
            "query_routing": self.enable_query_routing,
        }
        return summary

    except Exception as e:
        logger.error(f"Failed to initialize pool: {e}")
        raise NodeExecutionError(f"Pool initialization failed: {e}")
|
529
|
+
|
530
|
+
async def _acquire_connection(self) -> Dict[str, Any]:
    """Check a connection out of the pool, creating one if capacity allows.

    The pool is initialised lazily on first use. Acquisition runs through
    the circuit breaker and inside the metrics collector's acquisition
    tracker.

    Returns:
        Dict with ``connection_id``, ``health_score`` and
        ``acquisition_time_ms``.

    Raises:
        NodeExecutionError: If the circuit breaker is open or acquisition
            fails for any other reason.
    """
    if not self._initialized:
        await self._initialize()

    start_time = time.time()

    async def acquire_with_circuit_breaker():
        # Fast path: take an idle connection if one shows up within 100 ms.
        try:
            return await asyncio.wait_for(
                self.available_connections.get(), timeout=0.1
            )
        except asyncio.TimeoutError:
            if len(self.all_connections) < self.max_connections:
                # Below capacity: create a connection and hand it straight
                # to the caller without putting it on the idle queue.
                return await self._create_connection()
            # At capacity: block until another caller releases one.
            return await self.available_connections.get()

    try:
        # Wrap the real acquisition in the tracker. Previously the tracker
        # wrapped a bare ``pass`` after the connection had already been
        # obtained, so it timed an empty block.
        # NOTE(review): assumes track_acquisition() lets exceptions raised
        # inside the block propagate - confirm against the collector.
        with self.metrics_collector.track_acquisition():
            connection = await self.circuit_breaker.call(
                acquire_with_circuit_breaker
            )

        # Record acquisition time in the basic metrics as well.
        wait_time = time.time() - start_time
        self.metrics.record_acquisition_time(wait_time)

        # Move to active
        self.active_connections[connection.id] = connection

        # Feed the usage sample to the pattern analyzer.
        if self.workflow_id:
            self.pattern_analyzer.record_connection_usage(
                self.workflow_id, len(self.active_connections)
            )

        return {
            "connection_id": connection.id,
            "health_score": connection.health_score,
            "acquisition_time_ms": wait_time * 1000,
        }

    except CircuitBreakerError as e:
        # Circuit is open - pool is experiencing failures
        self.metrics_collector.track_pool_exhaustion()
        logger.error(f"Circuit breaker open: {e}")
        raise NodeExecutionError(
            f"Connection pool circuit breaker open: {e}"
        ) from e
    except Exception as e:
        logger.error(f"Failed to acquire connection: {e}")
        self.metrics_collector.track_query_error("ACQUIRE", e)
        raise NodeExecutionError(f"Connection acquisition failed: {e}") from e
|
595
|
+
|
596
|
+
async def _release_connection(self, connection_id: Optional[str]) -> Dict[str, Any]:
    """Return a checked-out connection, recycling it if its health is low.

    Returns:
        Dict with ``status`` (``"recycled"`` or ``"released"``) and the
        ``connection_id``.

    Raises:
        NodeExecutionError: If no id is given or the id is not checked out.
    """
    if not connection_id:
        raise NodeExecutionError("connection_id required for release")

    active = self.active_connections
    if connection_id not in active:
        raise NodeExecutionError(f"Connection {connection_id} not active")

    released = active.pop(connection_id)

    # Connections scoring below the threshold are recycled rather than
    # handed back to the idle queue.
    needs_recycle = released.health_score < self.health_threshold
    if needs_recycle:
        await self._recycle_connection(released)
        outcome = "recycled"
    else:
        await self.available_connections.put(released)
        outcome = "released"

    return {"status": outcome, "connection_id": connection_id}
|
614
|
+
|
615
|
+
async def _execute_query(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
    """Run a query on a connection that is currently checked out.

    Args:
        inputs: Operation inputs. Reads ``connection_id``, ``query``,
            ``params``, ``fetch_mode`` (default ``"all"``) and the optional
            ``query_fingerprint``.

    Returns:
        Dict with ``success``, ``data``, ``error``, ``execution_time_ms``
        and ``connection_id``.

    Raises:
        NodeExecutionError: If the connection id is missing or not active,
            or if executing the query raises.
    """
    connection_id = inputs.get("connection_id")
    if not connection_id or connection_id not in self.active_connections:
        raise NodeExecutionError(f"Invalid connection_id: {connection_id}")

    connection = self.active_connections[connection_id]

    # Classify the statement for metrics. ``query`` is an optional
    # parameter, so the key may be present with value None; treat that like
    # an empty string instead of failing with AttributeError on ``.strip()``.
    normalized = (inputs.get("query") or "").strip().upper()
    query_type = next(
        (
            keyword
            for keyword in ("SELECT", "INSERT", "UPDATE", "DELETE")
            if normalized.startswith(keyword)
        ),
        "UNKNOWN",
    )

    try:
        # Execute query with comprehensive metrics tracking
        with self.metrics_collector.track_query(query_type):
            result = await connection.execute(
                query=inputs.get("query"),
                params=inputs.get("params"),
                fetch_mode=inputs.get("fetch_mode", "all"),
            )

        # Update metrics
        self.metrics.queries_executed += 1
        if not result.success:
            self.metrics.query_errors += 1
            self.metrics_collector.track_query_error(
                query_type, Exception(result.error)
            )

        # Track query pattern if enabled
        if self.query_pattern_tracker and inputs.get("query"):
            self.query_pattern_tracker.record_execution(
                fingerprint=inputs.get("query_fingerprint", inputs.get("query")),
                execution_time_ms=result.execution_time * 1000,
                connection_id=connection_id,
                parameters=inputs.get("params", {}),
                success=result.success,
                result_size=len(result.data) if result.data else 0,
            )

        return {
            "success": result.success,
            "data": result.data,
            "error": result.error,
            "execution_time_ms": result.execution_time * 1000,
            "connection_id": connection_id,
        }

    except Exception as e:
        self.metrics.query_errors += 1
        logger.error(f"Query execution failed: {e}")
        raise NodeExecutionError(f"Query execution failed: {e}") from e
|
675
|
+
|
676
|
+
async def _get_stats(self) -> Dict[str, Any]:
    """Combine basic metrics, a live pool snapshot and supervisor stats.

    The result is the dict from ``self.metrics.get_stats()`` extended with
    ``current_state`` and ``supervisor`` entries.
    """
    report = self.metrics.get_stats()
    supervisor_report = self.supervisor.get_stats()

    # Snapshot of the pool as it stands right now.
    snapshot: Dict[str, Any] = {}
    snapshot["total_connections"] = len(self.all_connections)
    snapshot["active_connections"] = len(self.active_connections)
    snapshot["available_connections"] = self.available_connections.qsize()

    scores = {}
    for conn_id, conn in self.all_connections.items():
        scores[conn_id] = conn.health_score
    snapshot["health_scores"] = scores

    report["current_state"] = snapshot
    report["supervisor"] = supervisor_report
    return report
|
695
|
+
|
696
|
+
async def _create_connection(self) -> ConnectionActor:
    """Create a supervised connection actor and register it with the pool.

    The connection is added to ``all_connections`` but is NOT placed on the
    idle queue; the caller decides whether to queue it or use it directly.
    """
    # Short random id: "conn_" plus the first 8 hex digits of a UUID4.
    new_id = f"conn_{uuid.uuid4().hex[:8]}"

    low_level = ActorConnection(
        connection_id=new_id,
        db_config=self.db_config,
        health_check_interval=30.0,
        max_lifetime=3600.0,
        max_idle_time=600.0,
    )
    self.supervisor.add_actor(low_level)

    # Wrap the actor in the high-level interface used by the pool.
    wrapped = ConnectionActor(low_level)

    self.all_connections[new_id] = wrapped
    self.metrics.connections_created += 1

    logger.info(f"Created connection {new_id} for pool {self.metadata.name}")
    return wrapped
|
722
|
+
|
723
|
+
async def _ensure_min_connections(self):
    """Create and queue connections until the pool holds ``min_connections``.

    Does nothing when the pool already tracks at least that many.
    """
    shortfall = self.min_connections - len(self.all_connections)

    # range() of a non-positive number is empty, so no explicit guard needed.
    for _ in range(shortfall):
        fresh = await self._create_connection()
        await self.available_connections.put(fresh)
|
730
|
+
|
731
|
+
async def _pre_warm_connections(self, target_count: int):
    """Grow the pool toward ``target_count`` without exceeding the maximum.

    Newly created connections are placed on the idle queue.
    """
    existing = len(self.all_connections)
    wanted = target_count - existing
    headroom = self.max_connections - existing
    to_create = min(wanted, headroom)

    if to_create <= 0:
        return

    logger.info(
        f"Pre-warming {to_create} connections for pool {self.metadata.name}"
    )

    # Create the connections concurrently, then queue each one.
    pending = [self._create_connection() for _ in range(to_create)]
    warmed = await asyncio.gather(*pending)

    for conn in warmed:
        await self.available_connections.put(conn)
|
750
|
+
|
751
|
+
async def _recycle_connection(self, connection: ConnectionActor):
    """Retire ``connection`` and refill the pool to its minimum size.

    The connection is dropped from ``self.all_connections`` (if present)
    before ``connection.recycle()`` is awaited. The ``connections_recycled``
    metric is then incremented and ``_ensure_min_connections`` is awaited.

    Args:
        connection: The pooled connection to recycle.
    """
    message = (
        f"Recycling connection {connection.id} (health: {connection.health_score})"
    )
    logger.info(message)

    # Stop tracking first; a missing entry is not an error.
    self.all_connections.pop(connection.id, None)

    await connection.recycle()
    self.metrics.connections_recycled += 1

    # Recycling may have taken the pool below its configured minimum.
    await self._ensure_min_connections()
|
769
|
+
|
770
|
+
async def _cleanup(self):
    """Shut the pool down and drop all connection bookkeeping.

    The first call sets ``self._closing``, clears ``self._initialized`` and
    stops the adaptive controller when one is set. It then stops every
    tracked connection and the supervisor, replaces the available queue with
    a fresh one and clears the active/all connection maps.

    Errors raised while stopping a connection or the supervisor are logged
    as warnings and do not abort the remaining steps. Later calls return
    immediately because ``self._closing`` stays set.
    """
    if self._closing:
        # A cleanup has already been started for this pool.
        return
    self._closing = True

    logger.info(f"Cleaning up pool {self.metadata.name}")

    # NOTE(review): presumably other code paths check this flag before
    # handing out connections -- confirm against the acquire path.
    self._initialized = False

    controller = self.adaptive_controller
    if controller:
        await controller.stop()

    # Iterate over a snapshot so the map can change while actors stop.
    for tracked in list(self.all_connections.values()):
        try:
            await tracked.stop()
        except Exception as stop_error:
            logger.warning(f"Error stopping actor {tracked.id}: {stop_error}")

    try:
        await self.supervisor.stop()
    except Exception as stop_error:
        logger.warning(f"Error stopping supervisor: {stop_error}")

    # Reset bookkeeping: a brand-new queue plus emptied maps.
    self.available_connections = asyncio.Queue()
    self.active_connections.clear()
    self.all_connections.clear()

    logger.info(f"Pool {self.metadata.name} cleaned up")
|
805
|
+
|
806
|
+
def _on_connection_failure(self, actor_id: str, error: Exception):
    """Record a failed connection and stop tracking it.

    Logs the failure, increments the ``connections_failed`` metric and
    removes ``actor_id`` from both ``self.all_connections`` and
    ``self.active_connections`` when present.

    Args:
        actor_id: Identifier of the connection that failed.
        error: The exception reported for the failure.
    """
    logger.error(f"Connection {actor_id} failed: {error}")
    self.metrics.connections_failed += 1

    # The same removal applies to both registries, in this order.
    for registry in (self.all_connections, self.active_connections):
        if actor_id in registry:
            del registry[actor_id]
|
816
|
+
|
817
|
+
def _on_connection_restart(self, actor_id: str, restart_count: int):
    """Log that a connection was restarted; no pool state is changed.

    Args:
        actor_id: Identifier of the restarted connection.
        restart_count: Restart count reported alongside the event.
    """
    message = f"Connection {actor_id} restarted (count: {restart_count})"
    logger.info(message)
|
820
|
+
|
821
|
+
async def process(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
    """Run the node with ``inputs`` expanded into keyword arguments.

    Thin adapter that forwards to ``self.async_run``.

    Args:
        inputs: Mapping passed to ``async_run`` as ``**inputs``.

    Returns:
        Whatever ``async_run`` returns.
    """
    outcome = await self.async_run(**inputs)
    return outcome
|
824
|
+
|
825
|
+
async def __aenter__(self):
    """Enter ``async with``: await ``self._initialize()`` and return the pool.

    Returns:
        This same object, so it can be bound with ``as``.
    """
    pool = self
    await pool._initialize()
    return pool
|
829
|
+
|
830
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Leave ``async with``: await ``self._cleanup()``.

    The exception details are ignored and nothing is returned, so an
    exception raised inside the ``async with`` body keeps propagating.

    Args:
        exc_type: Exception class raised in the body, or ``None``.
        exc_val: Exception instance raised in the body, or ``None``.
        exc_tb: Traceback of that exception, or ``None``.
    """
    del exc_type, exc_val, exc_tb  # deliberately unused
    await self._cleanup()
|
833
|
+
|
834
|
+
async def _get_pool_status(self) -> Dict[str, Any]:
    """Summarise the pool and each tracked connection.

    Returns:
        A dict with ``connections`` (one entry per tracked connection id),
        ``pool_size``, ``active_count`` and ``available_count``.

        Each connection entry carries its ``health_score`` and
        ``active_queries`` (1 when the id is in ``self.active_connections``,
        else 0). ``capabilities``, ``avg_latency_ms`` and ``last_used`` are
        placeholders: ``last_used`` is the time of this call, not the time
        the connection was last used.
    """

    def describe(conn_id, conn):
        return {
            "health_score": conn.health_score,
            "active_queries": 1 if conn_id in self.active_connections else 0,
            "capabilities": [
                "read",
                "write",
            ],  # TODO: Add actual capability detection
            "avg_latency_ms": 0.0,  # TODO: Track actual latency
            "last_used": datetime.now().isoformat(),
        }

    per_connection = {
        conn_id: describe(conn_id, conn)
        for conn_id, conn in self.all_connections.items()
    }

    return {
        "connections": per_connection,
        "pool_size": len(self.all_connections),
        "active_count": len(self.active_connections),
        "available_count": self.available_connections.qsize(),
    }
|
856
|
+
|
857
|
+
async def adjust_pool_size(self, new_size: int) -> Dict[str, Any]:
    """Grow or shrink the pool towards ``new_size`` tracked connections.

    Scaling up creates the missing connections and puts each one on
    ``self.available_connections``, so it can actually be handed out (the
    same as the other creation paths in this class). A failed creation is
    logged and skipped.

    Scaling down recycles idle connections only. Connections that are not
    sitting in the available queue are left alone, so the pool may remain
    larger than requested.

    Args:
        new_size: Requested total number of connections. Must lie between
            ``self.min_connections`` and ``self.max_connections`` inclusive.

    Returns:
        ``{"success": False, "reason": ...}`` when ``new_size`` is out of
        range. Otherwise ``{"success": True, "previous_size": ...,
        "new_size": ...}``, where ``new_size`` is the number of connections
        actually tracked after the adjustment.
    """
    if new_size < self.min_connections or new_size > self.max_connections:
        return {
            "success": False,
            "reason": f"Size must be between {self.min_connections} and {self.max_connections}",
        }

    current_size = len(self.all_connections)

    if new_size > current_size:
        # Scale up
        for _ in range(new_size - current_size):
            try:
                connection = await self._create_connection()
            except Exception as e:
                logger.error(f"Failed to create connection during scale up: {e}")
                continue
            # _create_connection only tracks the connection; without this
            # put it would count towards the pool size yet never be usable.
            await self.available_connections.put(connection)

    elif new_size < current_size:
        # Scale down - only idle connections are eligible for removal
        connections_to_remove = current_size - new_size
        removed = 0

        while removed < connections_to_remove:
            try:
                # Non-blocking take: avoids the gap between an empty()
                # check and a timed get() in which another task could
                # grab the connection.
                idle = self.available_connections.get_nowait()
            except asyncio.QueueEmpty:
                break
            await self._recycle_connection(idle)
            removed += 1

    return {
        "success": True,
        "previous_size": current_size,
        "new_size": len(self.all_connections),
    }
|
900
|
+
|
901
|
+
async def get_pool_statistics(self) -> Dict[str, Any]:
    """Collect pool statistics used for adaptive sizing.

    Returns:
        A dict combining connection counts, utilisation (active / total, or
        0 for an empty pool), the mean connection health score (100 for an
        empty pool), timing figures from ``self.metrics.get_stats()``, the
        circuit-breaker status and the metrics collector's data.

        ``queue_depth`` (0) and ``avg_query_time_ms`` (50.0) are hard-coded
        placeholders, not measurements.
    """
    total = len(self.all_connections)
    busy = len(self.active_connections)
    idle = self.available_connections.qsize()

    if total > 0:
        utilization = busy / total
    else:
        utilization = 0

    scores = [member.health_score for member in self.all_connections.values()]
    mean_health = sum(scores) / len(scores) if scores else 100

    # Queue depth (approximate based on waiters)
    waiting = 0  # TODO: Track actual queue depth

    stats = self.metrics.get_stats()
    acquisition_ms = stats["performance"]["avg_acquisition_time_ms"]

    # Guard against division by zero right after start-up.
    if stats["uptime_seconds"] > 0:
        throughput = stats["queries"]["executed"] / stats["uptime_seconds"]
    else:
        throughput = 0

    # Phase 3 additions
    breaker_status = self.circuit_breaker.get_status()
    collector_metrics = self.metrics_collector.get_all_metrics()
    error_rate = self.metrics_collector.get_error_summary()["error_rate"]

    return {
        "total_connections": total,
        "active_connections": busy,
        "idle_connections": idle,
        "queue_depth": waiting,
        "utilization_rate": utilization,
        "avg_health_score": mean_health,
        "avg_acquisition_time_ms": acquisition_ms,
        "avg_query_time_ms": 50.0,  # TODO: Track actual query time
        "queries_per_second": throughput,
        "circuit_breaker_status": breaker_status,
        "comprehensive_metrics": collector_metrics,
        "error_rate": error_rate,
        "health_score": mean_health,
        "pool_name": self.metadata.name,
    }
|
945
|
+
|
946
|
+
async def get_comprehensive_status(self) -> Dict[str, Any]:
    """Return the pool statistics extended with the Phase 3 sections.

    Starts from ``get_pool_statistics()`` and adds:

    * ``circuit_breaker``: state, metrics and time until recovery.
    * ``detailed_metrics``: counters, gauges, histograms, errors and the
      query summary from the metrics collector.
    * ``pattern_insights``: filled only when ``self.query_pattern_tracker``
      is truthy, otherwise ``{}``.
    * ``adaptive_control``: filled only when ``self.adaptive_controller``
      is truthy, otherwise ``{}``.
    * ``monitoring``: whether the dashboard is enabled, and its URL
      (``None`` when disabled).

    Returns:
        The merged dict. The added sections override any equally named key
        from the base statistics.
    """
    status = dict(await self.get_pool_statistics())

    breaker = self.circuit_breaker.get_status()
    collected = self.metrics_collector.get_all_metrics()

    # Pattern-learning insights are optional.
    insights = {}
    tracker = self.query_pattern_tracker
    if tracker:
        known_patterns = tracker.get_all_patterns()
        insights = {
            "detected_patterns": len(known_patterns),
            "workload_forecast": tracker.get_workload_forecast(15),
        }

    # Adaptive sizing details are optional as well.
    adaptive = {}
    controller = self.adaptive_controller
    if controller:
        adaptive = {
            "current_size": len(self.all_connections),
            "recommended_size": controller.get_recommended_size(),
            "last_adjustment": controller.get_last_adjustment(),
        }

    breaker_section = {
        "state": breaker["state"],
        "metrics": breaker["metrics"],
        "time_until_recovery": breaker.get("time_until_recovery"),
    }
    metrics_section = {
        "counters": collected["counters"],
        "gauges": collected["gauges"],
        "histograms": collected["histograms"],
        "errors": collected["errors"],
        "query_summary": collected["queries"],
    }

    monitoring_enabled = self.enable_monitoring
    if self.enable_monitoring:
        dashboard_url = f"http://localhost:{self.monitoring_port}"
    else:
        dashboard_url = None

    status.update(
        {
            "circuit_breaker": breaker_section,
            "detailed_metrics": metrics_section,
            "pattern_insights": insights,
            "adaptive_control": adaptive,
            "monitoring": {
                "dashboard_enabled": monitoring_enabled,
                "dashboard_url": dashboard_url,
            },
        }
    )
    return status
|
1001
|
+
|
1002
|
+
async def _start_monitoring_dashboard(self) -> Dict[str, Any]:
    """Start the shared monitoring dashboard if monitoring is enabled.

    When ``self.runtime`` exposes a ``metrics_aggregator``, this pool's
    metrics collector is registered with it. The dashboard is stored on
    ``self.runtime.monitoring_dashboard``; the presence of that attribute is
    what marks the dashboard as running.

    If starting the dashboard fails, the attribute is removed again.
    Otherwise a dashboard that never came up would make every later call
    report ``already_running``.

    Returns:
        ``{"status": "started" | "already_running", "dashboard_url": ...}``
        on success, or ``{"error": <message>}`` when monitoring is disabled
        or any step raised an exception.
    """
    if not self.enable_monitoring:
        return {"error": "Monitoring not enabled in configuration"}

    try:
        # Register this pool with the global metrics aggregator
        if hasattr(self.runtime, "metrics_aggregator"):
            self.runtime.metrics_aggregator.register_collector(
                self.metrics_collector
            )

        # A dashboard stored on the runtime means one is already running.
        if hasattr(self.runtime, "monitoring_dashboard"):
            return {
                "status": "already_running",
                "dashboard_url": f"http://localhost:{self.monitoring_port}",
            }

        from kailash.nodes.monitoring.connection_dashboard import (
            ConnectionDashboardNode,
        )

        dashboard = ConnectionDashboardNode(
            name="global_dashboard",
            port=self.monitoring_port,
            update_interval=1.0,
        )

        # Store the dashboard before starting it so that it is visible on
        # the runtime while start() is in progress.
        self.runtime.monitoring_dashboard = dashboard
        try:
            await dashboard.start()
        except BaseException:
            # Roll back so that a later call can retry the start.
            if getattr(self.runtime, "monitoring_dashboard", None) is dashboard:
                del self.runtime.monitoring_dashboard
            raise

        return {
            "status": "started",
            "dashboard_url": f"http://localhost:{self.monitoring_port}",
        }

    except Exception as e:
        logger.error(f"Failed to start monitoring dashboard: {e}")
        return {"error": str(e)}
|
1043
|
+
|
1044
|
+
async def _stop_monitoring_dashboard(self) -> Dict[str, Any]:
    """Stop the dashboard stored on ``self.runtime``, if there is one.

    Returns:
        ``{"status": "stopped"}`` after the dashboard was stopped and its
        attribute removed from the runtime, ``{"status": "not_running"}``
        when the runtime has no ``monitoring_dashboard`` attribute, or
        ``{"error": <message>}`` when stopping raised an exception.
    """
    try:
        if not hasattr(self.runtime, "monitoring_dashboard"):
            return {"status": "not_running"}

        await self.runtime.monitoring_dashboard.stop()
        # Removing the attribute is what marks the dashboard as stopped.
        del self.runtime.monitoring_dashboard
        return {"status": "stopped"}
    except Exception as e:
        logger.error(f"Failed to stop monitoring dashboard: {e}")
        return {"error": str(e)}
|
1056
|
+
|
1057
|
+
def _update_pool_metrics(self):
    """Push current pool counts and per-connection health to the collector.

    Reports (active, idle, total) connection counts through
    ``update_pool_stats``. It then records one health-check result per
    tracked connection, which passes when the connection's ``health_score``
    is strictly above ``self.health_threshold``.
    """
    collector = self.metrics_collector

    total_count = len(self.all_connections)
    active_count = len(self.active_connections)
    idle_count = self.available_connections.qsize()
    collector.update_pool_stats(active_count, idle_count, total_count)

    for tracked in self.all_connections.values():
        is_healthy = tracked.health_score > self.health_threshold
        self.metrics_collector.track_health_check(
            success=is_healthy,
            duration_ms=5.0,  # Placeholder - real implementation would track actual time
        )
|