holmesgpt 0.13.2__py3-none-any.whl → 0.18.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- holmes/__init__.py +3 -5
- holmes/clients/robusta_client.py +20 -6
- holmes/common/env_vars.py +58 -3
- holmes/common/openshift.py +1 -1
- holmes/config.py +123 -148
- holmes/core/conversations.py +71 -15
- holmes/core/feedback.py +191 -0
- holmes/core/investigation.py +31 -39
- holmes/core/investigation_structured_output.py +3 -3
- holmes/core/issue.py +1 -1
- holmes/core/llm.py +508 -88
- holmes/core/models.py +108 -4
- holmes/core/openai_formatting.py +14 -1
- holmes/core/prompt.py +48 -3
- holmes/core/runbooks.py +1 -0
- holmes/core/safeguards.py +8 -6
- holmes/core/supabase_dal.py +295 -100
- holmes/core/tool_calling_llm.py +489 -428
- holmes/core/tools.py +325 -56
- holmes/core/tools_utils/token_counting.py +21 -0
- holmes/core/tools_utils/tool_context_window_limiter.py +40 -0
- holmes/core/tools_utils/tool_executor.py +0 -13
- holmes/core/tools_utils/toolset_utils.py +1 -0
- holmes/core/toolset_manager.py +191 -5
- holmes/core/tracing.py +19 -3
- holmes/core/transformers/__init__.py +23 -0
- holmes/core/transformers/base.py +63 -0
- holmes/core/transformers/llm_summarize.py +175 -0
- holmes/core/transformers/registry.py +123 -0
- holmes/core/transformers/transformer.py +32 -0
- holmes/core/truncation/compaction.py +94 -0
- holmes/core/truncation/dal_truncation_utils.py +23 -0
- holmes/core/truncation/input_context_window_limiter.py +219 -0
- holmes/interactive.py +228 -31
- holmes/main.py +23 -40
- holmes/plugins/interfaces.py +2 -1
- holmes/plugins/prompts/__init__.py +2 -1
- holmes/plugins/prompts/_fetch_logs.jinja2 +31 -6
- holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
- holmes/plugins/prompts/_runbook_instructions.jinja2 +24 -12
- holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
- holmes/plugins/prompts/conversation_history_compaction.jinja2 +89 -0
- holmes/plugins/prompts/generic_ask.jinja2 +0 -4
- holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -1
- holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -1
- holmes/plugins/prompts/generic_investigation.jinja2 +0 -1
- holmes/plugins/prompts/investigation_procedure.jinja2 +50 -1
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -1
- holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -1
- holmes/plugins/runbooks/__init__.py +145 -17
- holmes/plugins/runbooks/catalog.json +2 -0
- holmes/plugins/sources/github/__init__.py +4 -2
- holmes/plugins/sources/prometheus/models.py +1 -0
- holmes/plugins/toolsets/__init__.py +44 -27
- holmes/plugins/toolsets/aks-node-health.yaml +46 -0
- holmes/plugins/toolsets/aks.yaml +64 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +38 -47
- holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
- holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
- holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
- holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
- holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -13
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +15 -12
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +15 -12
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +11 -11
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +11 -9
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +15 -12
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +15 -15
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +11 -8
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +11 -8
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +11 -8
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +11 -8
- holmes/plugins/toolsets/azure_sql/utils.py +0 -32
- holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
- holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
- holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
- holmes/plugins/toolsets/bash/bash_toolset.py +11 -15
- holmes/plugins/toolsets/bash/common/bash.py +23 -13
- holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
- holmes/plugins/toolsets/bash/common/stringify.py +1 -1
- holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
- holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
- holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
- holmes/plugins/toolsets/bash/parse_command.py +12 -13
- holmes/plugins/toolsets/cilium.yaml +284 -0
- holmes/plugins/toolsets/connectivity_check.py +124 -0
- holmes/plugins/toolsets/coralogix/api.py +132 -119
- holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
- holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
- holmes/plugins/toolsets/coralogix/utils.py +15 -79
- holmes/plugins/toolsets/datadog/datadog_api.py +525 -26
- holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +55 -11
- holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
- holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
- holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
- holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +417 -241
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +234 -214
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +167 -79
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +374 -363
- holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
- holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
- holmes/plugins/toolsets/elasticsearch/opensearch_ppl_query_docs.jinja2 +1616 -0
- holmes/plugins/toolsets/elasticsearch/opensearch_query_assist.py +78 -0
- holmes/plugins/toolsets/elasticsearch/opensearch_query_assist_instructions.jinja2 +223 -0
- holmes/plugins/toolsets/git.py +54 -50
- holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
- holmes/plugins/toolsets/grafana/common.py +13 -29
- holmes/plugins/toolsets/grafana/grafana_tempo_api.py +455 -0
- holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +25 -0
- holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +191 -0
- holmes/plugins/toolsets/grafana/loki_api.py +4 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +293 -89
- holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +49 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +820 -292
- holmes/plugins/toolsets/grafana/trace_parser.py +4 -3
- holmes/plugins/toolsets/internet/internet.py +15 -16
- holmes/plugins/toolsets/internet/notion.py +9 -11
- holmes/plugins/toolsets/investigator/core_investigation.py +44 -36
- holmes/plugins/toolsets/investigator/model.py +3 -1
- holmes/plugins/toolsets/json_filter_mixin.py +134 -0
- holmes/plugins/toolsets/kafka.py +36 -42
- holmes/plugins/toolsets/kubernetes.yaml +317 -113
- holmes/plugins/toolsets/kubernetes_logs.py +9 -9
- holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
- holmes/plugins/toolsets/logging_utils/logging_api.py +94 -8
- holmes/plugins/toolsets/mcp/toolset_mcp.py +218 -64
- holmes/plugins/toolsets/newrelic/new_relic_api.py +165 -0
- holmes/plugins/toolsets/newrelic/newrelic.jinja2 +65 -0
- holmes/plugins/toolsets/newrelic/newrelic.py +320 -0
- holmes/plugins/toolsets/openshift.yaml +283 -0
- holmes/plugins/toolsets/prometheus/prometheus.py +1202 -421
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +54 -5
- holmes/plugins/toolsets/prometheus/utils.py +28 -0
- holmes/plugins/toolsets/rabbitmq/api.py +23 -4
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +13 -14
- holmes/plugins/toolsets/robusta/robusta.py +239 -68
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +157 -27
- holmes/plugins/toolsets/service_discovery.py +1 -1
- holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
- holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
- holmes/plugins/toolsets/utils.py +88 -0
- holmes/utils/config_utils.py +91 -0
- holmes/utils/connection_utils.py +31 -0
- holmes/utils/console/result.py +10 -0
- holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
- holmes/utils/env.py +7 -0
- holmes/utils/file_utils.py +2 -1
- holmes/utils/global_instructions.py +60 -11
- holmes/utils/holmes_status.py +6 -4
- holmes/utils/holmes_sync_toolsets.py +0 -2
- holmes/utils/krr_utils.py +188 -0
- holmes/utils/log.py +15 -0
- holmes/utils/markdown_utils.py +2 -3
- holmes/utils/memory_limit.py +58 -0
- holmes/utils/sentry_helper.py +64 -0
- holmes/utils/stream.py +69 -8
- holmes/utils/tags.py +4 -3
- holmes/version.py +37 -15
- holmesgpt-0.18.4.dist-info/LICENSE +178 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +35 -31
- holmesgpt-0.18.4.dist-info/RECORD +258 -0
- holmes/core/performance_timing.py +0 -72
- holmes/plugins/toolsets/aws.yaml +0 -80
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -112
- holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -739
- holmes/plugins/toolsets/grafana/grafana_api.py +0 -42
- holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
- holmes/plugins/toolsets/newrelic.py +0 -231
- holmes/plugins/toolsets/opensearch/opensearch.py +0 -257
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -218
- holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
- holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
- holmes/plugins/toolsets/servicenow/install.md +0 -37
- holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
- holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
- holmes/utils/keygen_utils.py +0 -6
- holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
- holmesgpt-0.13.2.dist-info/RECORD +0 -234
- /holmes/plugins/toolsets/{opensearch → newrelic}/__init__.py +0 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
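The detailed diff that follows covers the rewritten Prometheus toolset (holmes/plugins/toolsets/prometheus/prometheus.py, +1202 -421). Its new PrometheusConfig renames several options (e.g. prometheus_ssl_enabled becomes verify_ssl) while still accepting the old names: extra keys are allowed and a model validator copies them onto the renamed fields with a deprecation warning. The snippet below is a minimal standalone sketch of that pattern, assuming pydantic v2; PrometheusConfigSketch and DEPRECATED_MAPPINGS are illustrative stand-ins, not the actual holmesgpt code.

# Minimal sketch (assumed names) of the deprecated-config-name migration shown in the diff below.
import logging
from typing import Optional

from pydantic import BaseModel, ConfigDict, model_validator

# old name -> new name (a subset of the mappings listed in the prometheus.py diff)
DEPRECATED_MAPPINGS = {
    "prometheus_ssl_enabled": "verify_ssl",
    "metrics_labels_time_window_hrs": "discover_metrics_from_last_hours",
}


class PrometheusConfigSketch(BaseModel):
    # Unknown keys are accepted and kept on model_extra instead of raising an error.
    model_config = ConfigDict(extra="allow")

    prometheus_url: Optional[str] = None
    verify_ssl: bool = True
    discover_metrics_from_last_hours: int = 1

    @model_validator(mode="after")
    def _migrate_deprecated_names(self):
        # Copy any deprecated keys onto their renamed fields and warn the user.
        extra = self.model_extra or {}
        for old_name, new_name in DEPRECATED_MAPPINGS.items():
            if old_name in extra:
                setattr(self, new_name, extra[old_name])
                logging.warning("%s is deprecated; use %s", old_name, new_name)
        return self


# Old-style keys still work but are migrated onto the new field names:
cfg = PrometheusConfigSketch(prometheus_url="http://prom:9090/", prometheus_ssl_enabled=False)
assert cfg.verify_ssl is False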
holmes/plugins/toolsets/prometheus/prometheus.py

@@ -1,27 +1,41 @@
 import json
 import logging
 import os
-import re
 import time
-import
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, Optional, Tuple, Type, Union
 from urllib.parse import urljoin

+import dateutil.parser
 import requests  # type: ignore
-from
-from requests import RequestException
+from prometrix.auth import PrometheusAuthorization
 from prometrix.connect.aws_connect import AWSPrometheusConnect
+from prometrix.models.prometheus_config import (
+    AzurePrometheusConfig as PrometrixAzureConfig,
+)
 from prometrix.models.prometheus_config import PrometheusConfig as BasePrometheusConfig
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from requests import RequestException
+from requests.exceptions import SSLError  # type: ignore
+
+from holmes.common.env_vars import IS_OPENSHIFT, MAX_GRAPH_POINTS
+from holmes.common.openshift import load_openshift_token
 from holmes.core.tools import (
     CallablePrerequisite,
     StructuredToolResult,
+    StructuredToolResultStatus,
     Tool,
+    ToolInvokeContext,
     ToolParameter,
-    ToolResultStatus,
     Toolset,
     ToolsetTag,
 )
+from holmes.core.tools_utils.token_counting import count_tool_response_tokens
+from holmes.core.tools_utils.tool_context_window_limiter import get_pct_token_count
 from holmes.plugins.toolsets.consts import STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION
+from holmes.plugins.toolsets.logging_utils.logging_api import (
+    DEFAULT_GRAPH_TIME_SPAN_SECONDS,
+)
+from holmes.plugins.toolsets.prometheus.utils import parse_duration_to_seconds
 from holmes.plugins.toolsets.service_discovery import PrometheusDiscovery
 from holmes.plugins.toolsets.utils import (
     get_param_or_raise,
@@ -30,31 +44,76 @@ from holmes.plugins.toolsets.utils import (
     toolset_name_for_one_liner,
 )
 from holmes.utils.cache import TTLCache
-from holmes.common.env_vars import IS_OPENSHIFT, MAX_GRAPH_POINTS
-from holmes.common.openshift import load_openshift_token
-from holmes.plugins.toolsets.logging_utils.logging_api import (
-    DEFAULT_GRAPH_TIME_SPAN_SECONDS,
-)
-from holmes.utils.keygen_utils import generate_random_key

 PROMETHEUS_RULES_CACHE_KEY = "cached_prometheus_rules"
+PROMETHEUS_METADATA_API_LIMIT = 100  # Default limit for Prometheus metadata APIs (series, labels, metadata) to prevent overwhelming responses
+# Default timeout values for PromQL queries
+DEFAULT_QUERY_TIMEOUT_SECONDS = 20
+MAX_QUERY_TIMEOUT_SECONDS = 180
+# Default timeout for metadata API calls (discovery endpoints)
+DEFAULT_METADATA_TIMEOUT_SECONDS = 20
+MAX_METADATA_TIMEOUT_SECONDS = 60
+# Default time window for metadata APIs (in hours)
+DEFAULT_METADATA_TIME_WINDOW_HRS = 1
+
+
+def format_ssl_error_message(prometheus_url: str, error: SSLError) -> str:
+    """Format a clear SSL error message with remediation steps."""
+    return (
+        f"SSL certificate verification failed when connecting to Prometheus at {prometheus_url}. "
+        f"Error: {str(error)}. "
+        f"To disable SSL verification, set 'verify_ssl: false' in your configuration. "
+        f"For Helm deployments, add this to your values.yaml:\n"
+        f"  toolsets:\n"
+        f"    prometheus/metrics:\n"
+        f"      config:\n"
+        f"        verify_ssl: false"
+    )


 class PrometheusConfig(BaseModel):
+    """Prometheus toolset configuration.
+
+    Deprecated config names (still accepted but not in schema):
+    - default_metadata_time_window_hrs -> discover_metrics_from_last_hours
+    - default_query_timeout_seconds -> query_timeout_seconds_default
+    - max_query_timeout_seconds -> query_timeout_seconds_hard_max
+    - default_metadata_timeout_seconds -> metadata_timeout_seconds_default
+    - max_metadata_timeout_seconds -> metadata_timeout_seconds_hard_max
+    - metrics_labels_time_window_hrs -> discover_metrics_from_last_hours
+    - prometheus_ssl_enabled -> verify_ssl
+    - metrics_labels_cache_duration_hrs (no longer used)
+    - fetch_labels_with_labels_api (no longer used)
+    - fetch_metadata_with_series_api (no longer used)
+    """
+
+    model_config = ConfigDict(extra="allow")
+
     # URL is optional because it can be set with an env var
-    prometheus_url: Optional[str]
-
-    #
-
-
-
-
-
+    prometheus_url: Optional[str] = None
+
+    # Discovery API time window - only return metrics with data in the last N hours
+    discover_metrics_from_last_hours: int = DEFAULT_METADATA_TIME_WINDOW_HRS
+
+    # Query timeout configuration
+    query_timeout_seconds_default: int = DEFAULT_QUERY_TIMEOUT_SECONDS
+    query_timeout_seconds_hard_max: int = MAX_QUERY_TIMEOUT_SECONDS
+
+    # Metadata API timeout configuration
+    metadata_timeout_seconds_default: int = DEFAULT_METADATA_TIMEOUT_SECONDS
+    metadata_timeout_seconds_hard_max: int = MAX_METADATA_TIMEOUT_SECONDS
+
     tool_calls_return_data: bool = True
     headers: Dict = Field(default_factory=dict)
-    rules_cache_duration_seconds:
+    rules_cache_duration_seconds: Optional[int] = 1800  # 30 minutes
     additional_labels: Optional[Dict[str, str]] = None
-
+    verify_ssl: bool = True
+
+    # Custom limit to the max number of tokens that a query result can take to proactively
+    # prevent token limit issues. Expressed in % of the model's context window.
+    # This limit only overrides the global limit for all tools (TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_PCT)
+    # if it is lower.
+    query_response_size_limit_pct: Optional[int] = None

     @field_validator("prometheus_url")
     def ensure_trailing_slash(cls, v: Optional[str]) -> Optional[str]:
@@ -64,11 +123,52 @@ class PrometheusConfig(BaseModel):

     @model_validator(mode="after")
     def validate_prom_config(self):
+        # Handle deprecated config names passed as extra fields
+        # These are accepted via extra="allow" but not defined in schema
+        extra = self.model_extra or {}
+        deprecated_with_replacement = []
+
+        # Map of old names -> new names
+        deprecated_mappings = {
+            "default_metadata_time_window_hrs": "discover_metrics_from_last_hours",
+            "default_query_timeout_seconds": "query_timeout_seconds_default",
+            "max_query_timeout_seconds": "query_timeout_seconds_hard_max",
+            "default_metadata_timeout_seconds": "metadata_timeout_seconds_default",
+            "max_metadata_timeout_seconds": "metadata_timeout_seconds_hard_max",
+            "metrics_labels_time_window_hrs": "discover_metrics_from_last_hours",
+            "prometheus_ssl_enabled": "verify_ssl",
+        }
+
+        for old_name, new_name in deprecated_mappings.items():
+            if old_name in extra:
+                setattr(self, new_name, extra[old_name])
+                deprecated_with_replacement.append(f"{old_name} -> {new_name}")
+
+        if deprecated_with_replacement:
+            logging.warning(
+                f"Prometheus config uses deprecated names. Please update: "
+                f"{', '.join(deprecated_with_replacement)}"
+            )
+
+        # Check for deprecated config values that no longer have any effect
+        deprecated_no_effect = [
+            name
+            for name in [
+                "metrics_labels_cache_duration_hrs",
+                "fetch_labels_with_labels_api",
+                "fetch_metadata_with_series_api",
+            ]
+            if name in extra
+        ]
+
+        if deprecated_no_effect:
+            logging.warning(
+                f"The following Prometheus config values are deprecated and have no effect: "
+                f"{', '.join(deprecated_no_effect)}"
+            )
+
         # If openshift is enabled, and the user didn't configure auth headers, we will try to load the token from the service account.
         if IS_OPENSHIFT:
-            if self.healthcheck == "-/healthy":
-                self.healthcheck = "api/v1/query?query=up"
-
             if self.headers.get("Authorization"):
                 return self

@@ -88,8 +188,7 @@ class AMPConfig(PrometheusConfig):
     aws_secret_access_key: Optional[str] = None
     aws_region: str
     aws_service_name: str = "aps"
-
-    prometheus_ssl_enabled: bool = False
+    verify_ssl: bool = False
     assume_role_arn: Optional[str] = None

     # Refresh the AWS client (and its STS creds) every N seconds (default: 15 minutes)
@@ -113,7 +212,7 @@ class AMPConfig(PrometheusConfig):
         try:
             base_config = BasePrometheusConfig(
                 url=self.prometheus_url,
-                disable_ssl=not self.
+                disable_ssl=not self.verify_ssl,
                 additional_labels=self.additional_labels,
             )
             self._aws_client = AWSPrometheusConnect(
@@ -132,12 +231,155 @@ class AMPConfig(PrometheusConfig):
         return self._aws_client


+class AzurePrometheusConfig(PrometheusConfig):
+    azure_resource: Optional[str] = None
+    azure_metadata_endpoint: Optional[str] = None
+    azure_token_endpoint: Optional[str] = None
+    azure_use_managed_id: bool = False
+    azure_client_id: Optional[str] = None
+    azure_client_secret: Optional[str] = None
+    azure_tenant_id: Optional[str] = None
+    verify_ssl: bool = True
+
+    # Refresh the Azure bearer token every N seconds (default: 15 minutes)
+    refresh_interval_seconds: int = 900
+
+    _prometrix_config: Optional[PrometrixAzureConfig] = None
+    _token_created_at: float = 0.0
+
+    @staticmethod
+    def _load_from_env_or_default(
+        config_value: Optional[str], env_var: str, default: Optional[str] = None
+    ) -> Optional[str]:
+        """Load value from config, environment variable, or use default."""
+        if config_value:
+            return config_value
+        return os.environ.get(env_var, default)
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        # Load from environment variables if not provided in config
+        self.azure_client_id = self._load_from_env_or_default(
+            self.azure_client_id, "AZURE_CLIENT_ID"
+        )
+        self.azure_tenant_id = self._load_from_env_or_default(
+            self.azure_tenant_id, "AZURE_TENANT_ID"
+        )
+        self.azure_client_secret = self._load_from_env_or_default(
+            self.azure_client_secret, "AZURE_CLIENT_SECRET"
+        )
+
+        # Set defaults from environment if not provided
+        self.azure_resource = self._load_from_env_or_default(
+            self.azure_resource,
+            "AZURE_RESOURCE",
+            "https://prometheus.monitor.azure.com",
+        )
+        # from https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/how-to-use-vm-token
+        self.azure_metadata_endpoint = self._load_from_env_or_default(
+            self.azure_metadata_endpoint,
+            "AZURE_METADATA_ENDPOINT",
+            "http://169.254.169.254/metadata/identity/oauth2/token",
+        )
+        self.azure_token_endpoint = self._load_from_env_or_default(
+            self.azure_token_endpoint, "AZURE_TOKEN_ENDPOINT"
+        )
+        if not self.azure_token_endpoint and self.azure_tenant_id:
+            self.azure_token_endpoint = (
+                f"https://login.microsoftonline.com/{self.azure_tenant_id}/oauth2/token"
+            )
+
+        # Check if managed identity should be used
+        if not self.azure_use_managed_id:
+            self.azure_use_managed_id = os.environ.get(
+                "AZURE_USE_MANAGED_ID", "false"
+            ).lower() in ("true", "1")
+
+        # Convert None to empty string for prometrix compatibility (prometrix checks != "")
+        azure_client_id = self.azure_client_id or ""
+        azure_tenant_id = self.azure_tenant_id or ""
+        azure_client_secret = self.azure_client_secret or ""
+        azure_resource = self.azure_resource or ""
+        azure_metadata_endpoint = self.azure_metadata_endpoint or ""
+        azure_token_endpoint = self.azure_token_endpoint or ""
+
+        # Create prometrix Azure config
+        self._prometrix_config = PrometrixAzureConfig(
+            url=self.prometheus_url,
+            azure_resource=azure_resource,
+            azure_metadata_endpoint=azure_metadata_endpoint,
+            azure_token_endpoint=azure_token_endpoint,
+            azure_use_managed_id=self.azure_use_managed_id,
+            azure_client_id=azure_client_id,
+            azure_client_secret=azure_client_secret,
+            azure_tenant_id=azure_tenant_id,
+            disable_ssl=not self.verify_ssl,
+            additional_labels=self.additional_labels,
+        )
+        # Ensure promtrix gets a real bool (not string) for managed identity
+        # fixing internal prometrix config issue
+        object.__setattr__(
+            self._prometrix_config,
+            "azure_use_managed_id",
+            bool(self.azure_use_managed_id),
+        )
+
+        PrometheusAuthorization.azure_authorization(self._prometrix_config)
+
+    @staticmethod
+    def is_azure_config(config: dict[str, Any]) -> bool:
+        """Check if config dict or environment variables indicate Azure Prometheus config."""
+        # Check for explicit Azure fields in config
+        if (
+            "azure_client_id" in config
+            or "azure_tenant_id" in config
+            or "azure_use_managed_id" in config
+        ):
+            return True
+
+        # Check for Azure environment variables
+        if os.environ.get("AZURE_CLIENT_ID") or os.environ.get("AZURE_TENANT_ID"):
+            return True
+
+        return False
+
+    def is_amp(self) -> bool:
+        return False
+
+    def _should_refresh_token(self) -> bool:
+        if not PrometheusAuthorization.bearer_token:
+            return True
+        return (time.time() - self._token_created_at) >= self.refresh_interval_seconds
+
+    def request_new_token(self) -> bool:
+        """Request a new Azure access token using prometrix."""
+        success = PrometheusAuthorization.request_new_token(self._prometrix_config)
+        if success:
+            self._token_created_at = time.time()
+        return success
+
+    def get_authorization_headers(self) -> Dict[str, str]:
+        # Request new token if needed
+        if self._should_refresh_token():
+            if not self.request_new_token():
+                logging.error("Failed to request new Azure access token")
+                return {}
+            self._token_created_at = time.time()
+
+        headers = PrometheusAuthorization.get_authorization_headers(
+            self._prometrix_config
+        )
+        if not headers.get("Authorization"):
+            logging.warning("No authorization header generated for Azure Prometheus")
+        return headers
+
+
 class BasePrometheusTool(Tool):
     toolset: "PrometheusToolset"


 def do_request(
-    config,  # PrometheusConfig | AMPConfig
+    config,  # PrometheusConfig | AMPConfig | AzurePrometheusConfig
     url: str,
     params: Optional[Dict] = None,
     data: Optional[Dict] = None,
@@ -149,17 +391,20 @@ def do_request(
     """
     Route a request through either:
     - AWSPrometheusConnect (SigV4) when config is AMPConfig
+    - Azure bearer token auth when config is AzurePrometheusConfig
     - plain requests otherwise

     method defaults to GET so callers can omit it for reads.
     """
     if verify is None:
-        verify = config.
+        verify = config.verify_ssl
     if headers is None:
         headers = config.headers or {}

     if isinstance(config, AMPConfig):
         client = config.get_aws_client()  # cached AWSPrometheusConnect
+        # Note: timeout parameter is not supported by prometrix's signed_request
+        # AWS/AMP requests will not respect the timeout setting
         return client.signed_request(  # type: ignore
             method=method,
             url=url,
@@ -169,7 +414,21 @@ def do_request(
             headers=headers,
         )

-
+    if isinstance(config, AzurePrometheusConfig):
+        # Merge Azure authorization headers with provided headers
+        azure_headers = config.get_authorization_headers()
+        headers = {**azure_headers, **headers}
+        return requests.request(
+            method=method,
+            url=url,
+            headers=headers,
+            params=params,
+            data=data,
+            timeout=timeout,
+            verify=verify,
+        )
+
+    # Non-AMP, Non-Azure: plain HTTP
     return requests.request(
         method=method,
         url=url,
@@ -181,99 +440,6 @@ def do_request(
     )


-def filter_metrics_by_type(metrics: Dict, expected_type: str):
-    return {
-        metric_name: metric_data
-        for metric_name, metric_data in metrics.items()
-        if expected_type in metric_data.get("type", "")
-        or metric_data.get("type", "") == "?"
-    }
-
-
-def filter_metrics_by_name(metrics: Dict, pattern: str) -> Dict:
-    regex = re.compile(pattern)
-    return {
-        metric_name: metric_data
-        for metric_name, metric_data in metrics.items()
-        if regex.search(metric_name)
-    }
-
-
-METRICS_SUFFIXES_TO_STRIP = ["_bucket", "_count", "_sum"]
-
-
-def fetch_metadata(
-    prometheus_url: str,
-    headers: Optional[Dict],
-    config,
-    verify_ssl: bool = True,
-) -> Dict:
-    metadata_url = urljoin(prometheus_url, "api/v1/metadata")
-    metadata_response = do_request(
-        config=config,
-        url=metadata_url,
-        headers=headers,
-        timeout=60,
-        verify=verify_ssl,
-        method="GET",
-    )
-    metadata_response.raise_for_status()
-
-    metadata = metadata_response.json()["data"]
-
-    metrics = {}
-    for metric_name, meta_list in metadata.items():
-        if meta_list:
-            metric_type = meta_list[0].get("type", "unknown")
-            metric_description = meta_list[0].get("help", "unknown")
-            metrics[metric_name] = {
-                "type": metric_type,
-                "description": metric_description,
-                "labels": set(),
-            }
-
-    return metrics
-
-
-def fetch_metadata_with_series_api(
-    prometheus_url: str,
-    metric_name: str,
-    headers: Dict,
-    config,
-    verify_ssl: bool = True,
-) -> Dict:
-    url = urljoin(prometheus_url, "api/v1/series")
-    params: Dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}
-
-    response = do_request(
-        config=config,
-        url=url,
-        headers=headers,
-        params=params,
-        timeout=60,
-        verify=verify_ssl,
-        method="GET",
-    )
-    response.raise_for_status()
-    metrics = response.json()["data"]
-
-    metadata: Dict = {}
-    for metric_data in metrics:
-        metric_name = metric_data.get("__name__")
-        if not metric_name:
-            continue
-
-        metric = metadata.get(metric_name)
-        if not metric:
-            metric = {"description": "?", "type": "?", "labels": set()}
-            metadata[metric_name] = metric
-
-        labels = {k for k in metric_data.keys() if k != "__name__"}
-        metric["labels"].update(labels)
-
-    return metadata
-
-
 def result_has_data(result: Dict) -> bool:
     data = result.get("data", {})
     if len(data.get("result", [])) > 0:
@@ -284,33 +450,58 @@ def result_has_data(result: Dict) -> bool:
 def adjust_step_for_max_points(
     start_timestamp: str,
     end_timestamp: str,
-    step: float,
+    step: Optional[float] = None,
+    max_points_override: Optional[float] = None,
 ) -> float:
     """
     Adjusts the step parameter to ensure the number of data points doesn't exceed max_points.
-    Max points is controlled by the PROMETHEUS_MAX_GRAPH_POINTS environment variable (default: 300).

     Args:
         start_timestamp: RFC3339 formatted start time
         end_timestamp: RFC3339 formatted end time
-        step: The requested step duration in seconds
+        step: The requested step duration in seconds (None for auto-calculation)
+        max_points_override: Optional override for max points (must be <= MAX_GRAPH_POINTS)

     Returns:
         Adjusted step value in seconds that ensures points <= max_points
     """
+    # Use override if provided and valid, otherwise use default
+    max_points = MAX_GRAPH_POINTS
+    if max_points_override is not None:
+        if max_points_override > MAX_GRAPH_POINTS:
+            logging.warning(
+                f"max_points override ({max_points_override}) exceeds system limit ({MAX_GRAPH_POINTS}), using {MAX_GRAPH_POINTS}"
+            )
+            max_points = MAX_GRAPH_POINTS
+        elif max_points_override < 1:
+            logging.warning(
+                f"max_points override ({max_points_override}) is invalid, using default {MAX_GRAPH_POINTS}"
+            )
+            max_points = MAX_GRAPH_POINTS
+        else:
+            max_points = max_points_override
+            logging.debug(f"Using max_points override: {max_points}")

     start_dt = dateutil.parser.parse(start_timestamp)
     end_dt = dateutil.parser.parse(end_timestamp)

     time_range_seconds = (end_dt - start_dt).total_seconds()

+    # If no step provided, calculate a reasonable default
+    # Aim for ~60 data points across the time range (1 per minute for hourly, etc)
+    if step is None:
+        step = max(1, time_range_seconds / 60)
+        logging.debug(
+            f"No step provided, defaulting to {step}s for {time_range_seconds}s range"
+        )
+
     current_points = time_range_seconds / step

     # If current points exceed max, adjust the step
-    if current_points >
-        adjusted_step = time_range_seconds /
+    if current_points > max_points:
+        adjusted_step = time_range_seconds / max_points
         logging.info(
-            f"Adjusting step from {step}s to {adjusted_step}s to limit points from {current_points:.0f} to {
+            f"Adjusting step from {step}s to {adjusted_step}s to limit points from {current_points:.0f} to {max_points}"
         )
         return adjusted_step

@@ -324,185 +515,149 @@ def add_prometheus_auth(prometheus_auth_header: Optional[str]) -> Dict[str, Any]
     return results


-def
-
-
-
-
-    metric_name: str,
-    config=None,
-    verify_ssl: bool = True,
-) -> dict:
-    """This is a slow query. Takes 5+ seconds to run"""
-    cache_key = f"metrics_labels_series_api:{metric_name}"
-    if cache:
-        cached_result = cache.get(cache_key)
-        if cached_result:
-            return cached_result
-
-    series_url = urljoin(prometheus_url, "api/v1/series")
-    params: dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}
-
-    if metrics_labels_time_window_hrs is not None:
-        params["end"] = int(time.time())
-        params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
-
-    series_response = do_request(
-        config=config,
-        url=series_url,
-        headers=headers,
-        params=params,
-        timeout=60,
-        verify=verify_ssl,
-        method="GET",
-    )
-    series_response.raise_for_status()
-    series = series_response.json()["data"]
-
-    metrics_labels: dict = {}
-    for serie in series:
-        metric_name = serie["__name__"]
-        # Add all labels except __name__
-        labels = {k for k in serie.keys() if k != "__name__"}
-        if metric_name in metrics_labels:
-            metrics_labels[metric_name].update(labels)
-        else:
-            metrics_labels[metric_name] = labels
-    if cache:
-        cache.set(cache_key, metrics_labels)
-
-    return metrics_labels
-
-
-def fetch_metrics_labels_with_labels_api(
-    prometheus_url: str,
-    cache: Optional[TTLCache],
-    metrics_labels_time_window_hrs: Union[int, None],
-    metric_names: List[str],
-    headers: Dict,
-    config=None,
-    verify_ssl: bool = True,
-) -> dict:
-    metrics_labels = {}
-
-    for metric_name in metric_names:
-        cache_key = f"metrics_labels_labels_api:{metric_name}"
-        if cache:
-            cached_result = cache.get(cache_key)
-            if cached_result:
-                metrics_labels[metric_name] = cached_result
-
-        url = urljoin(prometheus_url, "api/v1/labels")
-        params: dict = {
-            "match[]": f'{{__name__="{metric_name}"}}',
-        }
-        if metrics_labels_time_window_hrs is not None:
-            params["end"] = int(time.time())
-            params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
+def create_data_summary_for_large_result(
+    result_data: Dict, query: str, data_size_tokens: int, is_range_query: bool = False
+) -> Dict[str, Any]:
+    """
+    Create a summary for large Prometheus results instead of returning full data.

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-
-
-
-
-        prometheus_url=prometheus_url,
-        metric_name=metric_name,
-        headers=headers,
-        config=config,
-        verify_ssl=verify_ssl,
+    Args:
+        result_data: The Prometheus data result
+        query: The original PromQL query
+        data_size_tokens: Size of the data in tokens
+        is_range_query: Whether this is a range query (vs instant query)
+
+    Returns:
+        Dictionary with summary information and suggestions
+    """
+    if is_range_query:
+        series_list = result_data.get("result", [])
+        num_items = len(series_list)
+
+        # Calculate exact total data points across all series
+        total_points = 0
+        for series in series_list:  # Iterate through ALL series for exact count
+            points = len(series.get("values", []))
+            total_points += points
+
+        # Analyze label keys and their cardinality
+        label_cardinality: Dict[str, set] = {}
+        for series in series_list:
+            metric = series.get("metric", {})
+            for label_key, label_value in metric.items():
+                if label_key not in label_cardinality:
+                    label_cardinality[label_key] = set()
+                label_cardinality[label_key].add(label_value)
+
+        # Convert sets to counts for the summary
+        label_summary = {
+            label: len(values) for label, values in label_cardinality.items()
+        }
+        # Sort by cardinality (highest first) for better insights
+        label_summary = dict(
+            sorted(label_summary.items(), key=lambda x: x[1], reverse=True)
         )
-
+
+        return {
+            "message": f"Data too large to return ({data_size_tokens:,} tokens). Query returned {num_items} time series with {total_points:,} total data points.",
+            "series_count": num_items,
+            "total_data_points": total_points,
+            "data_size_tokens": data_size_tokens,
+            "label_cardinality": label_summary,
+            "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results to the top {min(5, num_items)} series. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "pod", "other", "", "")',
+        }
     else:
-
-
-
-
-
+        # Instant query
+        result_type = result_data.get("resultType", "")
+        result_list = result_data.get("result", [])
+        num_items = len(result_list)
+
+        # Analyze label keys and their cardinality
+        instant_label_cardinality: Dict[str, set] = {}
+        for item in result_list:
+            if isinstance(item, dict):
+                metric = item.get("metric", {})
+                for label_key, label_value in metric.items():
+                    if label_key not in instant_label_cardinality:
+                        instant_label_cardinality[label_key] = set()
+                    instant_label_cardinality[label_key].add(label_value)
+
+        # Convert sets to counts for the summary
+        label_summary = {
+            label: len(values) for label, values in instant_label_cardinality.items()
+        }
+        # Sort by cardinality (highest first) for better insights
+        label_summary = dict(
+            sorted(label_summary.items(), key=lambda x: x[1], reverse=True)
        )
-    metrics = filter_metrics_by_name(metrics, metric_name)

-
-
-
-
-
-
-
-
-            headers=headers,
-            config=config,
-            verify_ssl=verify_ssl,
-        )
-    else:
-        metrics_labels = fetch_metrics_labels_with_series_api(
-            prometheus_url=prometheus_url,
-            cache=cache,
-            metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
-            metric_name=metric_name,
-            headers=headers,
-            config=config,
-            verify_ssl=verify_ssl,
-        )
+        return {
+            "message": f"Data too large to return ({data_size_tokens:,} tokens). Query returned {num_items} results.",
+            "result_count": num_items,
+            "result_type": result_type,
+            "data_size_tokens": data_size_tokens,
+            "label_cardinality": label_summary,
+            "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "instance", "other", "", "")',
+        }

-    for metric_name in metrics:
-        if metric_name in metrics_labels:
-            metrics[metric_name]["labels"] = metrics_labels[metric_name]

-
+class MetricsBasedResponse(BaseModel):
+    status: str
+    error_message: Optional[str] = None
+    data: Optional[str] = None
+    tool_name: str
+    description: str
+    query: str
+    start: Optional[str] = None
+    end: Optional[str] = None
+    step: Optional[float] = None
+    output_type: Optional[str] = None
+    data_summary: Optional[dict[str, Any]] = None
+
+
+def create_structured_tool_result(
+    params: dict, response: MetricsBasedResponse
+) -> StructuredToolResult:
+    status = StructuredToolResultStatus.SUCCESS
+    error = None
+    if response.error_message or response.status.lower() in ("failed", "error"):
+        status = StructuredToolResultStatus.ERROR
+        error = (
+            response.error_message
+            if response.error_message
+            else "Unknown Prometheus error"
+        )
+    elif not response.data:
+        status = StructuredToolResultStatus.NO_DATA
+
+    return StructuredToolResult(
+        status=status,
+        data=response,
+        params=params,
+        error=error,
+    )


 class ListPrometheusRules(BasePrometheusTool):
     def __init__(self, toolset: "PrometheusToolset"):
         super().__init__(
             name="list_prometheus_rules",
-            description="List all defined
+            description="List all defined Prometheus rules (api/v1/rules). Will show the Prometheus rules description, expression and annotations",
             parameters={},
             toolset=toolset,
         )
         self._cache = None

-    def _invoke(
-        self, params: dict, user_approved: bool = False
-    ) -> StructuredToolResult:
+    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
         if self.toolset.config.is_amp():
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Tool not supported in AMP",
                 params=params,
             )
@@ -515,7 +670,7 @@ class ListPrometheusRules(BasePrometheusTool):
                 logging.debug("rules returned from cache")

                 return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=cached_rules,
                     params=params,
                 )
@@ -528,8 +683,8 @@ class ListPrometheusRules(BasePrometheusTool):
                 config=self.toolset.config,
                 url=rules_url,
                 params=params,
-                timeout=
-                verify=self.toolset.config.
+                timeout=40,
+                verify=self.toolset.config.verify_ssl,
                 headers=self.toolset.config.headers,
                 method="GET",
             )
@@ -539,28 +694,35 @@ class ListPrometheusRules(BasePrometheusTool):
             if self._cache:
                 self._cache.set(PROMETHEUS_RULES_CACHE_KEY, data)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.SUCCESS,
                 data=data,
                 params=params,
             )
         except requests.Timeout:
             logging.warning("Timeout while fetching prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Request timed out while fetching rules",
                 params=params,
             )
+        except SSLError as e:
+            logging.warning("SSL error while fetching prometheus rules", exc_info=True)
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error=format_ssl_error_message(self.toolset.config.prometheus_url, e),
+                params=params,
+            )
         except RequestException as e:
             logging.warning("Failed to fetch prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Network error while fetching rules: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.warning("Failed to process prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error: {str(e)}",
                 params=params,
             )
@@ -569,120 +731,553 @@ class ListPrometheusRules(BasePrometheusTool):
         return f"{toolset_name_for_one_liner(self.toolset.name)}: Fetch Rules"


-class
+class GetMetricNames(BasePrometheusTool):
+    """Thin wrapper around /api/v1/label/__name__/values - the fastest way to discover metric names"""
+
     def __init__(self, toolset: "PrometheusToolset"):
         super().__init__(
-            name="
-            description=
+            name="get_metric_names",
+            description=(
+                "Get list of metric names using /api/v1/label/__name__/values. "
+                "FASTEST method for metric discovery when you need to explore available metrics. "
+                f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} unique metric names (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use a more specific filter. "
+                f"ALWAYS use match[] parameter to filter metrics - without it you'll get random {PROMETHEUS_METADATA_API_LIMIT} metrics which is rarely useful. "
+                "Note: Does not return metric metadata (type, description, labels). "
+                "By default returns metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
+            ),
             parameters={
-                "
-                    description=
+                "match": ToolParameter(
+                    description=(
+                        "REQUIRED: PromQL selector to filter metrics. Use regex OR (|) to check multiple patterns in one call - much faster than multiple calls! Examples: "
+                        "'{__name__=~\"node_cpu.*|node_memory.*|node_disk.*\"}' for all node resource metrics, "
+                        "'{__name__=~\"container_cpu.*|container_memory.*|container_network.*\"}' for all container metrics, "
+                        "'{__name__=~\"kube_pod.*|kube_deployment.*|kube_service.*\"}' for multiple Kubernetes object metrics, "
+                        "'{__name__=~\".*cpu.*|.*memory.*|.*disk.*\"}' for all resource metrics, "
+                        "'{namespace=~\"kube-system|default|monitoring\"}' for metrics from multiple namespaces, "
+                        "'{job=~\"prometheus|node-exporter|kube-state-metrics\"}' for metrics from multiple jobs."
+                    ),
+                    type="string",
+                    required=True,
+                ),
+                "start": ToolParameter(
+                    description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
                     type="string",
                     required=False,
                 ),
-                "
-                    description="
+                "end": ToolParameter(
+                    description="End timestamp (RFC3339 or Unix). Default: now",
                     type="string",
-                    required=
+                    required=False,
                 ),
             },
             toolset=toolset,
         )
-        self._cache = None

-    def _invoke(
-        self, params: dict, user_approved: bool = False
-    ) -> StructuredToolResult:
+    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
-        if not self._cache and self.toolset.config.metrics_labels_cache_duration_hrs:
-            self._cache = TTLCache(
-                self.toolset.config.metrics_labels_cache_duration_hrs * 3600  # type: ignore
-            )
         try:
-
-
-
+            match_param = params.get("match")
+            if not match_param:
+                return StructuredToolResult(
+                    status=StructuredToolResultStatus.ERROR,
+                    error="Match parameter is required to filter metrics",
+                    params=params,
+                )
+
+            url = urljoin(
+                self.toolset.config.prometheus_url, "api/v1/label/__name__/values"
             )
+            query_params = {
+                "limit": str(PROMETHEUS_METADATA_API_LIMIT),
+                "match[]": match_param,
+            }
+
+            # Add time parameters - use provided values or defaults
+            if params.get("end"):
+                query_params["end"] = params["end"]
+            else:
+                query_params["end"] = str(int(time.time()))
+
+            if params.get("start"):
+                query_params["start"] = params["start"]
+            elif self.toolset.config.discover_metrics_from_last_hours:
+                # Use default time window
+                query_params["start"] = str(
+                    int(time.time())
+                    - (self.toolset.config.discover_metrics_from_last_hours * 3600)
+                )
+
+            response = do_request(
+                config=self.toolset.config,
+                url=url,
+                params=query_params,
+                timeout=self.toolset.config.metadata_timeout_seconds_default,
+                verify=self.toolset.config.verify_ssl,
+                headers=self.toolset.config.headers,
+                method="GET",
+            )
+            response.raise_for_status()
+            data = response.json()
+
+            # Check if results were truncated
+            if (
+                "data" in data
+                and isinstance(data["data"], list)
+                and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+            ):
+                data["_truncated"] = True
+                data["_message"] = (
+                    f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use a more specific match filter to see additional metrics."
+                )
+
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.SUCCESS,
+                data=data,
+                params=params,
+            )
+        except Exception as e:
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error=str(e),
+                params=params,
+            )
+
+    def get_parameterized_one_liner(self, params) -> str:
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Metric Names"
+
+
+class GetLabelValues(BasePrometheusTool):
+    """Get values for a specific label across all metrics"""

-
-
+    def __init__(self, toolset: "PrometheusToolset"):
+        super().__init__(
+            name="get_label_values",
+            description=(
+                "Get all values for a specific label using /api/v1/label/{label}/values. "
+                "Use this to discover pods, namespaces, jobs, instances, etc. "
+                f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} unique values (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use match[] to filter. "
+                "Supports optional match[] parameter to filter. "
+                "By default returns values from metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
+            ),
+            parameters={
+                "label": ToolParameter(
+                    description="Label name to get values for (e.g., 'pod', 'namespace', 'job', 'instance')",
+                    type="string",
+                    required=True,
+                ),
+                "match": ToolParameter(
+                    description=(
+                        "Optional PromQL selector to filter (e.g., '{__name__=~\"kube.*\"}', "
+                        "'{namespace=\"default\"}')."
+                    ),
+                    type="string",
+                    required=False,
+                ),
+                "start": ToolParameter(
+                    description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
+                    type="string",
+                    required=False,
+                ),
+                "end": ToolParameter(
+                    description="End timestamp (RFC3339 or Unix). Default: now",
+                    type="string",
+                    required=False,
+                ),
+            },
+            toolset=toolset,
+        )
+
+    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
+        if not self.toolset.config or not self.toolset.config.prometheus_url:
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error="Prometheus is not configured. Prometheus URL is missing",
+                params=params,
+            )
+        try:
+            label = params.get("label")
+            if not label:
                 return StructuredToolResult(
-                    status=
-                    error="
+                    status=StructuredToolResultStatus.ERROR,
+                    error="Label parameter is required",
                     params=params,
                 )

-
-            prometheus_url
-
-
-
-
-
+            url = urljoin(
+                self.toolset.config.prometheus_url, f"api/v1/label/{label}/values"
+            )
+            query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
+            if params.get("match"):
+                query_params["match[]"] = params["match"]
+
+            # Add time parameters - use provided values or defaults
+            if params.get("end"):
+                query_params["end"] = params["end"]
+            else:
+                query_params["end"] = str(int(time.time()))
+
+            if params.get("start"):
+                query_params["start"] = params["start"]
+            elif self.toolset.config.discover_metrics_from_last_hours:
+                # Use default time window
+                query_params["start"] = str(
+                    int(time.time())
+                    - (self.toolset.config.discover_metrics_from_last_hours * 3600)
+                )
+
+            response = do_request(
+                config=self.toolset.config,
+                url=url,
+                params=query_params,
+                timeout=self.toolset.config.metadata_timeout_seconds_default,
+                verify=self.toolset.config.verify_ssl,
                 headers=self.toolset.config.headers,
+                method="GET",
+            )
+            response.raise_for_status()
+            data = response.json()
+
+            # Check if results were truncated
+            if (
+                "data" in data
+                and isinstance(data["data"], list)
+                and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+            ):
+                data["_truncated"] = True
+                data["_message"] = (
+                    f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use match[] parameter to filter label '{label}' values."
+                )
+
|
+
return StructuredToolResult(
|
|
957
|
+
status=StructuredToolResultStatus.SUCCESS,
|
|
958
|
+
data=data,
|
|
959
|
+
params=params,
|
|
960
|
+
)
|
|
961
|
+
except Exception as e:
|
|
962
|
+
return StructuredToolResult(
|
|
963
|
+
status=StructuredToolResultStatus.ERROR,
|
|
964
|
+
error=str(e),
|
|
965
|
+
params=params,
|
|
966
|
+
)
|
|
967
|
+
|
|
968
|
+
def get_parameterized_one_liner(self, params) -> str:
|
|
969
|
+
label = params.get("label", "")
|
|
970
|
+
return f"{toolset_name_for_one_liner(self.toolset.name)}: Get {label} Values"
|
|
971
|
+
|
|
972
|
+
|
|
973
|
+
class GetAllLabels(BasePrometheusTool):
|
|
974
|
+
"""Get all label names that exist in Prometheus"""
|
|
975
|
+
|
|
976
|
+
def __init__(self, toolset: "PrometheusToolset"):
|
|
977
|
+
super().__init__(
|
|
978
|
+
name="get_all_labels",
|
|
979
|
+
description=(
|
|
980
|
+
"Get list of all label names using /api/v1/labels. "
|
|
981
|
+
"Use this to discover what labels are available across all metrics. "
|
|
982
|
+
f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} label names (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use match[] to filter. "
|
|
983
|
+
"Supports optional match[] parameter to filter. "
|
|
984
|
+
"By default returns labels from metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
|
|
985
|
+
),
|
|
986
|
+
parameters={
|
|
987
|
+
"match": ToolParameter(
|
|
988
|
+
description=(
|
|
989
|
+
"Optional PromQL selector to filter (e.g., '{__name__=~\"kube.*\"}', "
|
|
990
|
+
"'{job=\"prometheus\"}')."
|
|
991
|
+
),
|
|
992
|
+
type="string",
|
|
993
|
+
required=False,
|
|
994
|
+
),
|
|
995
|
+
"start": ToolParameter(
|
|
996
|
+
description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
|
|
997
|
+
type="string",
|
|
998
|
+
required=False,
|
|
999
|
+
),
|
|
1000
|
+
"end": ToolParameter(
|
|
1001
|
+
description="End timestamp (RFC3339 or Unix). Default: now",
|
|
1002
|
+
type="string",
|
|
1003
|
+
required=False,
|
|
1004
|
+
),
|
|
1005
|
+
},
|
|
1006
|
+
toolset=toolset,
|
|
1007
|
+
)
|
|
1008
|
+
|
|
1009
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
1010
|
+
if not self.toolset.config or not self.toolset.config.prometheus_url:
|
|
1011
|
+
return StructuredToolResult(
|
|
1012
|
+
status=StructuredToolResultStatus.ERROR,
|
|
1013
|
+
error="Prometheus is not configured. Prometheus URL is missing",
|
|
1014
|
+
params=params,
|
|
1015
|
+
)
|
|
1016
|
+
try:
|
|
1017
|
+
url = urljoin(self.toolset.config.prometheus_url, "api/v1/labels")
|
|
1018
|
+
query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
|
|
1019
|
+
if params.get("match"):
|
|
1020
|
+
query_params["match[]"] = params["match"]
|
|
1021
|
+
|
|
1022
|
+
# Add time parameters - use provided values or defaults
|
|
1023
|
+
if params.get("end"):
|
|
1024
|
+
query_params["end"] = params["end"]
|
|
1025
|
+
else:
|
|
1026
|
+
query_params["end"] = str(int(time.time()))
|
|
1027
|
+
|
|
1028
|
+
if params.get("start"):
|
|
1029
|
+
query_params["start"] = params["start"]
|
|
1030
|
+
elif self.toolset.config.discover_metrics_from_last_hours:
|
|
1031
|
+
# Use default time window
|
|
1032
|
+
query_params["start"] = str(
|
|
1033
|
+
int(time.time())
|
|
1034
|
+
- (self.toolset.config.discover_metrics_from_last_hours * 3600)
|
|
1035
|
+
)
|
|
1036
|
+
|
|
1037
|
+
response = do_request(
|
|
628
1038
|
config=self.toolset.config,
|
|
629
|
-
|
|
1039
|
+
url=url,
|
|
1040
|
+
params=query_params,
|
|
1041
|
+
timeout=self.toolset.config.metadata_timeout_seconds_default,
|
|
1042
|
+
verify=self.toolset.config.verify_ssl,
|
|
1043
|
+
headers=self.toolset.config.headers,
|
|
1044
|
+
method="GET",
|
|
630
1045
|
)
|
|
1046
|
+
response.raise_for_status()
|
|
1047
|
+
data = response.json()
|
|
1048
|
+
|
|
1049
|
+
# Check if results were truncated
|
|
1050
|
+
if (
|
|
1051
|
+
"data" in data
|
|
1052
|
+
and isinstance(data["data"], list)
|
|
1053
|
+
and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
|
|
1054
|
+
):
|
|
1055
|
+
data["_truncated"] = True
|
|
1056
|
+
data["_message"] = (
|
|
1057
|
+
f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use match[] parameter to filter labels."
|
|
1058
|
+
)
|
|
1059
|
+
|
|
1060
|
+
return StructuredToolResult(
|
|
1061
|
+
status=StructuredToolResultStatus.SUCCESS,
|
|
1062
|
+
data=data,
|
|
1063
|
+
params=params,
|
|
1064
|
+
)
|
|
1065
|
+
except Exception as e:
|
|
1066
|
+
return StructuredToolResult(
|
|
1067
|
+
status=StructuredToolResultStatus.ERROR,
|
|
1068
|
+
error=str(e),
|
|
1069
|
+
params=params,
|
|
1070
|
+
)
|
|
1071
|
+
|
|
1072
|
+
def get_parameterized_one_liner(self, params) -> str:
|
|
1073
|
+
return f"{toolset_name_for_one_liner(self.toolset.name)}: Get All Labels"
|
|
631
1074
|
|
|
632
|
-
type_filter = params.get("type_filter")
|
|
633
|
-
if type_filter:
|
|
634
|
-
metrics = filter_metrics_by_type(metrics, type_filter)
|
|
635
1075
|
|
|
636
|
-
|
|
637
|
-
|
|
1076
|
+
class GetSeries(BasePrometheusTool):
|
|
1077
|
+
"""Get time series matching a selector"""
|
|
638
1078
|
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
1079
|
+
def __init__(self, toolset: "PrometheusToolset"):
|
|
1080
|
+
super().__init__(
|
|
1081
|
+
name="get_series",
|
|
1082
|
+
description=(
|
|
1083
|
+
"Get time series using /api/v1/series. "
|
|
1084
|
+
"Returns label sets for all time series matching the selector. "
|
|
1085
|
+
"SLOWER than other discovery methods - use only when you need full label sets. "
|
|
1086
|
+
f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} series (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more series exist - use more specific selector. "
|
|
1087
|
+
"Requires match[] parameter with PromQL selector. "
|
|
1088
|
+
"By default returns series active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
|
|
1089
|
+
),
|
|
1090
|
+
parameters={
|
|
1091
|
+
"match": ToolParameter(
|
|
1092
|
+
description=(
|
|
1093
|
+
"PromQL selector to match series (e.g., 'up', 'node_cpu_seconds_total', "
|
|
1094
|
+
"'{__name__=~\"node.*\"}', '{job=\"prometheus\"}', "
|
|
1095
|
+
'\'{__name__="up",job="prometheus"}\').'
|
|
1096
|
+
),
|
|
1097
|
+
type="string",
|
|
1098
|
+
required=True,
|
|
1099
|
+
),
|
|
1100
|
+
"start": ToolParameter(
|
|
1101
|
+
description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
|
|
1102
|
+
type="string",
|
|
1103
|
+
required=False,
|
|
1104
|
+
),
|
|
1105
|
+
"end": ToolParameter(
|
|
1106
|
+
description="End timestamp (RFC3339 or Unix). Default: now",
|
|
1107
|
+
type="string",
|
|
1108
|
+
required=False,
|
|
1109
|
+
),
|
|
1110
|
+
},
|
|
1111
|
+
toolset=toolset,
|
|
1112
|
+
)
|
|
1113
|
+
|
|
1114
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
1115
|
+
if not self.toolset.config or not self.toolset.config.prometheus_url:
|
|
1116
|
+
return StructuredToolResult(
|
|
1117
|
+
status=StructuredToolResultStatus.ERROR,
|
|
1118
|
+
error="Prometheus is not configured. Prometheus URL is missing",
|
|
1119
|
+
params=params,
|
|
1120
|
+
)
|
|
1121
|
+
try:
|
|
1122
|
+
match = params.get("match")
|
|
1123
|
+
if not match:
|
|
1124
|
+
return StructuredToolResult(
|
|
1125
|
+
status=StructuredToolResultStatus.ERROR,
|
|
1126
|
+
error="Match parameter is required",
|
|
1127
|
+
params=params,
|
|
642
1128
|
)
|
|
643
|
-
|
|
644
|
-
|
|
1129
|
+
|
|
1130
|
+
url = urljoin(self.toolset.config.prometheus_url, "api/v1/series")
|
|
1131
|
+
query_params = {
|
|
1132
|
+
"match[]": match,
|
|
1133
|
+
"limit": str(PROMETHEUS_METADATA_API_LIMIT),
|
|
1134
|
+
}
|
|
1135
|
+
|
|
1136
|
+
# Add time parameters - use provided values or defaults
|
|
1137
|
+
if params.get("end"):
|
|
1138
|
+
query_params["end"] = params["end"]
|
|
1139
|
+
else:
|
|
1140
|
+
query_params["end"] = str(int(time.time()))
|
|
1141
|
+
|
|
1142
|
+
if params.get("start"):
|
|
1143
|
+
query_params["start"] = params["start"]
|
|
1144
|
+
elif self.toolset.config.discover_metrics_from_last_hours:
|
|
1145
|
+
# Use default time window
|
|
1146
|
+
query_params["start"] = str(
|
|
1147
|
+
int(time.time())
|
|
1148
|
+
- (self.toolset.config.discover_metrics_from_last_hours * 3600)
|
|
645
1149
|
)
|
|
646
1150
|
|
|
647
|
-
|
|
1151
|
+
response = do_request(
|
|
1152
|
+
config=self.toolset.config,
|
|
1153
|
+
url=url,
|
|
1154
|
+
params=query_params,
|
|
1155
|
+
timeout=self.toolset.config.metadata_timeout_seconds_default,
|
|
1156
|
+
verify=self.toolset.config.verify_ssl,
|
|
1157
|
+
headers=self.toolset.config.headers,
|
|
1158
|
+
method="GET",
|
|
1159
|
+
)
|
|
1160
|
+
response.raise_for_status()
|
|
1161
|
+
data = response.json()
|
|
1162
|
+
|
|
1163
|
+
# Check if results were truncated
|
|
1164
|
+
if (
|
|
1165
|
+
"data" in data
|
|
1166
|
+
and isinstance(data["data"], list)
|
|
1167
|
+
and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
|
|
1168
|
+
):
|
|
1169
|
+
data["_truncated"] = True
|
|
1170
|
+
data["_message"] = (
|
|
1171
|
+
f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use a more specific match selector to see additional series."
|
|
1172
|
+
)
|
|
1173
|
+
|
|
1174
|
+
return StructuredToolResult(
|
|
1175
|
+
status=StructuredToolResultStatus.SUCCESS,
|
|
1176
|
+
data=data,
|
|
1177
|
+
params=params,
|
|
1178
|
+
)
|
|
1179
|
+
except Exception as e:
|
|
648
1180
|
return StructuredToolResult(
|
|
649
|
-
status=
|
|
650
|
-
|
|
1181
|
+
status=StructuredToolResultStatus.ERROR,
|
|
1182
|
+
error=str(e),
|
|
651
1183
|
params=params,
|
|
652
1184
|
)
|
|
653
1185
|
|
|
654
|
-
|
|
655
|
-
|
|
1186
|
+
def get_parameterized_one_liner(self, params) -> str:
|
|
1187
|
+
return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Series"
|
|
1188
|
+
|
|
1189
|
+
|
|
1190
|
+
class GetMetricMetadata(BasePrometheusTool):
|
|
1191
|
+
"""Get metadata (type, description, unit) for metrics"""
|
|
1192
|
+
|
|
1193
|
+
def __init__(self, toolset: "PrometheusToolset"):
|
|
1194
|
+
super().__init__(
|
|
1195
|
+
name="get_metric_metadata",
|
|
1196
|
+
description=(
|
|
1197
|
+
"Get metric metadata using /api/v1/metadata. "
|
|
1198
|
+
"Returns type, help text, and unit for metrics. "
|
|
1199
|
+
"Use after discovering metric names to get their descriptions. "
|
|
1200
|
+
f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} metrics (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - filter by specific metric name. "
|
|
1201
|
+
"Supports optional metric name filter."
|
|
1202
|
+
),
|
|
1203
|
+
parameters={
|
|
1204
|
+
"metric": ToolParameter(
|
|
1205
|
+
description=(
|
|
1206
|
+
"Optional metric name to filter (e.g., 'up', 'node_cpu_seconds_total'). "
|
|
1207
|
+
"If not provided, returns metadata for all metrics."
|
|
1208
|
+
),
|
|
1209
|
+
type="string",
|
|
1210
|
+
required=False,
|
|
1211
|
+
),
|
|
1212
|
+
},
|
|
1213
|
+
toolset=toolset,
|
|
1214
|
+
)
|
|
1215
|
+
|
|
1216
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
1217
|
+
if not self.toolset.config or not self.toolset.config.prometheus_url:
|
|
656
1218
|
return StructuredToolResult(
|
|
657
|
-
status=
|
|
658
|
-
error="
|
|
1219
|
+
status=StructuredToolResultStatus.ERROR,
|
|
1220
|
+
error="Prometheus is not configured. Prometheus URL is missing",
|
|
659
1221
|
params=params,
|
|
660
1222
|
)
|
|
661
|
-
|
|
662
|
-
|
|
1223
|
+
try:
|
|
1224
|
+
url = urljoin(self.toolset.config.prometheus_url, "api/v1/metadata")
|
|
1225
|
+
query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
|
|
1226
|
+
|
|
1227
|
+
if params.get("metric"):
|
|
1228
|
+
query_params["metric"] = params["metric"]
|
|
1229
|
+
|
|
1230
|
+
response = do_request(
|
|
1231
|
+
config=self.toolset.config,
|
|
1232
|
+
url=url,
|
|
1233
|
+
params=query_params,
|
|
1234
|
+
timeout=self.toolset.config.metadata_timeout_seconds_default,
|
|
1235
|
+
verify=self.toolset.config.verify_ssl,
|
|
1236
|
+
headers=self.toolset.config.headers,
|
|
1237
|
+
method="GET",
|
|
1238
|
+
)
|
|
1239
|
+
response.raise_for_status()
|
|
1240
|
+
data = response.json()
|
|
1241
|
+
|
|
1242
|
+
# Check if results were truncated (metadata endpoint returns a dict, not a list)
|
|
1243
|
+
if (
|
|
1244
|
+
"data" in data
|
|
1245
|
+
and isinstance(data["data"], dict)
|
|
1246
|
+
and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
|
|
1247
|
+
):
|
|
1248
|
+
data["_truncated"] = True
|
|
1249
|
+
data["_message"] = (
|
|
1250
|
+
f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use metric parameter to filter by specific metric name."
|
|
1251
|
+
)
|
|
1252
|
+
|
|
663
1253
|
return StructuredToolResult(
|
|
664
|
-
status=
|
|
665
|
-
|
|
1254
|
+
status=StructuredToolResultStatus.SUCCESS,
|
|
1255
|
+
data=data,
|
|
666
1256
|
params=params,
|
|
667
1257
|
)
|
|
668
1258
|
except Exception as e:
|
|
669
|
-
logging.warn("Failed to process prometheus metrics", exc_info=True)
|
|
670
1259
|
return StructuredToolResult(
|
|
671
|
-
status=
|
|
672
|
-
error=
|
|
1260
|
+
status=StructuredToolResultStatus.ERROR,
|
|
1261
|
+
error=str(e),
|
|
673
1262
|
params=params,
|
|
674
1263
|
)
|
|
675
1264
|
|
|
676
1265
|
def get_parameterized_one_liner(self, params) -> str:
|
|
677
|
-
|
|
678
|
-
return
|
|
1266
|
+
metric = params.get("metric", "all")
|
|
1267
|
+
return (
|
|
1268
|
+
f"{toolset_name_for_one_liner(self.toolset.name)}: Get Metadata ({metric})"
|
|
1269
|
+
)
|
|
679
1270
|
|
|
680
1271
|
|
|
681
1272
|
class ExecuteInstantQuery(BasePrometheusTool):
|
|
682
1273
|
def __init__(self, toolset: "PrometheusToolset"):
|
|
683
1274
|
super().__init__(
|
|
684
1275
|
name="execute_prometheus_instant_query",
|
|
685
|
-
description=
|
|
1276
|
+
description=(
|
|
1277
|
+
f"Execute an instant PromQL query (single point in time). "
|
|
1278
|
+
f"Default timeout is {DEFAULT_QUERY_TIMEOUT_SECONDS} seconds "
|
|
1279
|
+
f"but can be increased up to {MAX_QUERY_TIMEOUT_SECONDS} seconds for complex/slow queries."
|
|
1280
|
+
),
|
|
686
1281
|
parameters={
|
|
687
1282
|
"query": ToolParameter(
|
|
688
1283
|
description="The PromQL query",
|
|
@@ -694,16 +1289,23 @@ class ExecuteInstantQuery(BasePrometheusTool):
|
|
|
694
1289
|
type="string",
|
|
695
1290
|
required=True,
|
|
696
1291
|
),
|
|
1292
|
+
"timeout": ToolParameter(
|
|
1293
|
+
description=(
|
|
1294
|
+
f"Query timeout in seconds. Default: {DEFAULT_QUERY_TIMEOUT_SECONDS}. "
|
|
1295
|
+
f"Maximum: {MAX_QUERY_TIMEOUT_SECONDS}. "
|
|
1296
|
+
f"Increase for complex queries that may take longer."
|
|
1297
|
+
),
|
|
1298
|
+
type="number",
|
|
1299
|
+
required=False,
|
|
1300
|
+
),
|
|
697
1301
|
},
|
|
698
1302
|
toolset=toolset,
|
|
699
1303
|
)
|
|
700
1304
|
|
|
701
|
-
def _invoke(
|
|
702
|
-
self, params: dict, user_approved: bool = False
|
|
703
|
-
) -> StructuredToolResult:
|
|
1305
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
704
1306
|
if not self.toolset.config or not self.toolset.config.prometheus_url:
|
|
705
1307
|
return StructuredToolResult(
|
|
706
|
-
status=
|
|
1308
|
+
status=StructuredToolResultStatus.ERROR,
|
|
707
1309
|
error="Prometheus is not configured. Prometheus URL is missing",
|
|
708
1310
|
params=params,
|
|
709
1311
|
)
|
|
@@ -715,13 +1317,25 @@ class ExecuteInstantQuery(BasePrometheusTool):
|
|
|
715
1317
|
|
|
716
1318
|
payload = {"query": query}
|
|
717
1319
|
|
|
1320
|
+
# Get timeout parameter and enforce limits
|
|
1321
|
+
default_timeout = self.toolset.config.query_timeout_seconds_default
|
|
1322
|
+
max_timeout = self.toolset.config.query_timeout_seconds_hard_max
|
|
1323
|
+
timeout = params.get("timeout", default_timeout)
|
|
1324
|
+
if timeout > max_timeout:
|
|
1325
|
+
timeout = max_timeout
|
|
1326
|
+
logging.warning(
|
|
1327
|
+
f"Timeout requested ({params.get('timeout')}) exceeds maximum ({max_timeout}s), using {max_timeout}s"
|
|
1328
|
+
)
|
|
1329
|
+
elif timeout < 1:
|
|
1330
|
+
timeout = default_timeout # Min 1 second, but use default if invalid
|
|
1331
|
+
|
|
718
1332
|
response = do_request(
|
|
719
1333
|
config=self.toolset.config,
|
|
720
1334
|
url=url,
|
|
721
1335
|
headers=self.toolset.config.headers,
|
|
722
1336
|
data=payload,
|
|
723
|
-
timeout=
|
|
724
|
-
verify=self.toolset.config.
|
|
1337
|
+
timeout=timeout,
|
|
1338
|
+
verify=self.toolset.config.verify_ssl,
|
|
725
1339
|
method="POST",
|
|
726
1340
|
)
|
|
727
1341
|
|
|
@@ -734,24 +1348,68 @@ class ExecuteInstantQuery(BasePrometheusTool):
|
|
|
734
1348
|
error_message = (
|
|
735
1349
|
"The prometheus query returned no result. Is the query correct?"
|
|
736
1350
|
)
|
|
737
|
-
response_data =
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
1351
|
+
response_data = MetricsBasedResponse(
|
|
1352
|
+
status=status,
|
|
1353
|
+
error_message=error_message,
|
|
1354
|
+
tool_name=self.name,
|
|
1355
|
+
description=description,
|
|
1356
|
+
query=query,
|
|
1357
|
+
)
|
|
1358
|
+
structured_tool_result: StructuredToolResult
|
|
1359
|
+
# Check if data should be included based on size
|
|
746
1360
|
if self.toolset.config.tool_calls_return_data:
|
|
747
|
-
|
|
1361
|
+
result_data = data.get("data", {})
|
|
1362
|
+
response_data.data = result_data
|
|
748
1363
|
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
1364
|
+
structured_tool_result = create_structured_tool_result(
|
|
1365
|
+
params=params, response=response_data
|
|
1366
|
+
)
|
|
1367
|
+
tool_call_id = context.tool_call_id
|
|
1368
|
+
tool_name = context.tool_name
|
|
1369
|
+
token_count = count_tool_response_tokens(
|
|
1370
|
+
llm=context.llm,
|
|
1371
|
+
structured_tool_result=structured_tool_result,
|
|
1372
|
+
tool_call_id=tool_call_id,
|
|
1373
|
+
tool_name=tool_name,
|
|
1374
|
+
)
|
|
1375
|
+
|
|
1376
|
+
token_limit = context.max_token_count
|
|
1377
|
+
if self.toolset.config.query_response_size_limit_pct:
|
|
1378
|
+
custom_token_limit = get_pct_token_count(
|
|
1379
|
+
percent_of_total_context_window=self.toolset.config.query_response_size_limit_pct,
|
|
1380
|
+
llm=context.llm,
|
|
1381
|
+
)
|
|
1382
|
+
if custom_token_limit < token_limit:
|
|
1383
|
+
token_limit = custom_token_limit
|
|
1384
|
+
|
|
1385
|
+
# Provide summary if data is too large
|
|
1386
|
+
if token_count > token_limit:
|
|
1387
|
+
response_data.data = None
|
|
1388
|
+
response_data.data_summary = (
|
|
1389
|
+
create_data_summary_for_large_result(
|
|
1390
|
+
result_data,
|
|
1391
|
+
query,
|
|
1392
|
+
token_count,
|
|
1393
|
+
is_range_query=False,
|
|
1394
|
+
)
|
|
1395
|
+
)
|
|
1396
|
+
logging.info(
|
|
1397
|
+
f"Prometheus instant query returned large dataset: "
|
|
1398
|
+
f"{response_data.data_summary.get('result_count', 0)} results, "
|
|
1399
|
+
f"{token_count:,} tokens (limit: {token_limit:,}). "
|
|
1400
|
+
f"Returning summary instead of full data."
|
|
1401
|
+
)
|
|
1402
|
+
# Also add token info to the summary for debugging
|
|
1403
|
+
response_data.data_summary["_debug_info"] = (
|
|
1404
|
+
f"Data size: {token_count:,} tokens exceeded limit of {token_limit:,} tokens"
|
|
1405
|
+
)
|
|
1406
|
+
else:
|
|
1407
|
+
response_data.data = result_data
|
|
1408
|
+
|
|
1409
|
+
structured_tool_result = create_structured_tool_result(
|
|
1410
|
+
params=params, response=response_data
|
|
754
1411
|
)
|
|
1412
|
+
return structured_tool_result
|
|
755
1413
|
|
|
756
1414
|
# Handle known Prometheus error status codes
|
|
757
1415
|
error_msg = "Unknown error occurred"
|
|
@@ -764,29 +1422,36 @@ class ExecuteInstantQuery(BasePrometheusTool):
|
|
|
764
1422
|
except json.JSONDecodeError:
|
|
765
1423
|
pass
|
|
766
1424
|
return StructuredToolResult(
|
|
767
|
-
status=
|
|
1425
|
+
status=StructuredToolResultStatus.ERROR,
|
|
768
1426
|
error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
|
|
769
1427
|
params=params,
|
|
770
1428
|
)
|
|
771
1429
|
|
|
772
1430
|
# For other status codes, just return the status code and content
|
|
773
1431
|
return StructuredToolResult(
|
|
774
|
-
status=
|
|
1432
|
+
status=StructuredToolResultStatus.ERROR,
|
|
775
1433
|
error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
|
|
776
1434
|
params=params,
|
|
777
1435
|
)
|
|
778
1436
|
|
|
1437
|
+
except SSLError as e:
|
|
1438
|
+
logging.warning("SSL error while executing Prometheus query", exc_info=True)
|
|
1439
|
+
return StructuredToolResult(
|
|
1440
|
+
status=StructuredToolResultStatus.ERROR,
|
|
1441
|
+
error=format_ssl_error_message(self.toolset.config.prometheus_url, e),
|
|
1442
|
+
params=params,
|
|
1443
|
+
)
|
|
779
1444
|
except RequestException as e:
|
|
780
1445
|
logging.info("Failed to connect to Prometheus", exc_info=True)
|
|
781
1446
|
return StructuredToolResult(
|
|
782
|
-
status=
|
|
1447
|
+
status=StructuredToolResultStatus.ERROR,
|
|
783
1448
|
error=f"Connection error to Prometheus: {str(e)}",
|
|
784
1449
|
params=params,
|
|
785
1450
|
)
|
|
786
1451
|
except Exception as e:
|
|
787
1452
|
logging.info("Failed to connect to Prometheus", exc_info=True)
|
|
788
1453
|
return StructuredToolResult(
|
|
789
|
-
status=
|
|
1454
|
+
status=StructuredToolResultStatus.ERROR,
|
|
790
1455
|
error=f"Unexpected error executing query: {str(e)}",
|
|
791
1456
|
params=params,
|
|
792
1457
|
)
|
|
@@ -800,7 +1465,12 @@ class ExecuteRangeQuery(BasePrometheusTool):
|
|
|
800
1465
|
def __init__(self, toolset: "PrometheusToolset"):
|
|
801
1466
|
super().__init__(
|
|
802
1467
|
name="execute_prometheus_range_query",
|
|
803
|
-
description=
|
|
1468
|
+
description=(
|
|
1469
|
+
f"Generates a graph and Execute a PromQL range query. "
|
|
1470
|
+
f"Default timeout is {DEFAULT_QUERY_TIMEOUT_SECONDS} seconds "
|
|
1471
|
+
f"but can be increased up to {MAX_QUERY_TIMEOUT_SECONDS} seconds for complex/slow queries. "
|
|
1472
|
+
f"Default time range is last 1 hour."
|
|
1473
|
+
),
|
|
804
1474
|
parameters={
|
|
805
1475
|
"query": ToolParameter(
|
|
806
1476
|
description="The PromQL query",
|
|
@@ -827,23 +1497,40 @@ class ExecuteRangeQuery(BasePrometheusTool):
|
|
|
827
1497
|
"step": ToolParameter(
|
|
828
1498
|
description="Query resolution step width in duration format or float number of seconds",
|
|
829
1499
|
type="number",
|
|
830
|
-
required=
|
|
1500
|
+
required=False,
|
|
831
1501
|
),
|
|
832
1502
|
"output_type": ToolParameter(
|
|
833
1503
|
description="Specifies how to interpret the Prometheus result. Use 'Plain' for raw values, 'Bytes' to format byte values, 'Percentage' to scale 0–1 values into 0–100%, or 'CPUUsage' to convert values to cores (e.g., 500 becomes 500m, 2000 becomes 2).",
|
|
834
1504
|
type="string",
|
|
835
1505
|
required=True,
|
|
836
1506
|
),
|
|
1507
|
+
"timeout": ToolParameter(
|
|
1508
|
+
description=(
|
|
1509
|
+
f"Query timeout in seconds. Default: {DEFAULT_QUERY_TIMEOUT_SECONDS}. "
|
|
1510
|
+
f"Maximum: {MAX_QUERY_TIMEOUT_SECONDS}. "
|
|
1511
|
+
f"Increase for complex queries that may take longer."
|
|
1512
|
+
),
|
|
1513
|
+
type="number",
|
|
1514
|
+
required=False,
|
|
1515
|
+
),
|
|
1516
|
+
"max_points": ToolParameter(
|
|
1517
|
+
description=(
|
|
1518
|
+
f"Maximum number of data points to return. Default: {int(MAX_GRAPH_POINTS)}. "
|
|
1519
|
+
f"Can be reduced to get fewer data points (e.g., 50 for simpler graphs). "
|
|
1520
|
+
f"Cannot exceed system limit of {int(MAX_GRAPH_POINTS)}. "
|
|
1521
|
+
f"If your query would return more points than this limit, the step will be automatically adjusted."
|
|
1522
|
+
),
|
|
1523
|
+
type="number",
|
|
1524
|
+
required=False,
|
|
1525
|
+
),
|
|
837
1526
|
},
|
|
838
1527
|
toolset=toolset,
|
|
839
1528
|
)
|
|
840
1529
|
|
|
841
|
-
def _invoke(
|
|
842
|
-
self, params: dict, user_approved: bool = False
|
|
843
|
-
) -> StructuredToolResult:
|
|
1530
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
844
1531
|
if not self.toolset.config or not self.toolset.config.prometheus_url:
|
|
845
1532
|
return StructuredToolResult(
|
|
846
|
-
status=
|
|
1533
|
+
status=StructuredToolResultStatus.ERROR,
|
|
847
1534
|
error="Prometheus is not configured. Prometheus URL is missing",
|
|
848
1535
|
params=params,
|
|
849
1536
|
)
|
|
@@ -857,12 +1544,17 @@ class ExecuteRangeQuery(BasePrometheusTool):
|
|
|
857
1544
|
end_timestamp=params.get("end"),
|
|
858
1545
|
default_time_span_seconds=DEFAULT_GRAPH_TIME_SPAN_SECONDS,
|
|
859
1546
|
)
|
|
860
|
-
step = params.get("step"
|
|
1547
|
+
step = parse_duration_to_seconds(params.get("step"))
|
|
1548
|
+
max_points = params.get(
|
|
1549
|
+
"max_points"
|
|
1550
|
+
) # Get the optional max_points parameter
|
|
861
1551
|
|
|
1552
|
+
# adjust_step_for_max_points handles None case and converts to float
|
|
862
1553
|
step = adjust_step_for_max_points(
|
|
863
1554
|
start_timestamp=start,
|
|
864
1555
|
end_timestamp=end,
|
|
865
|
-
step=
|
|
1556
|
+
step=step,
|
|
1557
|
+
max_points_override=max_points,
|
|
866
1558
|
)
|
|
867
1559
|
|
|
868
1560
|
description = params.get("description", "")
|
|
@@ -874,13 +1566,25 @@ class ExecuteRangeQuery(BasePrometheusTool):
|
|
|
874
1566
|
"step": step,
|
|
875
1567
|
}
|
|
876
1568
|
|
|
1569
|
+
# Get timeout parameter and enforce limits
|
|
1570
|
+
default_timeout = self.toolset.config.query_timeout_seconds_default
|
|
1571
|
+
max_timeout = self.toolset.config.query_timeout_seconds_hard_max
|
|
1572
|
+
timeout = params.get("timeout", default_timeout)
|
|
1573
|
+
if timeout > max_timeout:
|
|
1574
|
+
timeout = max_timeout
|
|
1575
|
+
logging.warning(
|
|
1576
|
+
f"Timeout requested ({params.get('timeout')}) exceeds maximum ({max_timeout}s), using {max_timeout}s"
|
|
1577
|
+
)
|
|
1578
|
+
elif timeout < 1:
|
|
1579
|
+
timeout = default_timeout # Min 1 second, but use default if invalid
|
|
1580
|
+
|
|
877
1581
|
response = do_request(
|
|
878
1582
|
config=self.toolset.config,
|
|
879
1583
|
url=url,
|
|
880
1584
|
headers=self.toolset.config.headers,
|
|
881
1585
|
data=payload,
|
|
882
|
-
timeout=
|
|
883
|
-
verify=self.toolset.config.
|
|
1586
|
+
timeout=timeout,
|
|
1587
|
+
verify=self.toolset.config.verify_ssl,
|
|
884
1588
|
method="POST",
|
|
885
1589
|
)
|
|
886
1590
|
|
|
@@ -893,29 +1597,73 @@ class ExecuteRangeQuery(BasePrometheusTool):
|
|
|
893
1597
|
error_message = (
|
|
894
1598
|
"The prometheus query returned no result. Is the query correct?"
|
|
895
1599
|
)
|
|
896
|
-
response_data =
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
1600
|
+
response_data = MetricsBasedResponse(
|
|
1601
|
+
status=status,
|
|
1602
|
+
error_message=error_message,
|
|
1603
|
+
tool_name=self.name,
|
|
1604
|
+
description=description,
|
|
1605
|
+
query=query,
|
|
1606
|
+
start=start,
|
|
1607
|
+
end=end,
|
|
1608
|
+
step=step,
|
|
1609
|
+
output_type=output_type,
|
|
1610
|
+
)
|
|
1611
|
+
|
|
1612
|
+
structured_tool_result: StructuredToolResult
|
|
908
1613
|
|
|
1614
|
+
# Check if data should be included based on size
|
|
909
1615
|
if self.toolset.config.tool_calls_return_data:
|
|
910
|
-
|
|
911
|
-
|
|
1616
|
+
result_data = data.get("data", {})
|
|
1617
|
+
response_data.data = result_data
|
|
1618
|
+
structured_tool_result = create_structured_tool_result(
|
|
1619
|
+
params=params, response=response_data
|
|
1620
|
+
)
|
|
912
1621
|
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
1622
|
+
tool_call_id = context.tool_call_id
|
|
1623
|
+
tool_name = context.tool_name
|
|
1624
|
+
token_count = count_tool_response_tokens(
|
|
1625
|
+
llm=context.llm,
|
|
1626
|
+
structured_tool_result=structured_tool_result,
|
|
1627
|
+
tool_call_id=tool_call_id,
|
|
1628
|
+
tool_name=tool_name,
|
|
1629
|
+
)
|
|
1630
|
+
|
|
1631
|
+
token_limit = context.max_token_count
|
|
1632
|
+
if self.toolset.config.query_response_size_limit_pct:
|
|
1633
|
+
custom_token_limit = get_pct_token_count(
|
|
1634
|
+
percent_of_total_context_window=self.toolset.config.query_response_size_limit_pct,
|
|
1635
|
+
llm=context.llm,
|
|
1636
|
+
)
|
|
1637
|
+
if custom_token_limit < token_limit:
|
|
1638
|
+
token_limit = custom_token_limit
|
|
1639
|
+
|
|
1640
|
+
# Provide summary if data is too large
|
|
1641
|
+
if token_count > token_limit:
|
|
1642
|
+
response_data.data = None
|
|
1643
|
+
response_data.data_summary = (
|
|
1644
|
+
create_data_summary_for_large_result(
|
|
1645
|
+
result_data, query, token_count, is_range_query=True
|
|
1646
|
+
)
|
|
1647
|
+
)
|
|
1648
|
+
logging.info(
|
|
1649
|
+
f"Prometheus range query returned large dataset: "
|
|
1650
|
+
f"{response_data.data_summary.get('series_count', 0)} series, "
|
|
1651
|
+
f"{token_count:,} tokens (limit: {token_limit:,}). "
|
|
1652
|
+
f"Returning summary instead of full data."
|
|
1653
|
+
)
|
|
1654
|
+
# Also add character info to the summary for debugging
|
|
1655
|
+
response_data.data_summary["_debug_info"] = (
|
|
1656
|
+
f"Data size: {token_count:,} tokens exceeded limit of {token_limit:,} tokens"
|
|
1657
|
+
)
|
|
1658
|
+
else:
|
|
1659
|
+
response_data.data = result_data
|
|
1660
|
+
|
|
1661
|
+
structured_tool_result = create_structured_tool_result(
|
|
1662
|
+
params=params, response=response_data
|
|
917
1663
|
)
|
|
918
1664
|
|
|
1665
|
+
return structured_tool_result
|
|
1666
|
+
|
|
919
1667
|
error_msg = "Unknown error occurred"
|
|
920
1668
|
if response.status_code in [400, 429]:
|
|
921
1669
|
try:
|
|
@@ -926,28 +1674,37 @@ class ExecuteRangeQuery(BasePrometheusTool):
|
|
|
926
1674
|
except json.JSONDecodeError:
|
|
927
1675
|
pass
|
|
928
1676
|
return StructuredToolResult(
|
|
929
|
-
status=
|
|
1677
|
+
status=StructuredToolResultStatus.ERROR,
|
|
930
1678
|
error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
|
|
931
1679
|
params=params,
|
|
932
1680
|
)
|
|
933
1681
|
|
|
934
1682
|
return StructuredToolResult(
|
|
935
|
-
status=
|
|
1683
|
+
status=StructuredToolResultStatus.ERROR,
|
|
936
1684
|
error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
|
|
937
1685
|
params=params,
|
|
938
1686
|
)
|
|
939
1687
|
|
|
1688
|
+
except SSLError as e:
|
|
1689
|
+
logging.warning(
|
|
1690
|
+
"SSL error while executing Prometheus range query", exc_info=True
|
|
1691
|
+
)
|
|
1692
|
+
return StructuredToolResult(
|
|
1693
|
+
status=StructuredToolResultStatus.ERROR,
|
|
1694
|
+
error=format_ssl_error_message(self.toolset.config.prometheus_url, e),
|
|
1695
|
+
params=params,
|
|
1696
|
+
)
|
|
940
1697
|
except RequestException as e:
|
|
941
1698
|
logging.info("Failed to connect to Prometheus", exc_info=True)
|
|
942
1699
|
return StructuredToolResult(
|
|
943
|
-
status=
|
|
1700
|
+
status=StructuredToolResultStatus.ERROR,
|
|
944
1701
|
error=f"Connection error to Prometheus: {str(e)}",
|
|
945
1702
|
params=params,
|
|
946
1703
|
)
|
|
947
1704
|
except Exception as e:
|
|
948
1705
|
logging.info("Failed to connect to Prometheus", exc_info=True)
|
|
949
1706
|
return StructuredToolResult(
|
|
950
|
-
status=
|
|
1707
|
+
status=StructuredToolResultStatus.ERROR,
|
|
951
1708
|
error=f"Unexpected error executing query: {str(e)}",
|
|
952
1709
|
params=params,
|
|
953
1710
|
)
|
|
@@ -958,7 +1715,7 @@ class ExecuteRangeQuery(BasePrometheusTool):
|
|
|
958
1715
|
|
|
959
1716
|
|
|
960
1717
|
class PrometheusToolset(Toolset):
|
|
961
|
-
config: Optional[Union[PrometheusConfig, AMPConfig]] = None
|
|
1718
|
+
config: Optional[Union[PrometheusConfig, AMPConfig, AzurePrometheusConfig]] = None
|
|
962
1719
|
|
|
963
1720
|
def __init__(self):
|
|
964
1721
|
super().__init__(
|
|
@@ -969,7 +1726,11 @@ class PrometheusToolset(Toolset):
|
|
|
969
1726
|
prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
|
|
970
1727
|
tools=[
|
|
971
1728
|
ListPrometheusRules(toolset=self),
|
|
972
|
-
|
|
1729
|
+
GetMetricNames(toolset=self),
|
|
1730
|
+
GetLabelValues(toolset=self),
|
|
1731
|
+
GetAllLabels(toolset=self),
|
|
1732
|
+
GetSeries(toolset=self),
|
|
1733
|
+
GetMetricMetadata(toolset=self),
|
|
973
1734
|
ExecuteInstantQuery(toolset=self),
|
|
974
1735
|
ExecuteRangeQuery(toolset=self),
|
|
975
1736
|
],
|
|
@@ -987,16 +1748,36 @@ class PrometheusToolset(Toolset):
|
|
|
987
1748
|
|
|
988
1749
|
def determine_prometheus_class(
|
|
989
1750
|
self, config: dict[str, Any]
|
|
990
|
-
) -> Type[Union[PrometheusConfig, AMPConfig]]:
|
|
1751
|
+
) -> Type[Union[PrometheusConfig, AMPConfig, AzurePrometheusConfig]]:
|
|
991
1752
|
has_aws_fields = "aws_region" in config
|
|
992
|
-
|
|
1753
|
+
if has_aws_fields:
|
|
1754
|
+
return AMPConfig
|
|
1755
|
+
|
|
1756
|
+
# Check for Azure config using static method
|
|
1757
|
+
is_azure = AzurePrometheusConfig.is_azure_config(config)
|
|
1758
|
+
if is_azure:
|
|
1759
|
+
logging.info("Detected Azure Managed Prometheus configuration")
|
|
1760
|
+
return AzurePrometheusConfig if is_azure else PrometheusConfig
|
|
1761
|
+
|
|
1762
|
+
def _disable_azure_incompatible_tools(self):
|
|
1763
|
+
"""
|
|
1764
|
+
Azure Managed Prometheus does not support some APIs.
|
|
1765
|
+
Remove unsupported tools.
|
|
1766
|
+
"""
|
|
1767
|
+
incompatible = {
|
|
1768
|
+
"get_label_values",
|
|
1769
|
+
"get_metric_metadata",
|
|
1770
|
+
"list_prometheus_rules",
|
|
1771
|
+
}
|
|
1772
|
+
self.tools = [t for t in self.tools if t.name not in incompatible]
|
|
993
1773
|
|
|
994
1774
|
def prerequisites_callable(self, config: dict[str, Any]) -> Tuple[bool, str]:
|
|
995
1775
|
try:
|
|
996
1776
|
if config:
|
|
997
1777
|
config_cls = self.determine_prometheus_class(config)
|
|
998
1778
|
self.config = config_cls(**config) # type: ignore
|
|
999
|
-
|
|
1779
|
+
if isinstance(self.config, AzurePrometheusConfig):
|
|
1780
|
+
self._disable_azure_incompatible_tools()
|
|
1000
1781
|
self._reload_llm_instructions()
|
|
1001
1782
|
return self._is_healthy()
|
|
1002
1783
|
except Exception:
|
|
@@ -1041,14 +1822,14 @@ class PrometheusToolset(Toolset):
|
|
|
1041
1822
|
f"Toolset {self.name} failed to initialize because prometheus is not configured correctly",
|
|
1042
1823
|
)
|
|
1043
1824
|
|
|
1044
|
-
url = urljoin(self.config.prometheus_url,
|
|
1825
|
+
url = urljoin(self.config.prometheus_url, "api/v1/query?query=up")
|
|
1045
1826
|
try:
|
|
1046
1827
|
response = do_request(
|
|
1047
1828
|
config=self.config,
|
|
1048
1829
|
url=url,
|
|
1049
1830
|
headers=self.config.headers,
|
|
1050
1831
|
timeout=10,
|
|
1051
|
-
verify=self.config.
|
|
1832
|
+
verify=self.config.verify_ssl,
|
|
1052
1833
|
method="GET",
|
|
1053
1834
|
)
|
|
1054
1835
|
|
|
@@ -1060,13 +1841,8 @@ class PrometheusToolset(Toolset):
|
|
|
1060
1841
|
f"Failed to connect to Prometheus at {url}: HTTP {response.status_code}",
|
|
1061
1842
|
)
|
|
1062
1843
|
|
|
1063
|
-
except RequestException:
|
|
1064
|
-
return (
|
|
1065
|
-
False,
|
|
1066
|
-
f"Failed to initialize using url={url}",
|
|
1067
|
-
)
|
|
1068
1844
|
except Exception as e:
|
|
1069
|
-
logging.
|
|
1845
|
+
logging.debug("Failed to initialize Prometheus", exc_info=True)
|
|
1070
1846
|
return (
|
|
1071
1847
|
False,
|
|
1072
1848
|
f"Failed to initialize using url={url}. Unexpected error: {str(e)}",
|
|
@@ -1074,6 +1850,11 @@ class PrometheusToolset(Toolset):
|
|
|
1074
1850
|
|
|
1075
1851
|
def get_example_config(self):
|
|
1076
1852
|
example_config = PrometheusConfig(
|
|
1077
|
-
prometheus_url="http://
|
|
1853
|
+
prometheus_url="http://prometheus-server.monitoring.svc.cluster.local:9090",
|
|
1854
|
+
headers={"Authorization": "Basic <base64_encoded_credentials>"},
|
|
1855
|
+
discover_metrics_from_last_hours=1,
|
|
1856
|
+
query_timeout_seconds_default=20,
|
|
1857
|
+
query_timeout_seconds_hard_max=180,
|
|
1858
|
+
verify_ssl=True,
|
|
1078
1859
|
)
|
|
1079
1860
|
return example_config.model_dump()
|