holmesgpt 0.16.2a0__py3-none-any.whl → 0.18.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- holmes/__init__.py +3 -5
- holmes/clients/robusta_client.py +4 -3
- holmes/common/env_vars.py +18 -2
- holmes/common/openshift.py +1 -1
- holmes/config.py +11 -6
- holmes/core/conversations.py +30 -13
- holmes/core/investigation.py +21 -25
- holmes/core/investigation_structured_output.py +3 -3
- holmes/core/issue.py +1 -1
- holmes/core/llm.py +50 -31
- holmes/core/models.py +19 -17
- holmes/core/openai_formatting.py +1 -1
- holmes/core/prompt.py +47 -2
- holmes/core/runbooks.py +1 -0
- holmes/core/safeguards.py +4 -2
- holmes/core/supabase_dal.py +4 -2
- holmes/core/tool_calling_llm.py +102 -141
- holmes/core/tools.py +19 -28
- holmes/core/tools_utils/token_counting.py +9 -2
- holmes/core/tools_utils/tool_context_window_limiter.py +13 -30
- holmes/core/tools_utils/tool_executor.py +0 -18
- holmes/core/tools_utils/toolset_utils.py +1 -0
- holmes/core/toolset_manager.py +37 -2
- holmes/core/tracing.py +13 -2
- holmes/core/transformers/__init__.py +1 -1
- holmes/core/transformers/base.py +1 -0
- holmes/core/transformers/llm_summarize.py +3 -2
- holmes/core/transformers/registry.py +2 -1
- holmes/core/transformers/transformer.py +1 -0
- holmes/core/truncation/compaction.py +37 -2
- holmes/core/truncation/input_context_window_limiter.py +3 -2
- holmes/interactive.py +52 -8
- holmes/main.py +17 -37
- holmes/plugins/interfaces.py +2 -1
- holmes/plugins/prompts/__init__.py +2 -1
- holmes/plugins/prompts/_fetch_logs.jinja2 +5 -5
- holmes/plugins/prompts/_runbook_instructions.jinja2 +2 -1
- holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
- holmes/plugins/prompts/conversation_history_compaction.jinja2 +2 -1
- holmes/plugins/prompts/generic_ask.jinja2 +0 -2
- holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -2
- holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -2
- holmes/plugins/prompts/generic_investigation.jinja2 +0 -2
- holmes/plugins/prompts/investigation_procedure.jinja2 +2 -1
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -2
- holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -2
- holmes/plugins/runbooks/__init__.py +32 -3
- holmes/plugins/sources/github/__init__.py +4 -2
- holmes/plugins/sources/prometheus/models.py +1 -0
- holmes/plugins/toolsets/__init__.py +30 -26
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +13 -12
- holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
- holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
- holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
- holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
- holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -12
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +7 -7
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -7
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -5
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -7
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +6 -8
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +3 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +3 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +3 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +3 -3
- holmes/plugins/toolsets/azure_sql/utils.py +0 -32
- holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
- holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
- holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
- holmes/plugins/toolsets/bash/bash_toolset.py +2 -3
- holmes/plugins/toolsets/bash/common/bash.py +19 -9
- holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
- holmes/plugins/toolsets/bash/common/stringify.py +1 -1
- holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
- holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
- holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
- holmes/plugins/toolsets/bash/parse_command.py +12 -13
- holmes/plugins/toolsets/connectivity_check.py +124 -0
- holmes/plugins/toolsets/coralogix/api.py +132 -119
- holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
- holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
- holmes/plugins/toolsets/coralogix/utils.py +15 -79
- holmes/plugins/toolsets/datadog/datadog_api.py +36 -3
- holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +34 -1
- holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
- holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
- holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
- holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +71 -28
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +224 -375
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +67 -36
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +360 -343
- holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
- holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
- holmes/plugins/toolsets/git.py +7 -8
- holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
- holmes/plugins/toolsets/grafana/common.py +2 -30
- holmes/plugins/toolsets/grafana/grafana_tempo_api.py +2 -1
- holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +18 -2
- holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +92 -18
- holmes/plugins/toolsets/grafana/loki_api.py +4 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +109 -25
- holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +22 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +201 -33
- holmes/plugins/toolsets/grafana/trace_parser.py +3 -2
- holmes/plugins/toolsets/internet/internet.py +10 -10
- holmes/plugins/toolsets/internet/notion.py +5 -6
- holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
- holmes/plugins/toolsets/investigator/model.py +3 -1
- holmes/plugins/toolsets/json_filter_mixin.py +134 -0
- holmes/plugins/toolsets/kafka.py +12 -7
- holmes/plugins/toolsets/kubernetes.yaml +260 -30
- holmes/plugins/toolsets/kubernetes_logs.py +3 -3
- holmes/plugins/toolsets/logging_utils/logging_api.py +16 -6
- holmes/plugins/toolsets/mcp/toolset_mcp.py +88 -60
- holmes/plugins/toolsets/newrelic/new_relic_api.py +41 -1
- holmes/plugins/toolsets/newrelic/newrelic.jinja2 +24 -0
- holmes/plugins/toolsets/newrelic/newrelic.py +212 -55
- holmes/plugins/toolsets/prometheus/prometheus.py +358 -102
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +11 -3
- holmes/plugins/toolsets/rabbitmq/api.py +23 -4
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +5 -5
- holmes/plugins/toolsets/robusta/robusta.py +5 -5
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +25 -6
- holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +1 -1
- holmes/plugins/toolsets/utils.py +1 -1
- holmes/utils/config_utils.py +1 -1
- holmes/utils/connection_utils.py +31 -0
- holmes/utils/console/result.py +10 -0
- holmes/utils/file_utils.py +2 -1
- holmes/utils/global_instructions.py +10 -26
- holmes/utils/holmes_status.py +4 -3
- holmes/utils/log.py +15 -0
- holmes/utils/markdown_utils.py +2 -3
- holmes/utils/memory_limit.py +58 -0
- holmes/utils/sentry_helper.py +23 -0
- holmes/utils/stream.py +12 -5
- holmes/utils/tags.py +4 -3
- holmes/version.py +3 -1
- {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +12 -10
- holmesgpt-0.18.4.dist-info/RECORD +258 -0
- holmes/plugins/toolsets/aws.yaml +0 -80
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -114
- holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -736
- holmes/plugins/toolsets/grafana/grafana_api.py +0 -64
- holmes/plugins/toolsets/opensearch/__init__.py +0 -0
- holmes/plugins/toolsets/opensearch/opensearch.py +0 -250
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -215
- holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
- holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
- holmes/utils/keygen_utils.py +0 -6
- holmesgpt-0.16.2a0.dist-info/RECORD +0 -258
- holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_ppl_query_docs.jinja2 +0 -0
- holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_query_assist.py +2 -2
- /holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_query_assist_instructions.jinja2 +0 -0
- {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/LICENSE +0 -0
- {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
- {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
|
@@ -2,28 +2,39 @@ import json
|
|
|
2
2
|
import logging
|
|
3
3
|
import os
|
|
4
4
|
import time
|
|
5
|
-
import dateutil.parser
|
|
6
5
|
from typing import Any, Dict, Optional, Tuple, Type, Union
|
|
7
6
|
from urllib.parse import urljoin
|
|
8
7
|
|
|
8
|
+
import dateutil.parser
|
|
9
9
|
import requests # type: ignore
|
|
10
|
-
from
|
|
11
|
-
from requests import RequestException
|
|
10
|
+
from prometrix.auth import PrometheusAuthorization
|
|
12
11
|
from prometrix.connect.aws_connect import AWSPrometheusConnect
|
|
12
|
+
from prometrix.models.prometheus_config import (
|
|
13
|
+
AzurePrometheusConfig as PrometrixAzureConfig,
|
|
14
|
+
)
|
|
13
15
|
from prometrix.models.prometheus_config import PrometheusConfig as BasePrometheusConfig
|
|
16
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
|
17
|
+
from requests import RequestException
|
|
18
|
+
from requests.exceptions import SSLError # type: ignore
|
|
19
|
+
|
|
20
|
+
from holmes.common.env_vars import IS_OPENSHIFT, MAX_GRAPH_POINTS
|
|
21
|
+
from holmes.common.openshift import load_openshift_token
|
|
14
22
|
from holmes.core.tools import (
|
|
15
23
|
CallablePrerequisite,
|
|
16
24
|
StructuredToolResult,
|
|
25
|
+
StructuredToolResultStatus,
|
|
17
26
|
Tool,
|
|
18
27
|
ToolInvokeContext,
|
|
19
28
|
ToolParameter,
|
|
20
|
-
StructuredToolResultStatus,
|
|
21
29
|
Toolset,
|
|
22
30
|
ToolsetTag,
|
|
23
31
|
)
|
|
24
32
|
from holmes.core.tools_utils.token_counting import count_tool_response_tokens
|
|
25
33
|
from holmes.core.tools_utils.tool_context_window_limiter import get_pct_token_count
|
|
26
34
|
from holmes.plugins.toolsets.consts import STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION
|
|
35
|
+
from holmes.plugins.toolsets.logging_utils.logging_api import (
|
|
36
|
+
DEFAULT_GRAPH_TIME_SPAN_SECONDS,
|
|
37
|
+
)
|
|
27
38
|
from holmes.plugins.toolsets.prometheus.utils import parse_duration_to_seconds
|
|
28
39
|
from holmes.plugins.toolsets.service_discovery import PrometheusDiscovery
|
|
29
40
|
from holmes.plugins.toolsets.utils import (
|
|
@@ -33,12 +44,6 @@ from holmes.plugins.toolsets.utils import (
|
|
|
33
44
|
toolset_name_for_one_liner,
|
|
34
45
|
)
|
|
35
46
|
from holmes.utils.cache import TTLCache
|
|
36
|
-
from holmes.common.env_vars import IS_OPENSHIFT, MAX_GRAPH_POINTS
|
|
37
|
-
from holmes.common.openshift import load_openshift_token
|
|
38
|
-
from holmes.plugins.toolsets.logging_utils.logging_api import (
|
|
39
|
-
DEFAULT_GRAPH_TIME_SPAN_SECONDS,
|
|
40
|
-
)
|
|
41
|
-
from holmes.utils.keygen_utils import generate_random_key
|
|
42
47
|
|
|
43
48
|
PROMETHEUS_RULES_CACHE_KEY = "cached_prometheus_rules"
|
|
44
49
|
PROMETHEUS_METADATA_API_LIMIT = 100 # Default limit for Prometheus metadata APIs (series, labels, metadata) to prevent overwhelming responses
|
|
@@ -52,46 +57,57 @@ MAX_METADATA_TIMEOUT_SECONDS = 60
|
|
|
52
57
|
DEFAULT_METADATA_TIME_WINDOW_HRS = 1
|
|
53
58
|
|
|
54
59
|
|
|
60
|
+
def format_ssl_error_message(prometheus_url: str, error: SSLError) -> str:
|
|
61
|
+
"""Format a clear SSL error message with remediation steps."""
|
|
62
|
+
return (
|
|
63
|
+
f"SSL certificate verification failed when connecting to Prometheus at {prometheus_url}. "
|
|
64
|
+
f"Error: {str(error)}. "
|
|
65
|
+
f"To disable SSL verification, set 'verify_ssl: false' in your configuration. "
|
|
66
|
+
f"For Helm deployments, add this to your values.yaml:\n"
|
|
67
|
+
f" toolsets:\n"
|
|
68
|
+
f" prometheus/metrics:\n"
|
|
69
|
+
f" config:\n"
|
|
70
|
+
f" verify_ssl: false"
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
|
|
55
74
|
class PrometheusConfig(BaseModel):
|
|
75
|
+
"""Prometheus toolset configuration.
|
|
76
|
+
|
|
77
|
+
Deprecated config names (still accepted but not in schema):
|
|
78
|
+
- default_metadata_time_window_hrs -> discover_metrics_from_last_hours
|
|
79
|
+
- default_query_timeout_seconds -> query_timeout_seconds_default
|
|
80
|
+
- max_query_timeout_seconds -> query_timeout_seconds_hard_max
|
|
81
|
+
- default_metadata_timeout_seconds -> metadata_timeout_seconds_default
|
|
82
|
+
- max_metadata_timeout_seconds -> metadata_timeout_seconds_hard_max
|
|
83
|
+
- metrics_labels_time_window_hrs -> discover_metrics_from_last_hours
|
|
84
|
+
- prometheus_ssl_enabled -> verify_ssl
|
|
85
|
+
- metrics_labels_cache_duration_hrs (no longer used)
|
|
86
|
+
- fetch_labels_with_labels_api (no longer used)
|
|
87
|
+
- fetch_metadata_with_series_api (no longer used)
|
|
88
|
+
"""
|
|
89
|
+
|
|
90
|
+
model_config = ConfigDict(extra="allow")
|
|
91
|
+
|
|
56
92
|
# URL is optional because it can be set with an env var
|
|
57
|
-
prometheus_url: Optional[str]
|
|
58
|
-
healthcheck: str = "-/healthy"
|
|
93
|
+
prometheus_url: Optional[str] = None
|
|
59
94
|
|
|
60
|
-
#
|
|
61
|
-
|
|
95
|
+
# Discovery API time window - only return metrics with data in the last N hours
|
|
96
|
+
discover_metrics_from_last_hours: int = DEFAULT_METADATA_TIME_WINDOW_HRS
|
|
62
97
|
|
|
63
98
|
# Query timeout configuration
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
)
|
|
67
|
-
max_query_timeout_seconds: int = (
|
|
68
|
-
MAX_QUERY_TIMEOUT_SECONDS # Maximum allowed timeout for PromQL queries
|
|
69
|
-
)
|
|
99
|
+
query_timeout_seconds_default: int = DEFAULT_QUERY_TIMEOUT_SECONDS
|
|
100
|
+
query_timeout_seconds_hard_max: int = MAX_QUERY_TIMEOUT_SECONDS
|
|
70
101
|
|
|
71
102
|
# Metadata API timeout configuration
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
)
|
|
75
|
-
max_metadata_timeout_seconds: int = (
|
|
76
|
-
MAX_METADATA_TIMEOUT_SECONDS # Maximum allowed timeout for metadata APIs
|
|
77
|
-
)
|
|
78
|
-
|
|
79
|
-
# DEPRECATED: These config values are deprecated and will be removed in a future version
|
|
80
|
-
# Using None as default so we can detect if user explicitly set them
|
|
81
|
-
metrics_labels_time_window_hrs: Optional[int] = (
|
|
82
|
-
None # DEPRECATED - use default_metadata_time_window_hrs instead
|
|
83
|
-
)
|
|
84
|
-
metrics_labels_cache_duration_hrs: Optional[int] = (
|
|
85
|
-
None # DEPRECATED - no longer used
|
|
86
|
-
)
|
|
87
|
-
fetch_labels_with_labels_api: Optional[bool] = None # DEPRECATED - no longer used
|
|
88
|
-
fetch_metadata_with_series_api: Optional[bool] = None # DEPRECATED - no longer used
|
|
103
|
+
metadata_timeout_seconds_default: int = DEFAULT_METADATA_TIMEOUT_SECONDS
|
|
104
|
+
metadata_timeout_seconds_hard_max: int = MAX_METADATA_TIMEOUT_SECONDS
|
|
89
105
|
|
|
90
106
|
tool_calls_return_data: bool = True
|
|
91
107
|
headers: Dict = Field(default_factory=dict)
|
|
92
108
|
rules_cache_duration_seconds: Optional[int] = 1800 # 30 minutes
|
|
93
109
|
additional_labels: Optional[Dict[str, str]] = None
|
|
94
|
-
|
|
110
|
+
verify_ssl: bool = True
|
|
95
111
|
|
|
96
112
|
# Custom limit to the max number of tokens that a query result can take to proactively
|
|
97
113
|
# prevent token limit issues. Expressed in % of the model's context window.
|
|
@@ -107,31 +123,52 @@ class PrometheusConfig(BaseModel):
|
|
|
107
123
|
|
|
108
124
|
@model_validator(mode="after")
|
|
109
125
|
def validate_prom_config(self):
|
|
110
|
-
#
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
126
|
+
# Handle deprecated config names passed as extra fields
|
|
127
|
+
# These are accepted via extra="allow" but not defined in schema
|
|
128
|
+
extra = self.model_extra or {}
|
|
129
|
+
deprecated_with_replacement = []
|
|
130
|
+
|
|
131
|
+
# Map of old names -> new names
|
|
132
|
+
deprecated_mappings = {
|
|
133
|
+
"default_metadata_time_window_hrs": "discover_metrics_from_last_hours",
|
|
134
|
+
"default_query_timeout_seconds": "query_timeout_seconds_default",
|
|
135
|
+
"max_query_timeout_seconds": "query_timeout_seconds_hard_max",
|
|
136
|
+
"default_metadata_timeout_seconds": "metadata_timeout_seconds_default",
|
|
137
|
+
"max_metadata_timeout_seconds": "metadata_timeout_seconds_hard_max",
|
|
138
|
+
"metrics_labels_time_window_hrs": "discover_metrics_from_last_hours",
|
|
139
|
+
"prometheus_ssl_enabled": "verify_ssl",
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
for old_name, new_name in deprecated_mappings.items():
|
|
143
|
+
if old_name in extra:
|
|
144
|
+
setattr(self, new_name, extra[old_name])
|
|
145
|
+
deprecated_with_replacement.append(f"{old_name} -> {new_name}")
|
|
146
|
+
|
|
147
|
+
if deprecated_with_replacement:
|
|
148
|
+
logging.warning(
|
|
149
|
+
f"Prometheus config uses deprecated names. Please update: "
|
|
150
|
+
f"{', '.join(deprecated_with_replacement)}"
|
|
115
151
|
)
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
152
|
+
|
|
153
|
+
# Check for deprecated config values that no longer have any effect
|
|
154
|
+
deprecated_no_effect = [
|
|
155
|
+
name
|
|
156
|
+
for name in [
|
|
157
|
+
"metrics_labels_cache_duration_hrs",
|
|
158
|
+
"fetch_labels_with_labels_api",
|
|
159
|
+
"fetch_metadata_with_series_api",
|
|
160
|
+
]
|
|
161
|
+
if name in extra
|
|
162
|
+
]
|
|
163
|
+
|
|
164
|
+
if deprecated_no_effect:
|
|
126
165
|
logging.warning(
|
|
127
|
-
f"
|
|
128
|
-
f"{', '.join(
|
|
166
|
+
f"The following Prometheus config values are deprecated and have no effect: "
|
|
167
|
+
f"{', '.join(deprecated_no_effect)}"
|
|
129
168
|
)
|
|
169
|
+
|
|
130
170
|
# If openshift is enabled, and the user didn't configure auth headers, we will try to load the token from the service account.
|
|
131
171
|
if IS_OPENSHIFT:
|
|
132
|
-
if self.healthcheck == "-/healthy":
|
|
133
|
-
self.healthcheck = "api/v1/query?query=up"
|
|
134
|
-
|
|
135
172
|
if self.headers.get("Authorization"):
|
|
136
173
|
return self
|
|
137
174
|
|
|
@@ -151,8 +188,7 @@ class AMPConfig(PrometheusConfig):
|
|
|
151
188
|
aws_secret_access_key: Optional[str] = None
|
|
152
189
|
aws_region: str
|
|
153
190
|
aws_service_name: str = "aps"
|
|
154
|
-
|
|
155
|
-
prometheus_ssl_enabled: bool = False
|
|
191
|
+
verify_ssl: bool = False
|
|
156
192
|
assume_role_arn: Optional[str] = None
|
|
157
193
|
|
|
158
194
|
# Refresh the AWS client (and its STS creds) every N seconds (default: 15 minutes)
|
|
@@ -176,7 +212,7 @@ class AMPConfig(PrometheusConfig):
|
|
|
176
212
|
try:
|
|
177
213
|
base_config = BasePrometheusConfig(
|
|
178
214
|
url=self.prometheus_url,
|
|
179
|
-
disable_ssl=not self.
|
|
215
|
+
disable_ssl=not self.verify_ssl,
|
|
180
216
|
additional_labels=self.additional_labels,
|
|
181
217
|
)
|
|
182
218
|
self._aws_client = AWSPrometheusConnect(
|
|
@@ -195,12 +231,155 @@ class AMPConfig(PrometheusConfig):
|
|
|
195
231
|
return self._aws_client
|
|
196
232
|
|
|
197
233
|
|
|
234
|
+
class AzurePrometheusConfig(PrometheusConfig):
|
|
235
|
+
azure_resource: Optional[str] = None
|
|
236
|
+
azure_metadata_endpoint: Optional[str] = None
|
|
237
|
+
azure_token_endpoint: Optional[str] = None
|
|
238
|
+
azure_use_managed_id: bool = False
|
|
239
|
+
azure_client_id: Optional[str] = None
|
|
240
|
+
azure_client_secret: Optional[str] = None
|
|
241
|
+
azure_tenant_id: Optional[str] = None
|
|
242
|
+
verify_ssl: bool = True
|
|
243
|
+
|
|
244
|
+
# Refresh the Azure bearer token every N seconds (default: 15 minutes)
|
|
245
|
+
refresh_interval_seconds: int = 900
|
|
246
|
+
|
|
247
|
+
_prometrix_config: Optional[PrometrixAzureConfig] = None
|
|
248
|
+
_token_created_at: float = 0.0
|
|
249
|
+
|
|
250
|
+
@staticmethod
|
|
251
|
+
def _load_from_env_or_default(
|
|
252
|
+
config_value: Optional[str], env_var: str, default: Optional[str] = None
|
|
253
|
+
) -> Optional[str]:
|
|
254
|
+
"""Load value from config, environment variable, or use default."""
|
|
255
|
+
if config_value:
|
|
256
|
+
return config_value
|
|
257
|
+
return os.environ.get(env_var, default)
|
|
258
|
+
|
|
259
|
+
def __init__(self, **data):
|
|
260
|
+
super().__init__(**data)
|
|
261
|
+
# Load from environment variables if not provided in config
|
|
262
|
+
self.azure_client_id = self._load_from_env_or_default(
|
|
263
|
+
self.azure_client_id, "AZURE_CLIENT_ID"
|
|
264
|
+
)
|
|
265
|
+
self.azure_tenant_id = self._load_from_env_or_default(
|
|
266
|
+
self.azure_tenant_id, "AZURE_TENANT_ID"
|
|
267
|
+
)
|
|
268
|
+
self.azure_client_secret = self._load_from_env_or_default(
|
|
269
|
+
self.azure_client_secret, "AZURE_CLIENT_SECRET"
|
|
270
|
+
)
|
|
271
|
+
|
|
272
|
+
# Set defaults from environment if not provided
|
|
273
|
+
self.azure_resource = self._load_from_env_or_default(
|
|
274
|
+
self.azure_resource,
|
|
275
|
+
"AZURE_RESOURCE",
|
|
276
|
+
"https://prometheus.monitor.azure.com",
|
|
277
|
+
)
|
|
278
|
+
# from https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/how-to-use-vm-token
|
|
279
|
+
self.azure_metadata_endpoint = self._load_from_env_or_default(
|
|
280
|
+
self.azure_metadata_endpoint,
|
|
281
|
+
"AZURE_METADATA_ENDPOINT",
|
|
282
|
+
"http://169.254.169.254/metadata/identity/oauth2/token",
|
|
283
|
+
)
|
|
284
|
+
self.azure_token_endpoint = self._load_from_env_or_default(
|
|
285
|
+
self.azure_token_endpoint, "AZURE_TOKEN_ENDPOINT"
|
|
286
|
+
)
|
|
287
|
+
if not self.azure_token_endpoint and self.azure_tenant_id:
|
|
288
|
+
self.azure_token_endpoint = (
|
|
289
|
+
f"https://login.microsoftonline.com/{self.azure_tenant_id}/oauth2/token"
|
|
290
|
+
)
|
|
291
|
+
|
|
292
|
+
# Check if managed identity should be used
|
|
293
|
+
if not self.azure_use_managed_id:
|
|
294
|
+
self.azure_use_managed_id = os.environ.get(
|
|
295
|
+
"AZURE_USE_MANAGED_ID", "false"
|
|
296
|
+
).lower() in ("true", "1")
|
|
297
|
+
|
|
298
|
+
# Convert None to empty string for prometrix compatibility (prometrix checks != "")
|
|
299
|
+
azure_client_id = self.azure_client_id or ""
|
|
300
|
+
azure_tenant_id = self.azure_tenant_id or ""
|
|
301
|
+
azure_client_secret = self.azure_client_secret or ""
|
|
302
|
+
azure_resource = self.azure_resource or ""
|
|
303
|
+
azure_metadata_endpoint = self.azure_metadata_endpoint or ""
|
|
304
|
+
azure_token_endpoint = self.azure_token_endpoint or ""
|
|
305
|
+
|
|
306
|
+
# Create prometrix Azure config
|
|
307
|
+
self._prometrix_config = PrometrixAzureConfig(
|
|
308
|
+
url=self.prometheus_url,
|
|
309
|
+
azure_resource=azure_resource,
|
|
310
|
+
azure_metadata_endpoint=azure_metadata_endpoint,
|
|
311
|
+
azure_token_endpoint=azure_token_endpoint,
|
|
312
|
+
azure_use_managed_id=self.azure_use_managed_id,
|
|
313
|
+
azure_client_id=azure_client_id,
|
|
314
|
+
azure_client_secret=azure_client_secret,
|
|
315
|
+
azure_tenant_id=azure_tenant_id,
|
|
316
|
+
disable_ssl=not self.verify_ssl,
|
|
317
|
+
additional_labels=self.additional_labels,
|
|
318
|
+
)
|
|
319
|
+
# Ensure promtrix gets a real bool (not string) for managed identity
|
|
320
|
+
# fixing internal prometrix config issue
|
|
321
|
+
object.__setattr__(
|
|
322
|
+
self._prometrix_config,
|
|
323
|
+
"azure_use_managed_id",
|
|
324
|
+
bool(self.azure_use_managed_id),
|
|
325
|
+
)
|
|
326
|
+
|
|
327
|
+
PrometheusAuthorization.azure_authorization(self._prometrix_config)
|
|
328
|
+
|
|
329
|
+
@staticmethod
|
|
330
|
+
def is_azure_config(config: dict[str, Any]) -> bool:
|
|
331
|
+
"""Check if config dict or environment variables indicate Azure Prometheus config."""
|
|
332
|
+
# Check for explicit Azure fields in config
|
|
333
|
+
if (
|
|
334
|
+
"azure_client_id" in config
|
|
335
|
+
or "azure_tenant_id" in config
|
|
336
|
+
or "azure_use_managed_id" in config
|
|
337
|
+
):
|
|
338
|
+
return True
|
|
339
|
+
|
|
340
|
+
# Check for Azure environment variables
|
|
341
|
+
if os.environ.get("AZURE_CLIENT_ID") or os.environ.get("AZURE_TENANT_ID"):
|
|
342
|
+
return True
|
|
343
|
+
|
|
344
|
+
return False
|
|
345
|
+
|
|
346
|
+
def is_amp(self) -> bool:
|
|
347
|
+
return False
|
|
348
|
+
|
|
349
|
+
def _should_refresh_token(self) -> bool:
|
|
350
|
+
if not PrometheusAuthorization.bearer_token:
|
|
351
|
+
return True
|
|
352
|
+
return (time.time() - self._token_created_at) >= self.refresh_interval_seconds
|
|
353
|
+
|
|
354
|
+
def request_new_token(self) -> bool:
|
|
355
|
+
"""Request a new Azure access token using prometrix."""
|
|
356
|
+
success = PrometheusAuthorization.request_new_token(self._prometrix_config)
|
|
357
|
+
if success:
|
|
358
|
+
self._token_created_at = time.time()
|
|
359
|
+
return success
|
|
360
|
+
|
|
361
|
+
def get_authorization_headers(self) -> Dict[str, str]:
|
|
362
|
+
# Request new token if needed
|
|
363
|
+
if self._should_refresh_token():
|
|
364
|
+
if not self.request_new_token():
|
|
365
|
+
logging.error("Failed to request new Azure access token")
|
|
366
|
+
return {}
|
|
367
|
+
self._token_created_at = time.time()
|
|
368
|
+
|
|
369
|
+
headers = PrometheusAuthorization.get_authorization_headers(
|
|
370
|
+
self._prometrix_config
|
|
371
|
+
)
|
|
372
|
+
if not headers.get("Authorization"):
|
|
373
|
+
logging.warning("No authorization header generated for Azure Prometheus")
|
|
374
|
+
return headers
|
|
375
|
+
|
|
376
|
+
|
|
198
377
|
class BasePrometheusTool(Tool):
|
|
199
378
|
toolset: "PrometheusToolset"
|
|
200
379
|
|
|
201
380
|
|
|
202
381
|
def do_request(
|
|
203
|
-
config, # PrometheusConfig | AMPConfig
|
|
382
|
+
config, # PrometheusConfig | AMPConfig | AzurePrometheusConfig
|
|
204
383
|
url: str,
|
|
205
384
|
params: Optional[Dict] = None,
|
|
206
385
|
data: Optional[Dict] = None,
|
|
@@ -212,12 +391,13 @@ def do_request(
|
|
|
212
391
|
"""
|
|
213
392
|
Route a request through either:
|
|
214
393
|
- AWSPrometheusConnect (SigV4) when config is AMPConfig
|
|
394
|
+
- Azure bearer token auth when config is AzurePrometheusConfig
|
|
215
395
|
- plain requests otherwise
|
|
216
396
|
|
|
217
397
|
method defaults to GET so callers can omit it for reads.
|
|
218
398
|
"""
|
|
219
399
|
if verify is None:
|
|
220
|
-
verify = config.
|
|
400
|
+
verify = config.verify_ssl
|
|
221
401
|
if headers is None:
|
|
222
402
|
headers = config.headers or {}
|
|
223
403
|
|
|
@@ -234,7 +414,21 @@ def do_request(
|
|
|
234
414
|
headers=headers,
|
|
235
415
|
)
|
|
236
416
|
|
|
237
|
-
|
|
417
|
+
if isinstance(config, AzurePrometheusConfig):
|
|
418
|
+
# Merge Azure authorization headers with provided headers
|
|
419
|
+
azure_headers = config.get_authorization_headers()
|
|
420
|
+
headers = {**azure_headers, **headers}
|
|
421
|
+
return requests.request(
|
|
422
|
+
method=method,
|
|
423
|
+
url=url,
|
|
424
|
+
headers=headers,
|
|
425
|
+
params=params,
|
|
426
|
+
data=data,
|
|
427
|
+
timeout=timeout,
|
|
428
|
+
verify=verify,
|
|
429
|
+
)
|
|
430
|
+
|
|
431
|
+
# Non-AMP, Non-Azure: plain HTTP
|
|
238
432
|
return requests.request(
|
|
239
433
|
method=method,
|
|
240
434
|
url=url,
|
|
@@ -411,7 +605,6 @@ class MetricsBasedResponse(BaseModel):
|
|
|
411
605
|
status: str
|
|
412
606
|
error_message: Optional[str] = None
|
|
413
607
|
data: Optional[str] = None
|
|
414
|
-
random_key: str
|
|
415
608
|
tool_name: str
|
|
416
609
|
description: str
|
|
417
610
|
query: str
|
|
@@ -426,15 +619,22 @@ def create_structured_tool_result(
|
|
|
426
619
|
params: dict, response: MetricsBasedResponse
|
|
427
620
|
) -> StructuredToolResult:
|
|
428
621
|
status = StructuredToolResultStatus.SUCCESS
|
|
622
|
+
error = None
|
|
429
623
|
if response.error_message or response.status.lower() in ("failed", "error"):
|
|
430
624
|
status = StructuredToolResultStatus.ERROR
|
|
625
|
+
error = (
|
|
626
|
+
response.error_message
|
|
627
|
+
if response.error_message
|
|
628
|
+
else "Unknown Prometheus error"
|
|
629
|
+
)
|
|
431
630
|
elif not response.data:
|
|
432
631
|
status = StructuredToolResultStatus.NO_DATA
|
|
433
632
|
|
|
434
633
|
return StructuredToolResult(
|
|
435
634
|
status=status,
|
|
436
|
-
data=response
|
|
635
|
+
data=response,
|
|
437
636
|
params=params,
|
|
637
|
+
error=error,
|
|
438
638
|
)
|
|
439
639
|
|
|
440
640
|
|
|
@@ -484,7 +684,7 @@ class ListPrometheusRules(BasePrometheusTool):
|
|
|
484
684
|
url=rules_url,
|
|
485
685
|
params=params,
|
|
486
686
|
timeout=40,
|
|
487
|
-
verify=self.toolset.config.
|
|
687
|
+
verify=self.toolset.config.verify_ssl,
|
|
488
688
|
headers=self.toolset.config.headers,
|
|
489
689
|
method="GET",
|
|
490
690
|
)
|
|
@@ -505,6 +705,13 @@ class ListPrometheusRules(BasePrometheusTool):
|
|
|
505
705
|
error="Request timed out while fetching rules",
|
|
506
706
|
params=params,
|
|
507
707
|
)
|
|
708
|
+
except SSLError as e:
|
|
709
|
+
logging.warning("SSL error while fetching prometheus rules", exc_info=True)
|
|
710
|
+
return StructuredToolResult(
|
|
711
|
+
status=StructuredToolResultStatus.ERROR,
|
|
712
|
+
error=format_ssl_error_message(self.toolset.config.prometheus_url, e),
|
|
713
|
+
params=params,
|
|
714
|
+
)
|
|
508
715
|
except RequestException as e:
|
|
509
716
|
logging.warning("Failed to fetch prometheus rules", exc_info=True)
|
|
510
717
|
return StructuredToolResult(
|
|
@@ -598,19 +805,19 @@ class GetMetricNames(BasePrometheusTool):
|
|
|
598
805
|
|
|
599
806
|
if params.get("start"):
|
|
600
807
|
query_params["start"] = params["start"]
|
|
601
|
-
elif self.toolset.config.
|
|
808
|
+
elif self.toolset.config.discover_metrics_from_last_hours:
|
|
602
809
|
# Use default time window
|
|
603
810
|
query_params["start"] = str(
|
|
604
811
|
int(time.time())
|
|
605
|
-
- (self.toolset.config.
|
|
812
|
+
- (self.toolset.config.discover_metrics_from_last_hours * 3600)
|
|
606
813
|
)
|
|
607
814
|
|
|
608
815
|
response = do_request(
|
|
609
816
|
config=self.toolset.config,
|
|
610
817
|
url=url,
|
|
611
818
|
params=query_params,
|
|
612
|
-
timeout=self.toolset.config.
|
|
613
|
-
verify=self.toolset.config.
|
|
819
|
+
timeout=self.toolset.config.metadata_timeout_seconds_default,
|
|
820
|
+
verify=self.toolset.config.verify_ssl,
|
|
614
821
|
headers=self.toolset.config.headers,
|
|
615
822
|
method="GET",
|
|
616
823
|
)
|
|
@@ -716,19 +923,19 @@ class GetLabelValues(BasePrometheusTool):
|
|
|
716
923
|
|
|
717
924
|
if params.get("start"):
|
|
718
925
|
query_params["start"] = params["start"]
|
|
719
|
-
elif self.toolset.config.
|
|
926
|
+
elif self.toolset.config.discover_metrics_from_last_hours:
|
|
720
927
|
# Use default time window
|
|
721
928
|
query_params["start"] = str(
|
|
722
929
|
int(time.time())
|
|
723
|
-
- (self.toolset.config.
|
|
930
|
+
- (self.toolset.config.discover_metrics_from_last_hours * 3600)
|
|
724
931
|
)
|
|
725
932
|
|
|
726
933
|
response = do_request(
|
|
727
934
|
config=self.toolset.config,
|
|
728
935
|
url=url,
|
|
729
936
|
params=query_params,
|
|
730
|
-
timeout=self.toolset.config.
|
|
731
|
-
verify=self.toolset.config.
|
|
937
|
+
timeout=self.toolset.config.metadata_timeout_seconds_default,
|
|
938
|
+
verify=self.toolset.config.verify_ssl,
|
|
732
939
|
headers=self.toolset.config.headers,
|
|
733
940
|
method="GET",
|
|
734
941
|
)
|
|
@@ -820,19 +1027,19 @@ class GetAllLabels(BasePrometheusTool):
|
|
|
820
1027
|
|
|
821
1028
|
if params.get("start"):
|
|
822
1029
|
query_params["start"] = params["start"]
|
|
823
|
-
elif self.toolset.config.
|
|
1030
|
+
elif self.toolset.config.discover_metrics_from_last_hours:
|
|
824
1031
|
# Use default time window
|
|
825
1032
|
query_params["start"] = str(
|
|
826
1033
|
int(time.time())
|
|
827
|
-
- (self.toolset.config.
|
|
1034
|
+
- (self.toolset.config.discover_metrics_from_last_hours * 3600)
|
|
828
1035
|
)
|
|
829
1036
|
|
|
830
1037
|
response = do_request(
|
|
831
1038
|
config=self.toolset.config,
|
|
832
1039
|
url=url,
|
|
833
1040
|
params=query_params,
|
|
834
|
-
timeout=self.toolset.config.
|
|
835
|
-
verify=self.toolset.config.
|
|
1041
|
+
timeout=self.toolset.config.metadata_timeout_seconds_default,
|
|
1042
|
+
verify=self.toolset.config.verify_ssl,
|
|
836
1043
|
headers=self.toolset.config.headers,
|
|
837
1044
|
method="GET",
|
|
838
1045
|
)
|
|
@@ -934,19 +1141,19 @@ class GetSeries(BasePrometheusTool):
|
|
|
934
1141
|
|
|
935
1142
|
if params.get("start"):
|
|
936
1143
|
query_params["start"] = params["start"]
|
|
937
|
-
elif self.toolset.config.
|
|
1144
|
+
elif self.toolset.config.discover_metrics_from_last_hours:
|
|
938
1145
|
# Use default time window
|
|
939
1146
|
query_params["start"] = str(
|
|
940
1147
|
int(time.time())
|
|
941
|
-
- (self.toolset.config.
|
|
1148
|
+
- (self.toolset.config.discover_metrics_from_last_hours * 3600)
|
|
942
1149
|
)
|
|
943
1150
|
|
|
944
1151
|
response = do_request(
|
|
945
1152
|
config=self.toolset.config,
|
|
946
1153
|
url=url,
|
|
947
1154
|
params=query_params,
|
|
948
|
-
timeout=self.toolset.config.
|
|
949
|
-
verify=self.toolset.config.
|
|
1155
|
+
timeout=self.toolset.config.metadata_timeout_seconds_default,
|
|
1156
|
+
verify=self.toolset.config.verify_ssl,
|
|
950
1157
|
headers=self.toolset.config.headers,
|
|
951
1158
|
method="GET",
|
|
952
1159
|
)
|
|
@@ -1024,8 +1231,8 @@ class GetMetricMetadata(BasePrometheusTool):
|
|
|
1024
1231
|
config=self.toolset.config,
|
|
1025
1232
|
url=url,
|
|
1026
1233
|
params=query_params,
|
|
1027
|
-
timeout=self.toolset.config.
|
|
1028
|
-
verify=self.toolset.config.
|
|
1234
|
+
timeout=self.toolset.config.metadata_timeout_seconds_default,
|
|
1235
|
+
verify=self.toolset.config.verify_ssl,
|
|
1029
1236
|
headers=self.toolset.config.headers,
|
|
1030
1237
|
method="GET",
|
|
1031
1238
|
)
|
|
@@ -1111,8 +1318,8 @@ class ExecuteInstantQuery(BasePrometheusTool):
|
|
|
1111
1318
|
payload = {"query": query}
|
|
1112
1319
|
|
|
1113
1320
|
# Get timeout parameter and enforce limits
|
|
1114
|
-
default_timeout = self.toolset.config.
|
|
1115
|
-
max_timeout = self.toolset.config.
|
|
1321
|
+
default_timeout = self.toolset.config.query_timeout_seconds_default
|
|
1322
|
+
max_timeout = self.toolset.config.query_timeout_seconds_hard_max
|
|
1116
1323
|
timeout = params.get("timeout", default_timeout)
|
|
1117
1324
|
if timeout > max_timeout:
|
|
1118
1325
|
timeout = max_timeout
|
|
@@ -1128,7 +1335,7 @@ class ExecuteInstantQuery(BasePrometheusTool):
|
|
|
1128
1335
|
headers=self.toolset.config.headers,
|
|
1129
1336
|
data=payload,
|
|
1130
1337
|
timeout=timeout,
|
|
1131
|
-
verify=self.toolset.config.
|
|
1338
|
+
verify=self.toolset.config.verify_ssl,
|
|
1132
1339
|
method="POST",
|
|
1133
1340
|
)
|
|
1134
1341
|
|
|
@@ -1144,7 +1351,6 @@ class ExecuteInstantQuery(BasePrometheusTool):
|
|
|
1144
1351
|
response_data = MetricsBasedResponse(
|
|
1145
1352
|
status=status,
|
|
1146
1353
|
error_message=error_message,
|
|
1147
|
-
random_key=generate_random_key(),
|
|
1148
1354
|
tool_name=self.name,
|
|
1149
1355
|
description=description,
|
|
1150
1356
|
query=query,
|
|
@@ -1158,8 +1364,13 @@ class ExecuteInstantQuery(BasePrometheusTool):
|
|
|
1158
1364
|
structured_tool_result = create_structured_tool_result(
|
|
1159
1365
|
params=params, response=response_data
|
|
1160
1366
|
)
|
|
1367
|
+
tool_call_id = context.tool_call_id
|
|
1368
|
+
tool_name = context.tool_name
|
|
1161
1369
|
token_count = count_tool_response_tokens(
|
|
1162
|
-
llm=context.llm,
|
|
1370
|
+
llm=context.llm,
|
|
1371
|
+
structured_tool_result=structured_tool_result,
|
|
1372
|
+
tool_call_id=tool_call_id,
|
|
1373
|
+
tool_name=tool_name,
|
|
1163
1374
|
)
|
|
1164
1375
|
|
|
1165
1376
|
token_limit = context.max_token_count
|
|
@@ -1223,6 +1434,13 @@ class ExecuteInstantQuery(BasePrometheusTool):
|
|
|
1223
1434
|
params=params,
|
|
1224
1435
|
)
|
|
1225
1436
|
|
|
1437
|
+
except SSLError as e:
|
|
1438
|
+
logging.warning("SSL error while executing Prometheus query", exc_info=True)
|
|
1439
|
+
return StructuredToolResult(
|
|
1440
|
+
status=StructuredToolResultStatus.ERROR,
|
|
1441
|
+
error=format_ssl_error_message(self.toolset.config.prometheus_url, e),
|
|
1442
|
+
params=params,
|
|
1443
|
+
)
|
|
1226
1444
|
except RequestException as e:
|
|
1227
1445
|
logging.info("Failed to connect to Prometheus", exc_info=True)
|
|
1228
1446
|
return StructuredToolResult(
|
|
@@ -1349,8 +1567,8 @@ class ExecuteRangeQuery(BasePrometheusTool):
|
|
|
1349
1567
|
}
|
|
1350
1568
|
|
|
1351
1569
|
# Get timeout parameter and enforce limits
|
|
1352
|
-
default_timeout = self.toolset.config.
|
|
1353
|
-
max_timeout = self.toolset.config.
|
|
1570
|
+
default_timeout = self.toolset.config.query_timeout_seconds_default
|
|
1571
|
+
max_timeout = self.toolset.config.query_timeout_seconds_hard_max
|
|
1354
1572
|
timeout = params.get("timeout", default_timeout)
|
|
1355
1573
|
if timeout > max_timeout:
|
|
1356
1574
|
timeout = max_timeout
|
|
@@ -1366,7 +1584,7 @@ class ExecuteRangeQuery(BasePrometheusTool):
|
|
|
1366
1584
|
headers=self.toolset.config.headers,
|
|
1367
1585
|
data=payload,
|
|
1368
1586
|
timeout=timeout,
|
|
1369
|
-
verify=self.toolset.config.
|
|
1587
|
+
verify=self.toolset.config.verify_ssl,
|
|
1370
1588
|
method="POST",
|
|
1371
1589
|
)
|
|
1372
1590
|
|
|
@@ -1382,7 +1600,6 @@ class ExecuteRangeQuery(BasePrometheusTool):
|
|
|
1382
1600
|
response_data = MetricsBasedResponse(
|
|
1383
1601
|
status=status,
|
|
1384
1602
|
error_message=error_message,
|
|
1385
|
-
random_key=generate_random_key(),
|
|
1386
1603
|
tool_name=self.name,
|
|
1387
1604
|
description=description,
|
|
1388
1605
|
query=query,
|
|
@@ -1402,8 +1619,13 @@ class ExecuteRangeQuery(BasePrometheusTool):
|
|
|
1402
1619
|
params=params, response=response_data
|
|
1403
1620
|
)
|
|
1404
1621
|
|
|
1622
|
+
tool_call_id = context.tool_call_id
|
|
1623
|
+
tool_name = context.tool_name
|
|
1405
1624
|
token_count = count_tool_response_tokens(
|
|
1406
|
-
llm=context.llm,
|
|
1625
|
+
llm=context.llm,
|
|
1626
|
+
structured_tool_result=structured_tool_result,
|
|
1627
|
+
tool_call_id=tool_call_id,
|
|
1628
|
+
tool_name=tool_name,
|
|
1407
1629
|
)
|
|
1408
1630
|
|
|
1409
1631
|
token_limit = context.max_token_count
|
|
@@ -1463,6 +1685,15 @@ class ExecuteRangeQuery(BasePrometheusTool):
|
|
|
1463
1685
|
params=params,
|
|
1464
1686
|
)
|
|
1465
1687
|
|
|
1688
|
+
except SSLError as e:
|
|
1689
|
+
logging.warning(
|
|
1690
|
+
"SSL error while executing Prometheus range query", exc_info=True
|
|
1691
|
+
)
|
|
1692
|
+
return StructuredToolResult(
|
|
1693
|
+
status=StructuredToolResultStatus.ERROR,
|
|
1694
|
+
error=format_ssl_error_message(self.toolset.config.prometheus_url, e),
|
|
1695
|
+
params=params,
|
|
1696
|
+
)
|
|
1466
1697
|
except RequestException as e:
|
|
1467
1698
|
logging.info("Failed to connect to Prometheus", exc_info=True)
|
|
1468
1699
|
return StructuredToolResult(
|
|
@@ -1484,7 +1715,7 @@ class ExecuteRangeQuery(BasePrometheusTool):
|
|
|
1484
1715
|
|
|
1485
1716
|
|
|
1486
1717
|
class PrometheusToolset(Toolset):
|
|
1487
|
-
config: Optional[Union[PrometheusConfig, AMPConfig]] = None
|
|
1718
|
+
config: Optional[Union[PrometheusConfig, AMPConfig, AzurePrometheusConfig]] = None
|
|
1488
1719
|
|
|
1489
1720
|
def __init__(self):
|
|
1490
1721
|
super().__init__(
|
|
@@ -1517,16 +1748,36 @@ class PrometheusToolset(Toolset):
|
|
|
1517
1748
|
|
|
1518
1749
|
def determine_prometheus_class(
|
|
1519
1750
|
self, config: dict[str, Any]
|
|
1520
|
-
) -> Type[Union[PrometheusConfig, AMPConfig]]:
|
|
1751
|
+
) -> Type[Union[PrometheusConfig, AMPConfig, AzurePrometheusConfig]]:
|
|
1521
1752
|
has_aws_fields = "aws_region" in config
|
|
1522
|
-
|
|
1753
|
+
if has_aws_fields:
|
|
1754
|
+
return AMPConfig
|
|
1755
|
+
|
|
1756
|
+
# Check for Azure config using static method
|
|
1757
|
+
is_azure = AzurePrometheusConfig.is_azure_config(config)
|
|
1758
|
+
if is_azure:
|
|
1759
|
+
logging.info("Detected Azure Managed Prometheus configuration")
|
|
1760
|
+
return AzurePrometheusConfig if is_azure else PrometheusConfig
|
|
1761
|
+
|
|
1762
|
+
def _disable_azure_incompatible_tools(self):
|
|
1763
|
+
"""
|
|
1764
|
+
Azure Managed Prometheus does not support some APIs.
|
|
1765
|
+
Remove unsupported tools.
|
|
1766
|
+
"""
|
|
1767
|
+
incompatible = {
|
|
1768
|
+
"get_label_values",
|
|
1769
|
+
"get_metric_metadata",
|
|
1770
|
+
"list_prometheus_rules",
|
|
1771
|
+
}
|
|
1772
|
+
self.tools = [t for t in self.tools if t.name not in incompatible]
|
|
1523
1773
|
|
|
1524
1774
|
def prerequisites_callable(self, config: dict[str, Any]) -> Tuple[bool, str]:
|
|
1525
1775
|
try:
|
|
1526
1776
|
if config:
|
|
1527
1777
|
config_cls = self.determine_prometheus_class(config)
|
|
1528
1778
|
self.config = config_cls(**config) # type: ignore
|
|
1529
|
-
|
|
1779
|
+
if isinstance(self.config, AzurePrometheusConfig):
|
|
1780
|
+
self._disable_azure_incompatible_tools()
|
|
1530
1781
|
self._reload_llm_instructions()
|
|
1531
1782
|
return self._is_healthy()
|
|
1532
1783
|
except Exception:
|
|
@@ -1571,14 +1822,14 @@ class PrometheusToolset(Toolset):
|
|
|
1571
1822
|
f"Toolset {self.name} failed to initialize because prometheus is not configured correctly",
|
|
1572
1823
|
)
|
|
1573
1824
|
|
|
1574
|
-
url = urljoin(self.config.prometheus_url,
|
|
1825
|
+
url = urljoin(self.config.prometheus_url, "api/v1/query?query=up")
|
|
1575
1826
|
try:
|
|
1576
1827
|
response = do_request(
|
|
1577
1828
|
config=self.config,
|
|
1578
1829
|
url=url,
|
|
1579
1830
|
headers=self.config.headers,
|
|
1580
1831
|
timeout=10,
|
|
1581
|
-
verify=self.config.
|
|
1832
|
+
verify=self.config.verify_ssl,
|
|
1582
1833
|
method="GET",
|
|
1583
1834
|
)
|
|
1584
1835
|
|
|
@@ -1599,6 +1850,11 @@ class PrometheusToolset(Toolset):
|
|
|
1599
1850
|
|
|
1600
1851
|
def get_example_config(self):
|
|
1601
1852
|
example_config = PrometheusConfig(
|
|
1602
|
-
prometheus_url="http://
|
|
1853
|
+
prometheus_url="http://prometheus-server.monitoring.svc.cluster.local:9090",
|
|
1854
|
+
headers={"Authorization": "Basic <base64_encoded_credentials>"},
|
|
1855
|
+
discover_metrics_from_last_hours=1,
|
|
1856
|
+
query_timeout_seconds_default=20,
|
|
1857
|
+
query_timeout_seconds_hard_max=180,
|
|
1858
|
+
verify_ssl=True,
|
|
1603
1859
|
)
|
|
1604
1860
|
return example_config.model_dump()
|