holmesgpt 0.13.2__py3-none-any.whl → 0.16.2a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- holmes/__init__.py +1 -1
- holmes/clients/robusta_client.py +17 -4
- holmes/common/env_vars.py +40 -1
- holmes/config.py +114 -144
- holmes/core/conversations.py +53 -14
- holmes/core/feedback.py +191 -0
- holmes/core/investigation.py +18 -22
- holmes/core/llm.py +489 -88
- holmes/core/models.py +103 -1
- holmes/core/openai_formatting.py +13 -0
- holmes/core/prompt.py +1 -1
- holmes/core/safeguards.py +4 -4
- holmes/core/supabase_dal.py +293 -100
- holmes/core/tool_calling_llm.py +423 -323
- holmes/core/tools.py +311 -33
- holmes/core/tools_utils/token_counting.py +14 -0
- holmes/core/tools_utils/tool_context_window_limiter.py +57 -0
- holmes/core/tools_utils/tool_executor.py +13 -8
- holmes/core/toolset_manager.py +155 -4
- holmes/core/tracing.py +6 -1
- holmes/core/transformers/__init__.py +23 -0
- holmes/core/transformers/base.py +62 -0
- holmes/core/transformers/llm_summarize.py +174 -0
- holmes/core/transformers/registry.py +122 -0
- holmes/core/transformers/transformer.py +31 -0
- holmes/core/truncation/compaction.py +59 -0
- holmes/core/truncation/dal_truncation_utils.py +23 -0
- holmes/core/truncation/input_context_window_limiter.py +218 -0
- holmes/interactive.py +177 -24
- holmes/main.py +7 -4
- holmes/plugins/prompts/_fetch_logs.jinja2 +26 -1
- holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
- holmes/plugins/prompts/_runbook_instructions.jinja2 +23 -12
- holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
- holmes/plugins/prompts/generic_ask.jinja2 +2 -4
- holmes/plugins/prompts/generic_ask_conversation.jinja2 +2 -1
- holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +2 -1
- holmes/plugins/prompts/generic_investigation.jinja2 +2 -1
- holmes/plugins/prompts/investigation_procedure.jinja2 +48 -0
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -1
- holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +2 -1
- holmes/plugins/runbooks/__init__.py +117 -18
- holmes/plugins/runbooks/catalog.json +2 -0
- holmes/plugins/toolsets/__init__.py +21 -8
- holmes/plugins/toolsets/aks-node-health.yaml +46 -0
- holmes/plugins/toolsets/aks.yaml +64 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +26 -36
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +0 -1
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +10 -7
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +8 -6
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +8 -6
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +9 -7
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +9 -6
- holmes/plugins/toolsets/bash/bash_toolset.py +10 -13
- holmes/plugins/toolsets/bash/common/bash.py +7 -7
- holmes/plugins/toolsets/cilium.yaml +284 -0
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
- holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
- holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +349 -216
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +101 -44
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +13 -16
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +25 -31
- holmes/plugins/toolsets/git.py +51 -46
- holmes/plugins/toolsets/grafana/common.py +15 -3
- holmes/plugins/toolsets/grafana/grafana_api.py +46 -24
- holmes/plugins/toolsets/grafana/grafana_tempo_api.py +454 -0
- holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +9 -0
- holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +117 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +211 -91
- holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +27 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +653 -293
- holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
- holmes/plugins/toolsets/internet/internet.py +6 -7
- holmes/plugins/toolsets/internet/notion.py +5 -6
- holmes/plugins/toolsets/investigator/core_investigation.py +42 -34
- holmes/plugins/toolsets/kafka.py +25 -36
- holmes/plugins/toolsets/kubernetes.yaml +58 -84
- holmes/plugins/toolsets/kubernetes_logs.py +6 -6
- holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
- holmes/plugins/toolsets/logging_utils/logging_api.py +80 -4
- holmes/plugins/toolsets/mcp/toolset_mcp.py +181 -55
- holmes/plugins/toolsets/newrelic/__init__.py +0 -0
- holmes/plugins/toolsets/newrelic/new_relic_api.py +125 -0
- holmes/plugins/toolsets/newrelic/newrelic.jinja2 +41 -0
- holmes/plugins/toolsets/newrelic/newrelic.py +163 -0
- holmes/plugins/toolsets/opensearch/opensearch.py +10 -17
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
- holmes/plugins/toolsets/opensearch/opensearch_ppl_query_docs.jinja2 +1616 -0
- holmes/plugins/toolsets/opensearch/opensearch_query_assist.py +78 -0
- holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 +223 -0
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +13 -16
- holmes/plugins/toolsets/openshift.yaml +283 -0
- holmes/plugins/toolsets/prometheus/prometheus.py +915 -390
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +43 -2
- holmes/plugins/toolsets/prometheus/utils.py +28 -0
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +9 -10
- holmes/plugins/toolsets/robusta/robusta.py +236 -65
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +137 -26
- holmes/plugins/toolsets/service_discovery.py +1 -1
- holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
- holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
- holmes/plugins/toolsets/utils.py +88 -0
- holmes/utils/config_utils.py +91 -0
- holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
- holmes/utils/env.py +7 -0
- holmes/utils/global_instructions.py +75 -10
- holmes/utils/holmes_status.py +2 -1
- holmes/utils/holmes_sync_toolsets.py +0 -2
- holmes/utils/krr_utils.py +188 -0
- holmes/utils/sentry_helper.py +41 -0
- holmes/utils/stream.py +61 -7
- holmes/version.py +34 -14
- holmesgpt-0.16.2a0.dist-info/LICENSE +178 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/METADATA +29 -27
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/RECORD +126 -102
- holmes/core/performance_timing.py +0 -72
- holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
- holmes/plugins/toolsets/newrelic.py +0 -231
- holmes/plugins/toolsets/servicenow/install.md +0 -37
- holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
- holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
- holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/WHEEL +0 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/entry_points.txt +0 -0

holmes/plugins/toolsets/kubernetes_logs.py:

```diff
@@ -10,7 +10,7 @@ from holmes.common.env_vars import KUBERNETES_LOGS_TIMEOUT_SECONDS
 from holmes.core.tools import (
     StaticPrerequisite,
     StructuredToolResult,
-
+    StructuredToolResultStatus,
     ToolsetTag,
 )
 from holmes.plugins.toolsets.logging_utils.logging_api import (
@@ -140,7 +140,7 @@ class KubernetesLogsToolset(BasePodLoggingToolset):
         # Ensure both results are not None (they should always be set by the loop)
         if current_logs_result is None or previous_logs_result is None:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Internal error: Failed to fetch logs",
                 params=params.model_dump(),
             )
@@ -162,7 +162,7 @@ class KubernetesLogsToolset(BasePodLoggingToolset):
         ):
             # Both commands failed - return error from current logs
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=current_logs_result.error,
                 params=params.model_dump(),
                 return_code=return_code,
@@ -206,7 +206,7 @@ class KubernetesLogsToolset(BasePodLoggingToolset):
         if len(filtered_logs) == 0:
             # Return NO_DATA status when there are no logs
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.NO_DATA,
                 data="\n".join(
                     metadata_lines
                 ),  # Still include metadata for context
@@ -218,7 +218,7 @@ class KubernetesLogsToolset(BasePodLoggingToolset):
         response_data = formatted_logs + "\n" + "\n".join(metadata_lines)
 
         return StructuredToolResult(
-            status=
+            status=StructuredToolResultStatus.SUCCESS,
            data=response_data,
             params=params.model_dump(),
             return_code=return_code,
@@ -226,7 +226,7 @@ class KubernetesLogsToolset(BasePodLoggingToolset):
         except Exception as e:
             logging.exception(f"Error fetching logs for pod {params.pod_name}")
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Error fetching logs: {str(e)}",
                 params=params.model_dump(),
             )
```
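Each hunk above makes the same mechanical change: every return path now names an explicit `StructuredToolResultStatus` member. A minimal sketch of the result shape these tools build (field names come from the hunks above; the values are purely illustrative):

```python
# Illustrative only: field names are taken from the hunks above, values are made up.
from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus

result = StructuredToolResult(
    status=StructuredToolResultStatus.NO_DATA,  # ERROR / NO_DATA / SUCCESS as above
    data="pod=my-pod namespace=default: no log lines matched",  # metadata kept for context
    params={"pod_name": "my-pod", "namespace": "default"},
)
```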
holmes/plugins/toolsets/kubernetes_logs.yaml:

```diff
@@ -8,6 +8,10 @@ toolsets:
     prerequisites:
       - command: "kubectl version --client"
 
+    # Note: Log tools use transformers with llm_summarize to automatically
+    # summarize large log outputs when a fast model is configured. This helps
+    # focus on errors, patterns, and key information while reducing context usage.
+
     tools:
       - name: "kubectl_previous_logs"
         description: "Run `kubectl logs --previous` on a single Kubernetes pod. Used to fetch logs for a pod that crashed and see logs from before the crash. Never give a deployment name or a resource that is not a pod."
@@ -24,10 +28,38 @@ toolsets:
       - name: "kubectl_logs"
         description: "Run `kubectl logs` on a single Kubernetes pod. Never give a deployment name or a resource that is not a pod."
         command: "kubectl logs {{pod_name}} -n {{ namespace }}"
+        transformers:
+          - name: llm_summarize
+            config:
+              input_threshold: 1000
+              prompt: |
+                Summarize these pod logs focusing on:
+                - Errors, exceptions, and warning messages
+                - Recent activity patterns and trends
+                - Any authentication, connection, or startup issues
+                - Performance indicators (response times, throughput)
+                - Group similar log entries together
+                - When possible, mention exact error codes or keywords for easier searching
+                - Be concise: aim for ≤ 50% of the original text; prioritize aggregates and actionable outliers
+                - Include grep-ready keys/values; avoid repeating entire logs or unchanged defaults
 
       - name: "kubectl_logs_all_containers"
         description: "Run `kubectl logs` on all containers within a single Kubernetes pod."
         command: "kubectl logs {{pod_name}} -n {{ namespace }} --all-containers"
+        transformers:
+          - name: llm_summarize
+            config:
+              input_threshold: 1000
+              prompt: |
+                Summarize these multi-container pod logs focusing on:
+                - Errors, exceptions, and warning messages by container
+                - Inter-container communication patterns
+                - Any authentication, connection, or startup issues
+                - Performance indicators and resource usage patterns
+                - Group similar log entries together by container
+                - When possible, mention exact error codes or keywords for easier searching
+                - Strive for ≤ 50% of the original size; keep results compact and grep-friendly (one line per aggregate)
+                - Prioritize aggregates and actionable outliers over comprehensive details
 
       - name: "kubectl_container_logs"
         description: "Run `kubectl logs` on a single container within a Kubernetes pod. This is to get the logs of a specific container in a multi-container pod."
```
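The `transformers` blocks attach an `llm_summarize` step to each log tool. The diff shown here doesn't include the transformer implementation (it lands in `holmes/core/transformers/llm_summarize.py`, per the file list), but the config implies a simple gate: below `input_threshold` the output passes through, above it the configured prompt is sent to the summarizing model. A rough sketch of that behavior, assuming the threshold applies to the raw output length; `summarize` here is a stand-in callable, not the package's actual API:

```python
# Rough sketch, not the package's actual transformer API.
# Assumes input_threshold is compared against the raw tool output length,
# and that `summarize` wraps the configured fast model.
from typing import Callable

def apply_llm_summarize(
    tool_output: str,
    input_threshold: int,
    prompt: str,
    summarize: Callable[[str], str],
) -> str:
    if len(tool_output) < input_threshold:
        return tool_output  # small outputs pass through untouched
    return summarize(f"{prompt}\n\n{tool_output}")
```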
holmes/plugins/toolsets/logging_utils/logging_api.py:

```diff
@@ -1,27 +1,36 @@
 from abc import ABC, abstractmethod
 from datetime import datetime, timedelta
 import logging
+from math import ceil
 from typing import Optional, Set
 from enum import Enum
 
 from pydantic import BaseModel, field_validator
 from datetime import timezone
+from holmes.core.llm import LLM
 from holmes.core.tools import (
     StructuredToolResult,
     Tool,
+    ToolInvokeContext,
     ToolParameter,
     Toolset,
 )
+from holmes.core.tools_utils.token_counting import count_tool_response_tokens
 from holmes.plugins.toolsets.utils import get_param_or_raise
 
 # Default values for log fetching
 DEFAULT_LOG_LIMIT = 100
 SECONDS_PER_DAY = 24 * 60 * 60
 DEFAULT_TIME_SPAN_SECONDS = 7 * SECONDS_PER_DAY  # 1 week in seconds
-DEFAULT_GRAPH_TIME_SPAN_SECONDS = 1 *
+DEFAULT_GRAPH_TIME_SPAN_SECONDS = 1 * 60 * 60  # 1 hour in seconds
 
 POD_LOGGING_TOOL_NAME = "fetch_pod_logs"
 
+TRUNCATION_PROMPT_PREFIX = "[... PREVIOUS LOGS ABOVE THIS LINE HAVE BEEN TRUNCATED]"
+MIN_NUMBER_OF_CHARACTERS_TO_TRUNCATE: int = (
+    50 + len(TRUNCATION_PROMPT_PREFIX)
+)  # prevents the truncation algorithm from going too slow once the actual token count gets close to the expected limit
+
 
 class LoggingCapability(str, Enum):
     """Optional advanced logging capabilities"""
```
```diff
@@ -74,6 +83,68 @@ class BasePodLoggingToolset(Toolset, ABC):
         return ""
 
 
+def truncate_logs(
+    logging_structured_tool_result: StructuredToolResult,
+    llm: LLM,
+    token_limit: int,
+    structured_params: FetchPodLogsParams,
+):
+    original_token_count = count_tool_response_tokens(
+        llm=llm, structured_tool_result=logging_structured_tool_result
+    )
+    token_count = original_token_count
+    text = None
+    while token_count > token_limit:
+        # Loop because we are counting tokens but trimming characters. This means we try to trim a number of
+        # characters proportional to the number of tokens but we may still have too many tokens
+        if not text:
+            text = logging_structured_tool_result.get_stringified_data()
+        if not text:
+            # Weird scenario where the result exceeds the token allowance but there is not data.
+            # Exit and do nothing because I don't know how to handle such scenario.
+            logging.warning(
+                f"The calculated token count for logs is {token_count} but the limit is {token_limit}. However the data field is empty so there are no logs to truncate."
+            )
+            return
+        ratio = token_count / token_limit
+        character_count = len(text)
+        number_of_characters_to_truncate = character_count - ceil(
+            character_count / ratio
+        )
+        number_of_characters_to_truncate = max(
+            MIN_NUMBER_OF_CHARACTERS_TO_TRUNCATE, number_of_characters_to_truncate
+        )
+
+        if len(text) <= number_of_characters_to_truncate:
+            logging.warning(
+                f"The calculated token count for logs is {token_count} (max allowed tokens={token_limit}) but the logs are only {len(text)} characters which is below the intended truncation of {number_of_characters_to_truncate} characters. Logs will no longer be truncated"
+            )
+            return
+        else:
+            linefeed_truncation_offset = max(
+                text[number_of_characters_to_truncate:].find("\n"), 0
+            )  # keep log lines atomic
+
+            # Tentatively add the truncation prefix.
+            # When counting tokens, we want to include the TRUNCATION_PROMPT_PREFIX because it will be part of the tool response.
+            # Because we're truncating based on character counts but ultimately checking tokens count,
+            # it is possible that the character truncation is incorrect and more need to be truncated.
+            # This will be caught in the next iteration and the truncation prefix will be truncated
+            # because MIN_NUMBER_OF_CHARACTERS_TO_TRUNCATE cannot be smaller than TRUNCATION_PROMPT_PREFIX
+            text = (
+                TRUNCATION_PROMPT_PREFIX
+                + text[number_of_characters_to_truncate + linefeed_truncation_offset :]
+            )
+            logging_structured_tool_result.data = text
+            token_count = count_tool_response_tokens(
+                llm=llm, structured_tool_result=logging_structured_tool_result
+            )
+    if token_count < original_token_count:
+        logging.info(
+            f"Logs for pod {structured_params.pod_name}/{structured_params.namespace} have been truncated from {original_token_count} tokens down to {token_count} tokens."
+        )
+
+
 class PodLoggingTool(Tool):
     """Common tool for fetching pod logs across different logging backends"""
```
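`truncate_logs` counts tokens but trims characters, so each pass removes a character count proportional to how far over budget the token count is, then re-measures. One iteration of that arithmetic with toy numbers:

```python
from math import ceil

# One pass of the proportional trim in truncate_logs, with toy numbers.
token_count, token_limit = 2000, 500  # 4x over the allowance
character_count = 8000                # length of the stringified logs
ratio = token_count / token_limit     # 4.0
number_of_characters_to_truncate = character_count - ceil(character_count / ratio)
# 8000 - 2000 = 6000: drop the oldest ~6000 characters (rounded forward to the
# next newline so log lines stay atomic), prepend TRUNCATION_PROMPT_PREFIX,
# then recount tokens; the while loop repeats if the estimate fell short.
print(number_of_characters_to_truncate)  # 6000
```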
```diff
@@ -175,9 +246,7 @@ If you hit the log limit and see lots of repetitive INFO logs, use exclude_filte
 
         return params
 
-    def _invoke(
-        self, params: dict, user_approved: bool = False
-    ) -> StructuredToolResult:
+    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
         structured_params = FetchPodLogsParams(
             namespace=get_param_or_raise(params, "namespace"),
             pod_name=get_param_or_raise(params, "pod_name"),
@@ -192,6 +261,13 @@ If you hit the log limit and see lots of repetitive INFO logs, use exclude_filte
             params=structured_params,
         )
 
+        truncate_logs(
+            logging_structured_tool_result=result,
+            llm=context.llm,
+            token_limit=context.max_token_count,
+            structured_params=structured_params,
+        )
+
         return result
 
     def get_parameterized_one_liner(self, params: dict) -> str:
```
holmes/plugins/toolsets/mcp/toolset_mcp.py:

```diff
@@ -1,71 +1,139 @@
+import json
+
+from holmes.common.env_vars import SSE_READ_TIMEOUT
 from holmes.core.tools import (
+    ToolInvokeContext,
     Toolset,
     Tool,
     ToolParameter,
     StructuredToolResult,
-
+    StructuredToolResultStatus,
     CallablePrerequisite,
 )
 
 from typing import Dict, Any, List, Optional
 from mcp.client.session import ClientSession
 from mcp.client.sse import sse_client
+from mcp.client.streamable_http import streamablehttp_client
 
 from mcp.types import Tool as MCP_Tool
-from mcp.types import CallToolResult
 
 import asyncio
-from
+from contextlib import asynccontextmanager
+from pydantic import BaseModel, Field, AnyUrl, model_validator
 from typing import Tuple
 import logging
+from enum import Enum
+import threading
 
+# Lock per MCP server URL to serialize calls to the same server
+_server_locks: Dict[str, threading.Lock] = {}
+_locks_lock = threading.Lock()
 
-
-
+
+def get_server_lock(url: str) -> threading.Lock:
+    """Get or create a lock for a specific MCP server URL."""
+    with _locks_lock:
+        if url not in _server_locks:
+            _server_locks[url] = threading.Lock()
+        return _server_locks[url]
+
+
+class MCPMode(str, Enum):
+    SSE = "sse"
+    STREAMABLE_HTTP = "streamable-http"
+
+
+class MCPConfig(BaseModel):
+    url: AnyUrl
+    mode: MCPMode = MCPMode.SSE
     headers: Optional[Dict[str, str]] = None
 
-
-
-
+
+@asynccontextmanager
+async def get_initialized_mcp_session(
+    url: str, headers: Optional[Dict[str, str]], mode: MCPMode
+):
+    if mode == MCPMode.SSE:
+        async with sse_client(
+            url, headers=headers, sse_read_timeout=SSE_READ_TIMEOUT
+        ) as (
+            read_stream,
+            write_stream,
+        ):
+            async with ClientSession(read_stream, write_stream) as session:
+                _ = await session.initialize()
+                yield session
+    else:
+        async with streamablehttp_client(
+            url, headers=headers, sse_read_timeout=SSE_READ_TIMEOUT
+        ) as (
+            read_stream,
+            write_stream,
+            _,
+        ):
+            async with ClientSession(read_stream, write_stream) as session:
+                _ = await session.initialize()
+                yield session
+
+
+class RemoteMCPTool(Tool):
+    toolset: "RemoteMCPToolset" = Field(exclude=True)
+
+    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
         try:
-
+            # Serialize calls to the same MCP server to prevent SSE conflicts
+            # Different servers can still run in parallel
+            if not self.toolset._mcp_config:
+                raise ValueError("MCP config not initialized")
+            lock = get_server_lock(str(self.toolset._mcp_config.url))
+            with lock:
+                return asyncio.run(self._invoke_async(params))
         except Exception as e:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=str(e.args),
                 params=params,
                 invocation=f"MCPtool {self.name} with params {params}",
             )
 
+    @staticmethod
+    def _is_content_error(content: str) -> bool:
+        try:  # aws mcp sometimes returns an error in content - status code != 200
+            json_content: dict = json.loads(content)
+            status_code = json_content.get("response", {}).get("status_code", 200)
+            return status_code >= 300
+        except Exception:
+            return False
+
     async def _invoke_async(self, params: Dict) -> StructuredToolResult:
-        async with
-
-
-
-
-
-        )
-
-
-
-
-
-            data=merged_text,
-            params=params,
-            invocation=f"MCPtool {self.name} with params {params}",
-        )
+        async with self.toolset.get_initialized_session() as session:
+            tool_result = await session.call_tool(self.name, params)
+
+        merged_text = " ".join(c.text for c in tool_result.content if c.type == "text")
+        return StructuredToolResult(
+            status=(
+                StructuredToolResultStatus.ERROR
+                if (tool_result.isError or self._is_content_error(merged_text))
+                else StructuredToolResultStatus.SUCCESS
+            ),
+            data=merged_text,
+            params=params,
+            invocation=f"MCPtool {self.name} with params {params}",
+        )
 
     @classmethod
-    def create(
+    def create(
+        cls,
+        tool: MCP_Tool,
+        toolset: "RemoteMCPToolset",
+    ):
         parameters = cls.parse_input_schema(tool.inputSchema)
         return cls(
-            url=url,
             name=tool.name,
             description=tool.description or "",
             parameters=parameters,
-
+            toolset=toolset,
         )
 
     @classmethod
```
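`get_server_lock` hands back one shared lock per server URL, so concurrent tool calls against the same MCP server serialize while calls to different servers still run in parallel. A quick check of that property (a sketch, assuming the helper is importable from the toolset module):

```python
# Same URL -> same lock object; different URL -> independent lock.
from holmes.plugins.toolsets.mcp.toolset_mcp import get_server_lock

lock_a1 = get_server_lock("http://mcp-a:8000/sse")
lock_a2 = get_server_lock("http://mcp-a:8000/sse")
lock_b = get_server_lock("http://mcp-b:8000/sse")
assert lock_a1 is lock_a2
assert lock_a1 is not lock_b
```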
```diff
@@ -85,53 +153,111 @@ class RemoteMCPTool(Tool):
         return parameters
 
     def get_parameterized_one_liner(self, params: Dict) -> str:
-
+        if params:
+            if params.get("cli_command"):  # Return AWS MCP cli command, if available
+                return f"{params.get('cli_command')}"
+
+        url = (
+            str(self.toolset._mcp_config.url) if self.toolset._mcp_config else "unknown"
+        )
+        return f"Call MCP Server ({url} - {self.name})"
 
 
 class RemoteMCPToolset(Toolset):
-    url: AnyUrl
     tools: List[RemoteMCPTool] = Field(default_factory=list)  # type: ignore
     icon_url: str = "https://registry.npmmirror.com/@lobehub/icons-static-png/1.46.0/files/light/mcp.png"
+    _mcp_config: Optional[MCPConfig] = None
 
     def model_post_init(self, __context: Any) -> None:
-        self.prerequisites = [
+        self.prerequisites = [
+            CallablePrerequisite(callable=self.prerequisites_callable)
+        ]
+
+    @model_validator(mode="before")
+    @classmethod
+    def migrate_url_to_config(cls, values: dict[str, Any]) -> dict[str, Any]:
+        """
+        Migrates url from field parameter to config object.
+        If url is passed as a parameter, it's moved to config (or config is created if it doesn't exist).
+        """
+        if not isinstance(values, dict) or "url" not in values:
+            return values
 
-
-
+        url_value = values.pop("url")
+        if url_value is None:
+            return values
 
-
-
-
-
-        return v
+        config = values.get("config")
+        if config is None:
+            config = {}
+            values["config"] = config
 
-
-
+        toolset_name = values.get("name", "unknown")
+        if "url" in config:
+            logging.warning(
+                f"Toolset {toolset_name}: has two urls defined, remove the 'url' field from the toolset configuration and keep the 'url' in the config section."
+            )
+            return values
+
+        logging.warning(
+            f"Toolset {toolset_name}: 'url' field has been migrated to config. "
+            "Please move 'url' to the config section."
+        )
+        config["url"] = url_value
+        return values
+
+    def prerequisites_callable(self, config) -> Tuple[bool, str]:
         try:
+            if not config:
+                return (False, f"Config is required for {self.name}")
+
+            if "mode" in config:
+                mode_value = config.get("mode")
+                allowed_modes = [e.value for e in MCPMode]
+                if mode_value not in allowed_modes:
+                    return (
+                        False,
+                        f'Invalid mode "{mode_value}", allowed modes are {", ".join(allowed_modes)}',
+                    )
+
+            self._mcp_config = MCPConfig(**config)
+
+            clean_url_str = str(self._mcp_config.url).rstrip("/")
+
+            if self._mcp_config.mode == MCPMode.SSE and not clean_url_str.endswith(
+                "/sse"
+            ):
+                self._mcp_config.url = AnyUrl(clean_url_str + "/sse")
+
             tools_result = asyncio.run(self._get_server_tools())
+
             self.tools = [
-                RemoteMCPTool.create(
-                for tool in tools_result.tools
+                RemoteMCPTool.create(tool, self) for tool in tools_result.tools
             ]
 
             if not self.tools:
                 logging.warning(f"mcp server {self.name} loaded 0 tools.")
+
             return (True, "")
         except Exception as e:
-            # using e.args, the asyncio wrapper could stack another exception this helps printing them all.
             return (
                 False,
-                f"Failed to load mcp server {self.name} {self.url} {str(e
+                f"Failed to load mcp server {self.name} {self._mcp_config.url if self._mcp_config else 'unknown'}: {str(e)}",
             )
 
     async def _get_server_tools(self):
-        async with
-
-
-
-
-
-
+        async with self.get_initialized_session() as session:
+            return await session.list_tools()
+
+    def get_initialized_session(self):
+        return get_initialized_mcp_session(
+            str(self._mcp_config.url), self._mcp_config.headers, self._mcp_config.mode
+        )
 
     def get_example_config(self) -> Dict[str, Any]:
-
+        example_config = MCPConfig(
+            url=AnyUrl("http://example.com:8000/mcp/messages"),
+            mode=MCPMode.STREAMABLE_HTTP,
+            headers={"Authorization": "Bearer YOUR_TOKEN"},
+        )
+        return example_config.model_dump()
```
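Putting the pieces together, a hedged usage sketch of the new config and session helper (the endpoint, token, and server reachability are assumptions; `mode` takes the two `MCPMode` values shown above, and the values mirror `get_example_config()`):

```python
import asyncio

from holmes.plugins.toolsets.mcp.toolset_mcp import (
    MCPConfig,
    MCPMode,
    get_initialized_mcp_session,
)

async def list_remote_tools() -> None:
    # Placeholder endpoint and token, mirroring get_example_config().
    config = MCPConfig(
        url="http://example.com:8000/mcp/messages",
        mode=MCPMode.STREAMABLE_HTTP,
        headers={"Authorization": "Bearer YOUR_TOKEN"},
    )
    async with get_initialized_mcp_session(
        str(config.url), config.headers, config.mode
    ) as session:
        tools = await session.list_tools()
        print([t.name for t in tools.tools])

asyncio.run(list_remote_tools())
```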
holmes/plugins/toolsets/newrelic/new_relic_api.py (new file):

```diff
@@ -0,0 +1,125 @@
+"""NewRelic API wrapper for executing NRQL queries via GraphQL."""
+
+import logging
+from typing import Any, Dict
+
+import requests  # type: ignore
+
+
+logger = logging.getLogger(__name__)
+
+
+class NewRelicAPI:
+    """Python wrapper for NewRelic GraphQL API.
+
+    This class provides a clean interface to execute NRQL queries via the NewRelic GraphQL API,
+    supporting both US and EU datacenters.
+    """
+
+    def __init__(self, api_key: str, account_id: str, is_eu_datacenter: bool = False):
+        """Initialize the NewRelic API wrapper.
+
+        Args:
+            api_key: NewRelic API key
+            account_id: NewRelic account ID
+            is_eu_datacenter: If True, use EU datacenter URL. Defaults to False (US).
+        """
+        self.api_key = api_key
+        # Validate account_id is numeric to prevent injection
+        try:
+            self.account_id = int(account_id)
+        except ValueError:
+            raise ValueError(f"Invalid account_id: must be numeric, got '{account_id}'")
+        self.is_eu_datacenter = is_eu_datacenter
+
+    def _get_api_url(self) -> str:
+        """Get the appropriate API URL based on datacenter location.
+
+        Returns:
+            str: The GraphQL API endpoint URL
+        """
+        if self.is_eu_datacenter:
+            return "https://api.eu.newrelic.com/graphql"
+        return "https://api.newrelic.com/graphql"
+
+    def _make_request(
+        self, graphql_query: Dict[str, Any], timeout: int = 30
+    ) -> Dict[str, Any]:
+        """Make HTTP POST request to NewRelic GraphQL API.
+
+        Args:
+            graphql_query: The GraphQL query as a dictionary
+            timeout: Request timeout in seconds
+
+        Returns:
+            JSON response from the API
+
+        Raises:
+            requests.exceptions.HTTPError: If the request fails
+            Exception: If GraphQL returns errors
+        """
+        url = self._get_api_url()
+        headers = {
+            "Content-Type": "application/json",
+            "Api-Key": self.api_key,
+        }
+
+        response = requests.post(
+            url,
+            headers=headers,
+            json=graphql_query,
+            timeout=timeout,
+        )
+        response.raise_for_status()
+
+        # Parse JSON response
+        data = response.json()
+
+        # Check for GraphQL errors even on 200 responses
+        if "errors" in data and data["errors"]:
+            error_msg = data["errors"][0].get("message", "Unknown GraphQL error")
+            raise Exception(f"NewRelic GraphQL error: {error_msg}")
+
+        return data
+
+    def execute_nrql_query(self, nrql_query: str) -> list:
+        """Execute an NRQL query via the NewRelic GraphQL API.
+
+        Args:
+            nrql_query: The NRQL query string to execute
+
+        Returns:
+            list: The query results from NewRelic (extracted from the nested response)
+
+        Raises:
+            requests.exceptions.HTTPError: If the API request fails
+            Exception: If GraphQL returns errors
+        """
+        # Build the GraphQL query using variables to prevent injection
+        # Note: New Relic's GraphQL API requires the account ID to be inline, but we can use variables for the NRQL query
+        graphql_query = {
+            "query": f"""
+            query ExecuteNRQL($nrqlQuery: Nrql!) {{
+                actor {{
+                    account(id: {self.account_id}) {{
+                        nrql(query: $nrqlQuery) {{
+                            results
+                        }}
+                    }}
+                }}
+            }}
+            """,
+            "variables": {"nrqlQuery": nrql_query},
+        }
+
+        logger.info(f"Executing NRQL query: {nrql_query}")
+        response = self._make_request(graphql_query)
+
+        # Extract just the results array from the nested response
+        try:
+            results = response["data"]["actor"]["account"]["nrql"]["results"]
+            return results
+        except (KeyError, TypeError) as e:
+            raise Exception(
+                f"Failed to extract results from NewRelic response: {e}"
+            ) from e
```
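A short usage sketch of the new wrapper; the API key, account ID, and NRQL below are placeholders:

```python
from holmes.plugins.toolsets.newrelic.new_relic_api import NewRelicAPI

# Placeholder credentials; account_id must be numeric or __init__ raises ValueError.
api = NewRelicAPI(api_key="NRAK-XXXX", account_id="1234567", is_eu_datacenter=False)

results = api.execute_nrql_query(
    "SELECT count(*) FROM Transaction SINCE 1 hour ago FACET appName"
)
for row in results:
    print(row)
```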