holmesgpt 0.11.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of holmesgpt might be problematic. Click here for more details.
- holmes/.git_archival.json +7 -0
- holmes/__init__.py +76 -0
- holmes/__init__.py.bak +76 -0
- holmes/clients/robusta_client.py +24 -0
- holmes/common/env_vars.py +47 -0
- holmes/config.py +526 -0
- holmes/core/__init__.py +0 -0
- holmes/core/conversations.py +578 -0
- holmes/core/investigation.py +152 -0
- holmes/core/investigation_structured_output.py +264 -0
- holmes/core/issue.py +54 -0
- holmes/core/llm.py +250 -0
- holmes/core/models.py +157 -0
- holmes/core/openai_formatting.py +51 -0
- holmes/core/performance_timing.py +72 -0
- holmes/core/prompt.py +42 -0
- holmes/core/resource_instruction.py +17 -0
- holmes/core/runbooks.py +26 -0
- holmes/core/safeguards.py +120 -0
- holmes/core/supabase_dal.py +540 -0
- holmes/core/tool_calling_llm.py +798 -0
- holmes/core/tools.py +566 -0
- holmes/core/tools_utils/__init__.py +0 -0
- holmes/core/tools_utils/tool_executor.py +65 -0
- holmes/core/tools_utils/toolset_utils.py +52 -0
- holmes/core/toolset_manager.py +418 -0
- holmes/interactive.py +229 -0
- holmes/main.py +1041 -0
- holmes/plugins/__init__.py +0 -0
- holmes/plugins/destinations/__init__.py +6 -0
- holmes/plugins/destinations/slack/__init__.py +2 -0
- holmes/plugins/destinations/slack/plugin.py +163 -0
- holmes/plugins/interfaces.py +32 -0
- holmes/plugins/prompts/__init__.py +48 -0
- holmes/plugins/prompts/_current_date_time.jinja2 +1 -0
- holmes/plugins/prompts/_default_log_prompt.jinja2 +11 -0
- holmes/plugins/prompts/_fetch_logs.jinja2 +36 -0
- holmes/plugins/prompts/_general_instructions.jinja2 +86 -0
- holmes/plugins/prompts/_global_instructions.jinja2 +12 -0
- holmes/plugins/prompts/_runbook_instructions.jinja2 +13 -0
- holmes/plugins/prompts/_toolsets_instructions.jinja2 +56 -0
- holmes/plugins/prompts/generic_ask.jinja2 +36 -0
- holmes/plugins/prompts/generic_ask_conversation.jinja2 +32 -0
- holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +50 -0
- holmes/plugins/prompts/generic_investigation.jinja2 +42 -0
- holmes/plugins/prompts/generic_post_processing.jinja2 +13 -0
- holmes/plugins/prompts/generic_ticket.jinja2 +12 -0
- holmes/plugins/prompts/investigation_output_format.jinja2 +32 -0
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +84 -0
- holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +39 -0
- holmes/plugins/runbooks/README.md +22 -0
- holmes/plugins/runbooks/__init__.py +100 -0
- holmes/plugins/runbooks/catalog.json +14 -0
- holmes/plugins/runbooks/jira.yaml +12 -0
- holmes/plugins/runbooks/kube-prometheus-stack.yaml +10 -0
- holmes/plugins/runbooks/networking/dns_troubleshooting_instructions.md +66 -0
- holmes/plugins/runbooks/upgrade/upgrade_troubleshooting_instructions.md +44 -0
- holmes/plugins/sources/github/__init__.py +77 -0
- holmes/plugins/sources/jira/__init__.py +123 -0
- holmes/plugins/sources/opsgenie/__init__.py +93 -0
- holmes/plugins/sources/pagerduty/__init__.py +147 -0
- holmes/plugins/sources/prometheus/__init__.py +0 -0
- holmes/plugins/sources/prometheus/models.py +104 -0
- holmes/plugins/sources/prometheus/plugin.py +154 -0
- holmes/plugins/toolsets/__init__.py +171 -0
- holmes/plugins/toolsets/aks-node-health.yaml +65 -0
- holmes/plugins/toolsets/aks.yaml +86 -0
- holmes/plugins/toolsets/argocd.yaml +70 -0
- holmes/plugins/toolsets/atlas_mongodb/instructions.jinja2 +8 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +307 -0
- holmes/plugins/toolsets/aws.yaml +76 -0
- holmes/plugins/toolsets/azure_sql/__init__.py +0 -0
- holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +600 -0
- holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +309 -0
- holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +445 -0
- holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +251 -0
- holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +317 -0
- holmes/plugins/toolsets/azure_sql/azure_base_toolset.py +55 -0
- holmes/plugins/toolsets/azure_sql/azure_sql_instructions.jinja2 +137 -0
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +183 -0
- holmes/plugins/toolsets/azure_sql/install.md +66 -0
- holmes/plugins/toolsets/azure_sql/tools/__init__.py +1 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +324 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +243 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +205 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +249 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +373 -0
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +237 -0
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +172 -0
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +170 -0
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +188 -0
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +180 -0
- holmes/plugins/toolsets/azure_sql/utils.py +83 -0
- holmes/plugins/toolsets/bash/__init__.py +0 -0
- holmes/plugins/toolsets/bash/bash_instructions.jinja2 +14 -0
- holmes/plugins/toolsets/bash/bash_toolset.py +208 -0
- holmes/plugins/toolsets/bash/common/bash.py +52 -0
- holmes/plugins/toolsets/bash/common/config.py +14 -0
- holmes/plugins/toolsets/bash/common/stringify.py +25 -0
- holmes/plugins/toolsets/bash/common/validators.py +24 -0
- holmes/plugins/toolsets/bash/grep/__init__.py +52 -0
- holmes/plugins/toolsets/bash/kubectl/__init__.py +100 -0
- holmes/plugins/toolsets/bash/kubectl/constants.py +96 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_describe.py +66 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_events.py +88 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +108 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_logs.py +20 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_run.py +46 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_top.py +81 -0
- holmes/plugins/toolsets/bash/parse_command.py +103 -0
- holmes/plugins/toolsets/confluence.yaml +19 -0
- holmes/plugins/toolsets/consts.py +5 -0
- holmes/plugins/toolsets/coralogix/api.py +158 -0
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +103 -0
- holmes/plugins/toolsets/coralogix/utils.py +181 -0
- holmes/plugins/toolsets/datadog.py +153 -0
- holmes/plugins/toolsets/docker.yaml +46 -0
- holmes/plugins/toolsets/git.py +756 -0
- holmes/plugins/toolsets/grafana/__init__.py +0 -0
- holmes/plugins/toolsets/grafana/base_grafana_toolset.py +54 -0
- holmes/plugins/toolsets/grafana/common.py +68 -0
- holmes/plugins/toolsets/grafana/grafana_api.py +31 -0
- holmes/plugins/toolsets/grafana/loki_api.py +89 -0
- holmes/plugins/toolsets/grafana/tempo_api.py +124 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +102 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +102 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +10 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +299 -0
- holmes/plugins/toolsets/grafana/trace_parser.py +195 -0
- holmes/plugins/toolsets/helm.yaml +42 -0
- holmes/plugins/toolsets/internet/internet.py +275 -0
- holmes/plugins/toolsets/internet/notion.py +137 -0
- holmes/plugins/toolsets/kafka.py +638 -0
- holmes/plugins/toolsets/kubernetes.yaml +255 -0
- holmes/plugins/toolsets/kubernetes_logs.py +426 -0
- holmes/plugins/toolsets/kubernetes_logs.yaml +42 -0
- holmes/plugins/toolsets/logging_utils/__init__.py +0 -0
- holmes/plugins/toolsets/logging_utils/logging_api.py +217 -0
- holmes/plugins/toolsets/logging_utils/types.py +0 -0
- holmes/plugins/toolsets/mcp/toolset_mcp.py +135 -0
- holmes/plugins/toolsets/newrelic.py +222 -0
- holmes/plugins/toolsets/opensearch/__init__.py +0 -0
- holmes/plugins/toolsets/opensearch/opensearch.py +245 -0
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +151 -0
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +211 -0
- holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +12 -0
- holmes/plugins/toolsets/opensearch/opensearch_utils.py +166 -0
- holmes/plugins/toolsets/prometheus/prometheus.py +818 -0
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +38 -0
- holmes/plugins/toolsets/rabbitmq/api.py +398 -0
- holmes/plugins/toolsets/rabbitmq/rabbitmq_instructions.jinja2 +37 -0
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +222 -0
- holmes/plugins/toolsets/robusta/__init__.py +0 -0
- holmes/plugins/toolsets/robusta/robusta.py +235 -0
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +24 -0
- holmes/plugins/toolsets/runbook/__init__.py +0 -0
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +78 -0
- holmes/plugins/toolsets/service_discovery.py +92 -0
- holmes/plugins/toolsets/servicenow/install.md +37 -0
- holmes/plugins/toolsets/servicenow/instructions.jinja2 +3 -0
- holmes/plugins/toolsets/servicenow/servicenow.py +198 -0
- holmes/plugins/toolsets/slab.yaml +20 -0
- holmes/plugins/toolsets/utils.py +137 -0
- holmes/plugins/utils.py +14 -0
- holmes/utils/__init__.py +0 -0
- holmes/utils/cache.py +84 -0
- holmes/utils/cert_utils.py +40 -0
- holmes/utils/default_toolset_installation_guide.jinja2 +44 -0
- holmes/utils/definitions.py +13 -0
- holmes/utils/env.py +53 -0
- holmes/utils/file_utils.py +56 -0
- holmes/utils/global_instructions.py +20 -0
- holmes/utils/holmes_status.py +22 -0
- holmes/utils/holmes_sync_toolsets.py +80 -0
- holmes/utils/markdown_utils.py +55 -0
- holmes/utils/pydantic_utils.py +54 -0
- holmes/utils/robusta.py +10 -0
- holmes/utils/tags.py +97 -0
- holmesgpt-0.11.5.dist-info/LICENSE.txt +21 -0
- holmesgpt-0.11.5.dist-info/METADATA +400 -0
- holmesgpt-0.11.5.dist-info/RECORD +183 -0
- holmesgpt-0.11.5.dist-info/WHEEL +4 -0
- holmesgpt-0.11.5.dist-info/entry_points.txt +3 -0
|
File without changes
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Any, ClassVar, Tuple, Type
|
|
3
|
+
|
|
4
|
+
from holmes.core.tools import CallablePrerequisite, Tool, Toolset, ToolsetTag
|
|
5
|
+
from holmes.plugins.toolsets.consts import TOOLSET_CONFIG_MISSING_ERROR
|
|
6
|
+
from holmes.plugins.toolsets.grafana.common import GrafanaConfig
|
|
7
|
+
|
|
8
|
+
from holmes.plugins.toolsets.grafana.grafana_api import grafana_health_check
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BaseGrafanaToolset(Toolset):
    """Common base class for Grafana-backed toolsets (e.g. Loki, Tempo).

    Subclasses may override ``config_class`` with a specialized
    ``GrafanaConfig`` subclass; it is used both to validate the
    user-supplied configuration and to produce the example configuration.
    """

    # Pydantic model used to parse/validate the toolset configuration.
    config_class: ClassVar[Type[GrafanaConfig]] = GrafanaConfig

    def __init__(
        self,
        name: str,
        description: str,
        icon_url: str,
        tools: list[Tool],
        docs_url: str,
    ):
        super().__init__(
            name=name,
            description=description,
            icon_url=icon_url,
            docs_url=docs_url,
            # The toolset only becomes usable once its configuration
            # passes the health check performed below.
            prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
            tools=tools,
            tags=[
                ToolsetTag.CORE,
            ],
            enabled=False,
        )

    def prerequisites_callable(self, config: dict[str, Any]) -> Tuple[bool, str]:
        """Validate the config and check that Grafana is reachable.

        Returns a ``(success, message)`` tuple; ``message`` is empty on
        success and holds the error text otherwise.
        """
        if not config:
            logging.debug(f"Grafana config not provided {self.name}")
            return False, TOOLSET_CONFIG_MISSING_ERROR

        try:
            self._grafana_config = self.config_class(**config)
            return grafana_health_check(self._grafana_config)

        except Exception as e:
            logging.exception(f"Failed to set up grafana toolset {self.name}")
            return False, str(e)

    def get_example_config(self):
        """Return an example configuration dict for documentation/UI.

        Uses the (possibly subclass-overridden) ``config_class`` so the
        example matches the model actually used for validation.
        Previously this hard-coded ``GrafanaConfig``, which produced an
        incomplete example for subclasses that declare extra fields.
        """
        example_config = self.config_class(
            api_key="YOUR API KEY",
            url="YOUR GRAFANA URL",
            grafana_datasource_uid="UID OF DATASOURCE IN GRAFANA",
        )
        return example_config.model_dump()
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Dict, Optional
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
import datetime
|
|
5
|
+
|
|
6
|
+
from holmes.core.tools import StructuredToolResult, ToolResultStatus
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class GrafanaConfig(BaseModel):
    """A config that represents one of the Grafana related tools like Loki or Tempo.

    If `grafana_datasource_uid` is set, it is assumed that Holmes will proxy all
    requests through Grafana. In this case `url` should be the Grafana URL.
    If `grafana_datasource_uid` is not set, it is assumed that `url` is the
    backing system's own URL (e.g. a direct Loki or Tempo endpoint).
    """

    # Optional bearer token, sent as an Authorization header when present.
    api_key: Optional[str] = None
    # Extra HTTP headers merged into every request (may override defaults).
    headers: Optional[Dict[str, str]] = None
    # Grafana URL when proxying, otherwise the backend's own URL.
    url: str
    # UID of the datasource in Grafana; enables datasource-proxying when set.
    grafana_datasource_uid: Optional[str] = None
    # Public-facing URL preferred when building links shown to users.
    external_url: Optional[str] = None
    # Path (relative to the base URL) used by the health check.
    healthcheck: Optional[str] = "ready"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def build_headers(api_key: Optional[str], additional_headers: Optional[Dict[str, str]]):
    """Assemble the default JSON request headers.

    Adds a bearer ``Authorization`` header when *api_key* is given, and
    merges *additional_headers* last so callers can override defaults.
    """
    result: Dict[str, str] = {
        "Accept": "application/json",
        "Content-Type": "application/json",
    }
    if api_key:
        result["Authorization"] = f"Bearer {api_key}"
    result.update(additional_headers or {})
    return result
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def format_log(log: Dict) -> str:
    """Render a parsed Loki log entry as a single line.

    When the entry carries a ``timestamp`` (nanoseconds since the epoch),
    the log line is prefixed with an ISO-8601 UTC timestamp. Entries
    without a timestamp are serialized whole as JSON so no data is lost.
    """
    log_str = log.get("log", "")
    timestamp_nanoseconds = log.get("timestamp")
    if timestamp_nanoseconds:
        timestamp_seconds = int(timestamp_nanoseconds) // 1_000_000_000
        # Use an explicit UTC timezone: the "Z" suffix promises UTC, but
        # fromtimestamp() without tz would interpret in the local timezone,
        # producing a timestamp that contradicts its own suffix.
        dt = datetime.datetime.fromtimestamp(
            timestamp_seconds, tz=datetime.timezone.utc
        )
        log_str = dt.strftime("%Y-%m-%dT%H:%M:%SZ") + " " + log_str
    else:
        log_str = json.dumps(log)

    return log_str
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_base_url(config: GrafanaConfig) -> str:
    """Return the URL that API requests should target.

    When a datasource UID is configured, requests go through Grafana's
    datasource proxy endpoint; otherwise the configured URL is used
    directly.
    """
    uid = config.grafana_datasource_uid
    if not uid:
        return config.url
    return f"{config.url}/api/datasources/proxy/uid/{uid}"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def ensure_grafana_uid_or_return_error_result(
    config: GrafanaConfig,
) -> Optional[StructuredToolResult]:
    """Return an error result if no Grafana datasource UID is configured.

    Some tools can only operate through Grafana's datasource proxy and
    therefore require `grafana_datasource_uid` to be set. Returns None
    when the configuration is valid for such tools.
    """
    if not config.grafana_datasource_uid:
        return StructuredToolResult(
            status=ToolResultStatus.ERROR,
            # Complete, actionable message. The original string was
            # truncated and misspelled ("...the toolset is configued ").
            error=(
                "This tool only works when the toolset is configured "
                "with a `grafana_datasource_uid`."
            ),
        )
    else:
        return None
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import requests # type: ignore
|
|
3
|
+
from typing import Tuple
|
|
4
|
+
import backoff
|
|
5
|
+
|
|
6
|
+
from holmes.plugins.toolsets.grafana.common import (
|
|
7
|
+
GrafanaConfig,
|
|
8
|
+
build_headers,
|
|
9
|
+
get_base_url,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@backoff.on_exception(
    backoff.expo,  # Exponential backoff
    requests.exceptions.RequestException,  # Retry on request exceptions
    max_tries=5,  # Maximum retries
    giveup=lambda e: isinstance(e, requests.exceptions.HTTPError)
    and e.response.status_code < 500,
)
def _do_health_request(url: str, api_key) -> None:
    """Perform one health-check GET, raising on failure.

    Kept separate from grafana_health_check so that request exceptions
    actually propagate to the backoff decorator and trigger retries.
    """
    headers_ = build_headers(api_key=api_key, additional_headers=None)
    response = requests.get(url, headers=headers_, timeout=10)
    response.raise_for_status()


def grafana_health_check(config: GrafanaConfig) -> Tuple[bool, str]:
    """Check that the configured Grafana/datasource endpoint is healthy.

    Returns (True, "") on success, otherwise (False, <error message>).
    Transient failures are retried with exponential backoff; 4xx HTTP
    errors are not retried.

    NOTE: previously the backoff decorator sat on this function while the
    request lived inside its try/except — every exception was swallowed
    before backoff could observe it, so no retry ever happened. The
    request now lives in a separate decorated helper.
    """
    base_url = get_base_url(config)
    url = f"{base_url}/{config.healthcheck}"
    try:
        _do_health_request(url, config.api_key)
        return True, ""
    except Exception as e:
        logging.error(f"Failed to fetch grafana health status at {url}", exc_info=True)
        return False, f"Failed to fetch grafana health status at {url}. {str(e)}"
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from typing import Dict, List, Optional, Union
|
|
2
|
+
|
|
3
|
+
import backoff
|
|
4
|
+
import requests # type: ignore
|
|
5
|
+
|
|
6
|
+
from holmes.plugins.toolsets.grafana.common import build_headers
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def parse_loki_response(results: List[Dict]) -> List[Dict]:
    """
    Flatten a raw Loki query result into individual log entries.

    Args:
        results: Raw results from a Loki range query.

    Returns:
        One dict per log line with keys ``timestamp``, ``log`` and
        ``labels`` (the stream's label set, shared by all its lines).
    """
    return [
        {"timestamp": ts, "log": line, "labels": entry.get("stream", {})}
        for entry in results
        for ts, line in entry.get("values", [])
    ]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@backoff.on_exception(
    backoff.expo,  # Exponential backoff
    requests.exceptions.RequestException,  # Retry on request exceptions
    max_tries=5,  # Maximum retries
    giveup=lambda e: isinstance(e, requests.exceptions.HTTPError)
    and e.response.status_code < 500,
)
def _fetch_loki_query_range(url: str, headers: Dict[str, str], params: Dict) -> Dict:
    """Perform one query_range GET, raising RequestException on failure.

    Kept separate from execute_loki_query so request exceptions reach the
    backoff decorator and retries actually happen.
    """
    response = requests.get(url, headers=headers, params=params)  # type: ignore
    response.raise_for_status()
    return response.json()


def execute_loki_query(
    base_url: str,
    api_key: Optional[str],
    headers: Optional[Dict[str, str]],
    query: str,
    start: Union[int, str],
    end: Union[int, str],
    limit: int,
) -> List[Dict]:
    """Execute a LogQL range query against Loki and return parsed entries.

    Transient/server errors are retried with exponential backoff; 4xx
    responses are not retried.

    Raises:
        Exception: when the request still fails after all retries.

    NOTE: previously the backoff decorator wrapped this whole function,
    but the except clause converted RequestException into a plain
    Exception before re-raising, so the decorator never matched and no
    retry ever happened. The request now lives in a decorated helper.
    """
    params = {"query": query, "limit": limit, "start": start, "end": end}
    url = f"{base_url}/loki/api/v1/query_range"
    try:
        result = _fetch_loki_query_range(
            url,
            build_headers(api_key=api_key, additional_headers=headers),
            params,
        )
        if "data" in result and "result" in result["data"]:
            return parse_loki_response(result["data"]["result"])
        return []

    except requests.exceptions.RequestException as e:
        raise Exception(f"Failed to query Loki logs: {str(e)}")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def query_loki_logs_by_label(
    base_url: str,
    api_key: Optional[str],
    headers: Optional[Dict[str, str]],
    namespace: str,
    label_value: str,
    filter: Optional[str],
    start: Union[int, str],
    end: Union[int, str],
    label: str,
    namespace_search_key: str = "namespace",
    limit: int = 200,
) -> List[Dict]:
    """Fetch logs matching a namespace and one extra label, optionally
    line-filtered with a `|=` substring match."""
    stream_selector = f'{{{namespace_search_key}="{namespace}", {label}="{label_value}"}}'
    logql_parts = [stream_selector]
    if filter:
        logql_parts.append(f'|= "{filter}"')
    return execute_loki_query(
        base_url=base_url,
        api_key=api_key,
        headers=headers,
        query=" ".join(logql_parts),
        start=start,
        end=end,
        limit=limit,
    )
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import requests # type: ignore
|
|
2
|
+
from typing import Dict, List, Optional
|
|
3
|
+
import backoff
|
|
4
|
+
|
|
5
|
+
from holmes.plugins.toolsets.grafana.common import build_headers
|
|
6
|
+
from holmes.plugins.toolsets.grafana.trace_parser import process_trace
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def execute_tempo_query_with_retry(
    base_url: str,
    api_key: Optional[str],
    headers: Optional[Dict[str, str]],
    query_params: dict,
    retries: int = 3,
    timeout: int = 5,
):
    """
    Execute a Tempo search API query with retries and timeout.

    Args:
        base_url: Base URL of the Tempo API (or Grafana datasource proxy).
        api_key: Optional bearer token for authentication.
        headers: Optional extra HTTP headers.
        query_params: Query parameters for the search API, sent as JSON.
        retries: Number of retries for the request.
        timeout: Timeout for each request in seconds.

    Returns:
        The decoded JSON search response.

    Raises:
        Exception: if the request still fails after all retries.
    """
    url = f"{base_url}/api/search"

    @backoff.on_exception(
        backoff.expo,  # Exponential backoff
        requests.exceptions.RequestException,  # Retry on request exceptions
        max_tries=retries,  # Maximum retries
        # Do not retry client errors (4xx); only 5xx/transport errors.
        giveup=lambda e: isinstance(e, requests.exceptions.HTTPError)
        and e.response.status_code < 500,
    )
    def make_request():
        response = requests.post(
            url,
            headers=build_headers(api_key=api_key, additional_headers=headers),
            json=query_params,
            timeout=timeout,  # Set timeout for the request
        )
        response.raise_for_status()  # Raise an error for non-2xx responses
        return response.json()

    try:
        return make_request()
    except requests.exceptions.RequestException as e:
        raise Exception(f"Request to Tempo API failed after retries: {e}")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def query_tempo_traces(
    base_url: str,
    api_key: Optional[str],
    headers: Optional[Dict[str, str]],
    query: Optional[str],
    start: int,
    end: int,
    limit: int,
) -> Dict:
    """Search Tempo for traces in a time window.

    The optional *query* (TraceQL) narrows the search; start/end/limit
    are stringified as the API expects.
    """
    search_params = {
        name: str(value)
        for name, value in (("start", start), ("end", end), ("limit", limit))
    }
    if query:
        search_params["q"] = query
    return execute_tempo_query_with_retry(
        base_url=base_url,
        api_key=api_key,
        headers=headers,
        query_params=search_params,
    )
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def query_tempo_trace_by_id(
    base_url: str,
    api_key: Optional[str],
    headers: Optional[Dict[str, str]],
    trace_id: str,
    key_labels: List[str],
    retries: int = 3,
    timeout: int = 5,
) -> str:
    """
    Query Tempo for a specific trace by its ID with retries and backoff.

    Args:
        base_url: Base URL of the Tempo API (or Grafana datasource proxy).
        api_key: Optional bearer token for authentication.
        headers: Optional extra HTTP headers.
        trace_id: The trace ID to retrieve.
        key_labels: Attribute names passed through to the trace formatter.
        retries: Number of retries for the request.
        timeout: Timeout for each request in seconds.

    Returns:
        A formatted trace details string

    Raises:
        Exception: if the trace cannot be retrieved after all retries.
    """
    url = f"{base_url}/api/traces/{trace_id}"

    @backoff.on_exception(
        backoff.expo,
        requests.exceptions.RequestException,
        max_tries=retries,
        # Client errors (4xx) are not retried; only 5xx/transport errors.
        giveup=lambda e: isinstance(e, requests.exceptions.HTTPError)
        and e.response.status_code < 500,
    )
    def make_request():
        response = requests.get(
            url,
            headers=build_headers(api_key=api_key, additional_headers=headers),
            timeout=timeout,
        )
        response.raise_for_status()
        # process_trace renders the raw trace JSON into a readable string.
        return process_trace(response.json(), key_labels)

    try:
        return make_request()
    except requests.exceptions.RequestException as e:
        raise Exception(
            f"Failed to retrieve trace by ID after retries: {e} \n for URL: {url}"
        )
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
from typing import Dict, List
|
|
2
|
+
from urllib.parse import urlencode, urljoin
|
|
3
|
+
from holmes.core.tools import Tool, ToolParameter
|
|
4
|
+
from holmes.plugins.toolsets.grafana.base_grafana_toolset import BaseGrafanaToolset
|
|
5
|
+
import requests # type: ignore
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ListAndBuildGrafanaDashboardURLs(Tool):
    """Tool that lists Grafana dashboards and builds parameterized URLs.

    The generated URLs embed cluster/namespace/pod/node template
    variables so users receive links that are already filtered.
    """

    def __init__(self, toolset: BaseGrafanaToolset):
        super().__init__(
            name="list_and_build_grafana_dashboard_urls",
            description="Lists all available Grafana dashboard urls",
            parameters={
                "cluster_name": ToolParameter(
                    description="The cluster name. Defaults to None.",
                    type="string",
                    required=False,
                ),
                "namespace": ToolParameter(
                    description="The namespace for filtering dashboards.",
                    type="string",
                    required=False,
                ),
                "node_name": ToolParameter(
                    description="The node name to filter for node-related dashboards.",
                    type="string",
                    required=False,
                ),
                "pod_name": ToolParameter(
                    description="The pod name to filter dashboards.",
                    type="string",
                    required=False,
                ),
            },
        )
        self._toolset = toolset

    def _invoke(self, params: Dict) -> str:  # type: ignore
        """List dashboards via the Grafana search API.

        Returns one "Title/URL" entry per dashboard, or an error string
        when the request fails.
        """
        url = urljoin(
            self._toolset._grafana_config.url, "/api/search?query=&type=dash-db"
        )
        headers = {"Authorization": f"Bearer {self._toolset._grafana_config.api_key}"}

        try:
            # Timeout added so a wedged Grafana cannot hang the tool
            # indefinitely (the original call had no timeout; other
            # requests in this plugin use timeout=10).
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            dashboards = response.json()
            formatted_dashboards: List[str] = []
            # Prefer the user-facing URL for links when one is configured.
            base_url = (
                self._toolset._grafana_config.external_url
                or self._toolset._grafana_config.url
            )
            for dash in dashboards:
                dashboard_url = urljoin(
                    base_url,
                    f"/d/{dash['uid']}/{dash['uri'].split('/')[-1]}",
                )

                params_dict = {
                    "var-cluster": params.get("cluster_name", ""),
                    "var-namespace": params.get("namespace", ""),
                    "var-pod": params.get("pod_name", ""),
                    "var-node": params.get("node_name", ""),
                    "var-datasource": self._toolset._grafana_config.grafana_datasource_uid,
                    "refresh": "5s",
                }

                # If filtering for nodes, ensure only node-related dashboards are included
                if params.get("node_name") and "node" not in dash["title"].lower():
                    continue

                # we add all params since if the dashboard isnt configured for a param it will ignore it if it is added
                query_string = urlencode({k: v for k, v in params_dict.items() if v})
                dashboard_url = (
                    f"{dashboard_url}?{query_string}" if query_string else dashboard_url
                )

                formatted_dashboards.append(
                    f"Title: {dash['title']}\nURL: {dashboard_url}\n"
                )

            return "\n".join(formatted_dashboards) or "No dashboards found."
        except requests.RequestException as e:
            logging.error(f"Error fetching dashboards: {str(e)}")
            return f"Error fetching dashboards: {str(e)}"

    def get_parameterized_one_liner(self, params: Dict) -> str:
        """One-line human-readable description of this invocation."""
        return f"Lists Grafana dashboards and builds URLs with parameters: {params}"
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class GrafanaToolset(BaseGrafanaToolset):
    """Toolset exposing generic Grafana tools (dashboard discovery/links)."""

    def __init__(self):
        super().__init__(
            name="grafana/grafana",
            description="Provides tools for interacting with Grafana dashboards",
            icon_url="https://w7.pngwing.com/pngs/434/923/png-transparent-grafana-hd-logo-thumbnail.png",
            docs_url="",
            tools=[
                ListAndBuildGrafanaDashboardURLs(self),
            ],
        )
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
from typing import Any, cast
|
|
2
|
+
from pydantic import BaseModel
|
|
3
|
+
|
|
4
|
+
from holmes.core.tools import CallablePrerequisite
|
|
5
|
+
from holmes.plugins.toolsets.grafana.common import (
|
|
6
|
+
GrafanaConfig,
|
|
7
|
+
format_log,
|
|
8
|
+
get_base_url,
|
|
9
|
+
)
|
|
10
|
+
from holmes.plugins.toolsets.grafana.grafana_api import grafana_health_check
|
|
11
|
+
from holmes.plugins.toolsets.logging_utils.logging_api import (
|
|
12
|
+
BasePodLoggingToolset,
|
|
13
|
+
FetchPodLogsParams,
|
|
14
|
+
PodLoggingTool,
|
|
15
|
+
)
|
|
16
|
+
from holmes.plugins.toolsets.utils import (
|
|
17
|
+
process_timestamps_to_rfc3339,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
from holmes.plugins.toolsets.grafana.loki_api import (
|
|
21
|
+
query_loki_logs_by_label,
|
|
22
|
+
)
|
|
23
|
+
from holmes.core.tools import StructuredToolResult, ToolResultStatus
|
|
24
|
+
|
|
25
|
+
DEFAULT_TIME_SPAN_SECONDS = 3600
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class GrafanaLokiLabelsConfig(BaseModel):
    """Loki label names used to select pod logs.

    Override these if your Loki installation uses non-default label names.
    """

    # Label holding the pod name.
    pod: str = "pod"
    # Label holding the Kubernetes namespace.
    namespace: str = "namespace"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class GrafanaLokiConfig(GrafanaConfig):
    """Grafana Loki configuration.

    Extends GrafanaConfig with the label names used to locate pod logs.
    """

    labels: GrafanaLokiLabelsConfig = GrafanaLokiLabelsConfig()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class GrafanaLokiToolset(BasePodLoggingToolset):
    """Pod-logging toolset backed by Grafana Loki.

    Fetches Kubernetes pod logs via Loki's query_range API, either
    directly or proxied through Grafana when a datasource UID is set.
    """

    def __init__(self):
        super().__init__(
            name="grafana/loki",
            description="Fetches kubernetes pods logs from Loki",
            icon_url="https://grafana.com/media/docs/loki/logo-grafana-loki.png",
            docs_url="https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/grafanaloki.html",
            prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
            tools=[
                PodLoggingTool(self),
            ],
        )

    def prerequisites_callable(self, config: dict[str, Any]) -> tuple[bool, str]:
        """Parse the toolset config and verify Loki/Grafana is reachable.

        Returns a ``(success, message)`` tuple.
        """
        if not config:
            return False, "Missing Grafana Loki configuration. Check your config."

        # NOTE(review): a config-parsing error (pydantic ValidationError)
        # propagates as an exception rather than a (False, msg) tuple —
        # confirm callers of prerequisites handle exceptions.
        self.config = GrafanaLokiConfig(**config)

        return grafana_health_check(self.config)

    def get_example_config(self):
        """Return an example configuration dict for documentation/UI."""
        example_config = GrafanaLokiConfig(
            api_key="YOUR API KEY",
            url="YOUR GRAFANA URL",
            grafana_datasource_uid="<UID of the loki datasource to use>",
        )
        return example_config.model_dump()

    @property
    def grafana_config(self) -> GrafanaLokiConfig:
        # Narrow the stored config to the Loki-specific model for typing.
        return cast(GrafanaLokiConfig, self.config)

    def fetch_pod_logs(self, params: FetchPodLogsParams) -> StructuredToolResult:
        """Fetch logs for one pod over the requested time range.

        Defaults to the last DEFAULT_TIME_SPAN_SECONDS when no explicit
        range is given and to a 2000-line limit. Returns NO_DATA when
        Loki yields no entries.
        """
        (start, end) = process_timestamps_to_rfc3339(
            start_timestamp=params.start_time,
            end_timestamp=params.end_time,
            default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS,
        )

        base_url = get_base_url(self.grafana_config)
        logs = query_loki_logs_by_label(
            base_url=base_url,
            api_key=self.grafana_config.api_key,
            headers=self.grafana_config.headers,
            filter=params.filter,
            namespace=params.namespace,
            namespace_search_key=self.grafana_config.labels.namespace,
            label=self.grafana_config.labels.pod,
            label_value=params.pod_name,
            start=start,
            end=end,
            limit=params.limit or 2000,
        )
        if logs:
            # Sort chronologically by timestamp across all streams.
            logs.sort(key=lambda x: x["timestamp"])
            return StructuredToolResult(
                status=ToolResultStatus.SUCCESS,
                data="\n".join([format_log(log) for log in logs]),
                params=params.model_dump(),
            )
        else:
            return StructuredToolResult(
                status=ToolResultStatus.NO_DATA,
                params=params.model_dump(),
            )
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Use Tempo when investigating latency or performance issues. Tempo provides traces information for application running on the cluster.
|
|
2
|
+
Assume every application provides tempo traces.
|
|
3
|
+
1. Start by identifying an initial filter to use. This can be a pod name, a deployment name or a service name
|
|
4
|
+
2. Use `fetch_tempo_traces` setting the appropriate query params
|
|
5
|
+
- Use the min_duration filter to ensure you get traces that trigger the alert when you are investigating a performance issue
|
|
6
|
+
- If possible, use start and end date to narrow down your search.
|
|
7
|
+
- Use fetch_finding_by_id if you are provided with a finding/alert id. It will contain details about when the alert was triggered
|
|
8
|
+
- Use at least one of the following arguments to ensure you get relevant traces: `service_name`, `pod_name` or `deployment_name`.
|
|
9
|
+
3. Look at the duration of each span in any single trace and deduce any issues.
|
|
10
|
+
4. ALWAYS fetch the logs for a pod once you identify a span that is taking a long time. There may be an explanation for the slowness in the logs.
|