holmesgpt 0.11.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of holmesgpt might be problematic. Click here for more details.

Files changed (183) hide show
  1. holmes/.git_archival.json +7 -0
  2. holmes/__init__.py +76 -0
  3. holmes/__init__.py.bak +76 -0
  4. holmes/clients/robusta_client.py +24 -0
  5. holmes/common/env_vars.py +47 -0
  6. holmes/config.py +526 -0
  7. holmes/core/__init__.py +0 -0
  8. holmes/core/conversations.py +578 -0
  9. holmes/core/investigation.py +152 -0
  10. holmes/core/investigation_structured_output.py +264 -0
  11. holmes/core/issue.py +54 -0
  12. holmes/core/llm.py +250 -0
  13. holmes/core/models.py +157 -0
  14. holmes/core/openai_formatting.py +51 -0
  15. holmes/core/performance_timing.py +72 -0
  16. holmes/core/prompt.py +42 -0
  17. holmes/core/resource_instruction.py +17 -0
  18. holmes/core/runbooks.py +26 -0
  19. holmes/core/safeguards.py +120 -0
  20. holmes/core/supabase_dal.py +540 -0
  21. holmes/core/tool_calling_llm.py +798 -0
  22. holmes/core/tools.py +566 -0
  23. holmes/core/tools_utils/__init__.py +0 -0
  24. holmes/core/tools_utils/tool_executor.py +65 -0
  25. holmes/core/tools_utils/toolset_utils.py +52 -0
  26. holmes/core/toolset_manager.py +418 -0
  27. holmes/interactive.py +229 -0
  28. holmes/main.py +1041 -0
  29. holmes/plugins/__init__.py +0 -0
  30. holmes/plugins/destinations/__init__.py +6 -0
  31. holmes/plugins/destinations/slack/__init__.py +2 -0
  32. holmes/plugins/destinations/slack/plugin.py +163 -0
  33. holmes/plugins/interfaces.py +32 -0
  34. holmes/plugins/prompts/__init__.py +48 -0
  35. holmes/plugins/prompts/_current_date_time.jinja2 +1 -0
  36. holmes/plugins/prompts/_default_log_prompt.jinja2 +11 -0
  37. holmes/plugins/prompts/_fetch_logs.jinja2 +36 -0
  38. holmes/plugins/prompts/_general_instructions.jinja2 +86 -0
  39. holmes/plugins/prompts/_global_instructions.jinja2 +12 -0
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +13 -0
  41. holmes/plugins/prompts/_toolsets_instructions.jinja2 +56 -0
  42. holmes/plugins/prompts/generic_ask.jinja2 +36 -0
  43. holmes/plugins/prompts/generic_ask_conversation.jinja2 +32 -0
  44. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +50 -0
  45. holmes/plugins/prompts/generic_investigation.jinja2 +42 -0
  46. holmes/plugins/prompts/generic_post_processing.jinja2 +13 -0
  47. holmes/plugins/prompts/generic_ticket.jinja2 +12 -0
  48. holmes/plugins/prompts/investigation_output_format.jinja2 +32 -0
  49. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +84 -0
  50. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +39 -0
  51. holmes/plugins/runbooks/README.md +22 -0
  52. holmes/plugins/runbooks/__init__.py +100 -0
  53. holmes/plugins/runbooks/catalog.json +14 -0
  54. holmes/plugins/runbooks/jira.yaml +12 -0
  55. holmes/plugins/runbooks/kube-prometheus-stack.yaml +10 -0
  56. holmes/plugins/runbooks/networking/dns_troubleshooting_instructions.md +66 -0
  57. holmes/plugins/runbooks/upgrade/upgrade_troubleshooting_instructions.md +44 -0
  58. holmes/plugins/sources/github/__init__.py +77 -0
  59. holmes/plugins/sources/jira/__init__.py +123 -0
  60. holmes/plugins/sources/opsgenie/__init__.py +93 -0
  61. holmes/plugins/sources/pagerduty/__init__.py +147 -0
  62. holmes/plugins/sources/prometheus/__init__.py +0 -0
  63. holmes/plugins/sources/prometheus/models.py +104 -0
  64. holmes/plugins/sources/prometheus/plugin.py +154 -0
  65. holmes/plugins/toolsets/__init__.py +171 -0
  66. holmes/plugins/toolsets/aks-node-health.yaml +65 -0
  67. holmes/plugins/toolsets/aks.yaml +86 -0
  68. holmes/plugins/toolsets/argocd.yaml +70 -0
  69. holmes/plugins/toolsets/atlas_mongodb/instructions.jinja2 +8 -0
  70. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +307 -0
  71. holmes/plugins/toolsets/aws.yaml +76 -0
  72. holmes/plugins/toolsets/azure_sql/__init__.py +0 -0
  73. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +600 -0
  74. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +309 -0
  75. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +445 -0
  76. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +251 -0
  77. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +317 -0
  78. holmes/plugins/toolsets/azure_sql/azure_base_toolset.py +55 -0
  79. holmes/plugins/toolsets/azure_sql/azure_sql_instructions.jinja2 +137 -0
  80. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +183 -0
  81. holmes/plugins/toolsets/azure_sql/install.md +66 -0
  82. holmes/plugins/toolsets/azure_sql/tools/__init__.py +1 -0
  83. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +324 -0
  84. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +243 -0
  85. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +205 -0
  86. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +249 -0
  87. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +373 -0
  88. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +237 -0
  89. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +172 -0
  90. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +170 -0
  91. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +188 -0
  92. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +180 -0
  93. holmes/plugins/toolsets/azure_sql/utils.py +83 -0
  94. holmes/plugins/toolsets/bash/__init__.py +0 -0
  95. holmes/plugins/toolsets/bash/bash_instructions.jinja2 +14 -0
  96. holmes/plugins/toolsets/bash/bash_toolset.py +208 -0
  97. holmes/plugins/toolsets/bash/common/bash.py +52 -0
  98. holmes/plugins/toolsets/bash/common/config.py +14 -0
  99. holmes/plugins/toolsets/bash/common/stringify.py +25 -0
  100. holmes/plugins/toolsets/bash/common/validators.py +24 -0
  101. holmes/plugins/toolsets/bash/grep/__init__.py +52 -0
  102. holmes/plugins/toolsets/bash/kubectl/__init__.py +100 -0
  103. holmes/plugins/toolsets/bash/kubectl/constants.py +96 -0
  104. holmes/plugins/toolsets/bash/kubectl/kubectl_describe.py +66 -0
  105. holmes/plugins/toolsets/bash/kubectl/kubectl_events.py +88 -0
  106. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +108 -0
  107. holmes/plugins/toolsets/bash/kubectl/kubectl_logs.py +20 -0
  108. holmes/plugins/toolsets/bash/kubectl/kubectl_run.py +46 -0
  109. holmes/plugins/toolsets/bash/kubectl/kubectl_top.py +81 -0
  110. holmes/plugins/toolsets/bash/parse_command.py +103 -0
  111. holmes/plugins/toolsets/confluence.yaml +19 -0
  112. holmes/plugins/toolsets/consts.py +5 -0
  113. holmes/plugins/toolsets/coralogix/api.py +158 -0
  114. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +103 -0
  115. holmes/plugins/toolsets/coralogix/utils.py +181 -0
  116. holmes/plugins/toolsets/datadog.py +153 -0
  117. holmes/plugins/toolsets/docker.yaml +46 -0
  118. holmes/plugins/toolsets/git.py +756 -0
  119. holmes/plugins/toolsets/grafana/__init__.py +0 -0
  120. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +54 -0
  121. holmes/plugins/toolsets/grafana/common.py +68 -0
  122. holmes/plugins/toolsets/grafana/grafana_api.py +31 -0
  123. holmes/plugins/toolsets/grafana/loki_api.py +89 -0
  124. holmes/plugins/toolsets/grafana/tempo_api.py +124 -0
  125. holmes/plugins/toolsets/grafana/toolset_grafana.py +102 -0
  126. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +102 -0
  127. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +10 -0
  128. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +299 -0
  129. holmes/plugins/toolsets/grafana/trace_parser.py +195 -0
  130. holmes/plugins/toolsets/helm.yaml +42 -0
  131. holmes/plugins/toolsets/internet/internet.py +275 -0
  132. holmes/plugins/toolsets/internet/notion.py +137 -0
  133. holmes/plugins/toolsets/kafka.py +638 -0
  134. holmes/plugins/toolsets/kubernetes.yaml +255 -0
  135. holmes/plugins/toolsets/kubernetes_logs.py +426 -0
  136. holmes/plugins/toolsets/kubernetes_logs.yaml +42 -0
  137. holmes/plugins/toolsets/logging_utils/__init__.py +0 -0
  138. holmes/plugins/toolsets/logging_utils/logging_api.py +217 -0
  139. holmes/plugins/toolsets/logging_utils/types.py +0 -0
  140. holmes/plugins/toolsets/mcp/toolset_mcp.py +135 -0
  141. holmes/plugins/toolsets/newrelic.py +222 -0
  142. holmes/plugins/toolsets/opensearch/__init__.py +0 -0
  143. holmes/plugins/toolsets/opensearch/opensearch.py +245 -0
  144. holmes/plugins/toolsets/opensearch/opensearch_logs.py +151 -0
  145. holmes/plugins/toolsets/opensearch/opensearch_traces.py +211 -0
  146. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +12 -0
  147. holmes/plugins/toolsets/opensearch/opensearch_utils.py +166 -0
  148. holmes/plugins/toolsets/prometheus/prometheus.py +818 -0
  149. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +38 -0
  150. holmes/plugins/toolsets/rabbitmq/api.py +398 -0
  151. holmes/plugins/toolsets/rabbitmq/rabbitmq_instructions.jinja2 +37 -0
  152. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +222 -0
  153. holmes/plugins/toolsets/robusta/__init__.py +0 -0
  154. holmes/plugins/toolsets/robusta/robusta.py +235 -0
  155. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +24 -0
  156. holmes/plugins/toolsets/runbook/__init__.py +0 -0
  157. holmes/plugins/toolsets/runbook/runbook_fetcher.py +78 -0
  158. holmes/plugins/toolsets/service_discovery.py +92 -0
  159. holmes/plugins/toolsets/servicenow/install.md +37 -0
  160. holmes/plugins/toolsets/servicenow/instructions.jinja2 +3 -0
  161. holmes/plugins/toolsets/servicenow/servicenow.py +198 -0
  162. holmes/plugins/toolsets/slab.yaml +20 -0
  163. holmes/plugins/toolsets/utils.py +137 -0
  164. holmes/plugins/utils.py +14 -0
  165. holmes/utils/__init__.py +0 -0
  166. holmes/utils/cache.py +84 -0
  167. holmes/utils/cert_utils.py +40 -0
  168. holmes/utils/default_toolset_installation_guide.jinja2 +44 -0
  169. holmes/utils/definitions.py +13 -0
  170. holmes/utils/env.py +53 -0
  171. holmes/utils/file_utils.py +56 -0
  172. holmes/utils/global_instructions.py +20 -0
  173. holmes/utils/holmes_status.py +22 -0
  174. holmes/utils/holmes_sync_toolsets.py +80 -0
  175. holmes/utils/markdown_utils.py +55 -0
  176. holmes/utils/pydantic_utils.py +54 -0
  177. holmes/utils/robusta.py +10 -0
  178. holmes/utils/tags.py +97 -0
  179. holmesgpt-0.11.5.dist-info/LICENSE.txt +21 -0
  180. holmesgpt-0.11.5.dist-info/METADATA +400 -0
  181. holmesgpt-0.11.5.dist-info/RECORD +183 -0
  182. holmesgpt-0.11.5.dist-info/WHEEL +4 -0
  183. holmesgpt-0.11.5.dist-info/entry_points.txt +3 -0
File without changes
@@ -0,0 +1,54 @@
1
+ import logging
2
+ from typing import Any, ClassVar, Tuple, Type
3
+
4
+ from holmes.core.tools import CallablePrerequisite, Tool, Toolset, ToolsetTag
5
+ from holmes.plugins.toolsets.consts import TOOLSET_CONFIG_MISSING_ERROR
6
+ from holmes.plugins.toolsets.grafana.common import GrafanaConfig
7
+
8
+ from holmes.plugins.toolsets.grafana.grafana_api import grafana_health_check
9
+
10
+
11
class BaseGrafanaToolset(Toolset):
    """Shared base class for toolsets that talk to Grafana.

    The constructor registers ``prerequisites_callable`` as a callable
    prerequisite; that callable parses the raw config into ``config_class``
    and runs ``grafana_health_check`` against the result.
    """

    # Model class used to parse the raw toolset config dict.
    config_class: ClassVar[Type[GrafanaConfig]] = GrafanaConfig

    def __init__(
        self,
        name: str,
        description: str,
        icon_url: str,
        tools: list[Tool],
        docs_url: str,
    ):
        health_prerequisite = CallablePrerequisite(
            callable=self.prerequisites_callable
        )
        super().__init__(
            name=name,
            description=description,
            icon_url=icon_url,
            docs_url=docs_url,
            prerequisites=[health_prerequisite],
            tools=tools,
            tags=[ToolsetTag.CORE],
            enabled=False,
        )

    def prerequisites_callable(self, config: dict[str, Any]) -> Tuple[bool, str]:
        """Parse ``config`` and check that Grafana is reachable.

        Returns:
            ``(True, "")`` style tuple from ``grafana_health_check`` on
            success, otherwise ``(False, <reason>)``. Any exception raised
            while parsing the config or running the check is logged and
            turned into a failure tuple.
        """
        if config:
            try:
                self._grafana_config = self.config_class(**config)
                return grafana_health_check(self._grafana_config)
            except Exception as e:
                logging.exception(f"Failed to set up grafana toolset {self.name}")
                return False, str(e)

        logging.debug(f"Grafana config not provided {self.name}")
        return False, TOOLSET_CONFIG_MISSING_ERROR

    def get_example_config(self):
        """Return a placeholder ``GrafanaConfig`` dumped to a plain dict."""
        return GrafanaConfig(
            api_key="YOUR API KEY",
            url="YOUR GRAFANA URL",
            grafana_datasource_uid="UID OF DATASOURCE IN GRAFANA",
        ).model_dump()
@@ -0,0 +1,68 @@
1
+ import json
2
+ from typing import Dict, Optional
3
+ from pydantic import BaseModel
4
+ import datetime
5
+
6
+ from holmes.core.tools import StructuredToolResult, ToolResultStatus
7
+
8
+
9
class GrafanaConfig(BaseModel):
    """A config that represents one of the Grafana related tools like Loki or Tempo.

    If `grafana_datasource_uid` is set, then it is assumed that Holmes will proxy all
    requests through Grafana. In this case `url` should be the Grafana URL.
    If `grafana_datasource_uid` is not set, it is assumed that the `url` is the
    system's URL.
    """

    # Passed to build_headers, which sends it as a Bearer token when set.
    api_key: Optional[str] = None
    # Extra HTTP headers; build_headers applies them after the defaults.
    headers: Optional[Dict[str, str]] = None
    url: str
    # When set, get_base_url routes through Grafana's datasource proxy path.
    grafana_datasource_uid: Optional[str] = None
    external_url: Optional[str] = None
    # Path segment appended to the base URL by grafana_health_check.
    healthcheck: Optional[str] = "ready"
23
+
24
+
25
def build_headers(api_key: Optional[str], additional_headers: Optional[Dict[str, str]]):
    """Assemble the HTTP headers for a Grafana/Loki/Tempo request.

    Starts from JSON ``Accept``/``Content-Type`` headers, adds a Bearer
    ``Authorization`` header when ``api_key`` is truthy, then overlays
    ``additional_headers`` (which therefore win on key collisions).
    """
    json_headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
    }
    auth_headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
    extra_headers = additional_headers if additional_headers else {}
    # Later mappings override earlier ones, matching a sequential update().
    return {**json_headers, **auth_headers, **extra_headers}
37
+
38
+
39
def format_log(log: Dict) -> str:
    """Render one parsed log entry as ``<UTC timestamp> <log line>``.

    Args:
        log: Entry with a ``"timestamp"`` (epoch nanoseconds, int or numeric
            string) and a ``"log"`` line.

    Returns:
        The log line prefixed with an ISO-8601 UTC timestamp, or the JSON
        dump of the whole entry when it has no (truthy) timestamp.
    """
    timestamp_nanoseconds = log.get("timestamp")
    if not timestamp_nanoseconds:
        return json.dumps(log)

    timestamp_seconds = int(timestamp_nanoseconds) // 1_000_000_000
    # The output carries a "Z" suffix, so the conversion must happen in UTC
    # rather than in the host's local timezone.
    dt = datetime.datetime.fromtimestamp(
        timestamp_seconds, tz=datetime.timezone.utc
    )
    return dt.strftime("%Y-%m-%dT%H:%M:%SZ") + " " + log.get("log", "")
50
+
51
+
52
def get_base_url(config: GrafanaConfig) -> str:
    """Return the URL prefix that API paths should be appended to.

    When ``config.grafana_datasource_uid`` is set, the result is the Grafana
    datasource-proxy path under ``config.url``; otherwise it is ``config.url``
    unchanged.
    """
    if not config.grafana_datasource_uid:
        return config.url
    return f"{config.url}/api/datasources/proxy/uid/{config.grafana_datasource_uid}"
57
+
58
+
59
def ensure_grafana_uid_or_return_error_result(
    config: GrafanaConfig,
) -> Optional[StructuredToolResult]:
    """Check that the config names a Grafana datasource.

    Returns:
        ``None`` when ``config.grafana_datasource_uid`` is set, otherwise an
        ERROR ``StructuredToolResult`` the caller can return as-is.
    """
    if config.grafana_datasource_uid:
        return None
    return StructuredToolResult(
        status=ToolResultStatus.ERROR,
        # The original message was misspelled and stopped mid-sentence.
        error="This tool only works when the toolset is configured with a grafana_datasource_uid",
    )
@@ -0,0 +1,31 @@
1
+ import logging
2
+ import requests # type: ignore
3
+ from typing import Tuple
4
+ import backoff
5
+
6
+ from holmes.plugins.toolsets.grafana.common import (
7
+ GrafanaConfig,
8
+ build_headers,
9
+ get_base_url,
10
+ )
11
+
12
+
13
# NOTE(review): the function body catches every exception and returns a
# failure tuple, so this decorator never observes an exception and no retry
# actually happens. Left as-is to keep the check to a single attempt.
@backoff.on_exception(
    backoff.expo,  # Exponential backoff
    requests.exceptions.RequestException,  # Retry on request exceptions
    max_tries=5,  # Maximum retries
    giveup=lambda e: isinstance(e, requests.exceptions.HTTPError)
    and e.response.status_code < 500,
)
def grafana_health_check(config: GrafanaConfig) -> Tuple[bool, str]:
    """GET ``<base_url>/<config.healthcheck>`` and report whether it succeeded.

    Args:
        config: Connection settings; ``api_key`` and ``headers`` are both
            sent with the request.

    Returns:
        ``(True, "")`` when the endpoint answers with a non-error status,
        otherwise ``(False, <message including the URL and the error>)``.
    """
    base_url = get_base_url(config)
    url = f"{base_url}/{config.healthcheck}"
    try:
        # Send the configured custom headers as well: deployments that
        # authenticate through them would otherwise fail this check even
        # though the real queries (which do send them) work.
        headers_ = build_headers(
            api_key=config.api_key, additional_headers=config.headers
        )

        response = requests.get(url, headers=headers_, timeout=10)
        response.raise_for_status()
        return True, ""
    except Exception as e:
        logging.error(f"Failed to fetch grafana health status at {url}", exc_info=True)
        return False, f"Failed to fetch grafana health status at {url}. {str(e)}"
@@ -0,0 +1,89 @@
1
+ from typing import Dict, List, Optional, Union
2
+
3
+ import backoff
4
+ import requests # type: ignore
5
+
6
+ from holmes.plugins.toolsets.grafana.common import build_headers
7
+
8
+
9
def parse_loki_response(results: List[Dict]) -> List[Dict]:
    """
    Flatten Loki stream results into one dict per log line.

    Args:
        results: Raw stream entries; each may carry a ``"stream"`` label
            mapping and a ``"values"`` list of ``(timestamp, line)`` pairs.

    Returns:
        Dicts with ``"timestamp"``, ``"log"`` and ``"labels"`` keys, in the
        order the values appear in the input.
    """
    return [
        {"timestamp": ts, "log": line, "labels": labels}
        for entry in results
        # Single-item tuple binds the stream labels once per entry.
        for labels in (entry.get("stream", {}),)
        for ts, line in entry.get("values", [])
    ]
28
+
29
+
30
def execute_loki_query(
    base_url: str,
    api_key: Optional[str],
    headers: Optional[Dict[str, str]],
    query: str,
    start: Union[int, str],
    end: Union[int, str],
    limit: int,
    timeout: int = 30,
) -> List[Dict]:
    """Run a LogQL range query against ``<base_url>/loki/api/v1/query_range``.

    Args:
        base_url: URL prefix of the Loki API (direct or via Grafana proxy).
        api_key: Optional Bearer token.
        headers: Optional extra HTTP headers.
        query: LogQL query string.
        start: Range start, forwarded as the ``start`` query parameter.
        end: Range end, forwarded as the ``end`` query parameter.
        limit: Maximum number of entries, forwarded as ``limit``.
        timeout: Per-request timeout in seconds.

    Returns:
        Parsed log entries, or an empty list when the response has no
        ``data.result`` section.

    Raises:
        Exception: When the request still fails after retries, or the
            response cannot be decoded.
    """
    url = f"{base_url}/loki/api/v1/query_range"
    params = {"query": query, "limit": limit, "start": start, "end": end}

    # The retry must wrap only the HTTP call: the except clause below turns
    # RequestException into a plain Exception, which backoff would not retry
    # if it were applied to this whole function.
    @backoff.on_exception(
        backoff.expo,  # Exponential backoff
        requests.exceptions.RequestException,  # Retry on request exceptions
        max_tries=5,  # Maximum retries
        giveup=lambda e: isinstance(e, requests.exceptions.HTTPError)
        and e.response.status_code < 500,  # 4xx responses are not retried
    )
    def make_request():
        response = requests.get(
            url,
            headers=build_headers(api_key=api_key, additional_headers=headers),
            params=params,  # type: ignore
            timeout=timeout,  # without it a stalled server blocks forever
        )
        response.raise_for_status()
        return response

    try:
        # Decode outside make_request so a malformed body is not retried.
        result = make_request().json()
    except requests.exceptions.RequestException as e:
        raise Exception(f"Failed to query Loki logs: {str(e)}") from e

    if "data" in result and "result" in result["data"]:
        return parse_loki_response(result["data"]["result"])
    return []
63
+
64
+
65
def _escape_logql_string(value: str) -> str:
    """Escape backslashes and double quotes for a double-quoted LogQL string."""
    return value.replace("\\", "\\\\").replace('"', '\\"')


def query_loki_logs_by_label(
    base_url: str,
    api_key: Optional[str],
    headers: Optional[Dict[str, str]],
    namespace: str,
    label_value: str,
    filter: Optional[str],
    start: Union[int, str],
    end: Union[int, str],
    label: str,
    namespace_search_key: str = "namespace",
    limit: int = 200,
) -> List[Dict]:
    """Query Loki for logs matching a namespace and one additional label.

    Builds the selector ``{<namespace_search_key>="<namespace>",
    <label>="<label_value>"}``, appends a ``|=`` line filter when ``filter``
    is given, and delegates to ``execute_loki_query``.

    Args:
        base_url: URL prefix of the Loki API.
        api_key: Optional Bearer token.
        headers: Optional extra HTTP headers.
        namespace: Value matched against ``namespace_search_key``.
        label_value: Value matched against ``label``.
        filter: Optional substring that log lines must contain.
        start: Range start.
        end: Range end.
        label: Name of the label matched against ``label_value``.
        namespace_search_key: Name of the namespace label.
        limit: Maximum number of entries to return.

    Returns:
        The parsed log entries returned by ``execute_loki_query``.
    """
    # Values are embedded in double-quoted strings; an unescaped quote or
    # backslash (common in free-text filters) would make the query invalid.
    selector = (
        f'{{{namespace_search_key}="{_escape_logql_string(namespace)}", '
        f'{label}="{_escape_logql_string(label_value)}"}}'
    )
    query = selector
    if filter:
        query += f' |= "{_escape_logql_string(filter)}"'
    return execute_loki_query(
        base_url=base_url,
        api_key=api_key,
        headers=headers,
        query=query,
        start=start,
        end=end,
        limit=limit,
    )
@@ -0,0 +1,124 @@
1
+ import requests # type: ignore
2
+ from typing import Dict, List, Optional
3
+ import backoff
4
+
5
+ from holmes.plugins.toolsets.grafana.common import build_headers
6
+ from holmes.plugins.toolsets.grafana.trace_parser import process_trace
7
+
8
+
9
def execute_tempo_query_with_retry(
    base_url: str,
    api_key: Optional[str],
    headers: Optional[Dict[str, str]],
    query_params: dict,
    retries: int = 3,
    timeout: int = 5,
):
    """
    POST a search request to ``<base_url>/api/search`` with retries and timeout.

    Args:
        base_url: URL prefix of the Tempo API.
        api_key: Optional Bearer token.
        headers: Optional extra HTTP headers.
        query_params: Search parameters, sent as the JSON request body.
        retries: Maximum number of attempts.
        timeout: Timeout for each request in seconds.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: When the request still fails once retries are exhausted.
    """
    url = f"{base_url}/api/search"

    def _is_client_error(exc) -> bool:
        # HTTP errors below 500 are not worth retrying.
        return (
            isinstance(exc, requests.exceptions.HTTPError)
            and exc.response.status_code < 500
        )

    def post_search():
        response = requests.post(
            url,
            headers=build_headers(api_key=api_key, additional_headers=headers),
            json=query_params,
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()

    post_search_with_retry = backoff.on_exception(
        backoff.expo,
        requests.exceptions.RequestException,
        max_tries=retries,
        giveup=_is_client_error,
    )(post_search)

    try:
        return post_search_with_retry()
    except requests.exceptions.RequestException as e:
        raise Exception(f"Request to Tempo API failed after retries: {e}")
52
+
53
+
54
def query_tempo_traces(
    base_url: str,
    api_key: Optional[str],
    headers: Optional[Dict[str, str]],
    query: Optional[str],
    start: int,
    end: int,
    limit: int,
) -> Dict:
    """Search Tempo for traces within a time range.

    ``start``, ``end`` and ``limit`` are sent as strings; ``query`` is added
    under the ``"q"`` key only when it is non-empty. Returns whatever
    ``execute_tempo_query_with_retry`` returns.
    """
    optional_query = {"q": query} if query else {}
    search_params = {
        "start": str(start),
        "end": str(end),
        "limit": str(limit),
        **optional_query,
    }
    return execute_tempo_query_with_retry(
        base_url=base_url,
        api_key=api_key,
        headers=headers,
        query_params=search_params,
    )
78
+
79
+
80
def query_tempo_trace_by_id(
    base_url: str,
    api_key: Optional[str],
    headers: Optional[Dict[str, str]],
    trace_id: str,
    key_labels: List[str],
    retries: int = 3,
    timeout: int = 5,
) -> str:
    """
    Fetch one trace from ``<base_url>/api/traces/<trace_id>`` with retries.

    Args:
        base_url: URL prefix of the Tempo API.
        api_key: Optional Bearer token.
        headers: Optional extra HTTP headers.
        trace_id: The trace ID to retrieve.
        key_labels: Passed through to ``process_trace`` with the response.
        retries: Maximum number of attempts.
        timeout: Timeout for each request in seconds.

    Returns:
        The result of ``process_trace`` for the fetched trace.

    Raises:
        Exception: When the request still fails once retries are exhausted.
    """
    url = f"{base_url}/api/traces/{trace_id}"

    def _is_client_error(exc) -> bool:
        # HTTP errors below 500 are not worth retrying.
        return (
            isinstance(exc, requests.exceptions.HTTPError)
            and exc.response.status_code < 500
        )

    def fetch_trace():
        response = requests.get(
            url,
            headers=build_headers(api_key=api_key, additional_headers=headers),
            timeout=timeout,
        )
        response.raise_for_status()
        return process_trace(response.json(), key_labels)

    fetch_trace_with_retry = backoff.on_exception(
        backoff.expo,
        requests.exceptions.RequestException,
        max_tries=retries,
        giveup=_is_client_error,
    )(fetch_trace)

    try:
        return fetch_trace_with_retry()
    except requests.exceptions.RequestException as e:
        raise Exception(
            f"Failed to retrieve trace by ID after retries: {e} \n for URL: {url}"
        )
@@ -0,0 +1,102 @@
1
+ from typing import Dict, List
2
+ from urllib.parse import urlencode, urljoin
3
+ from holmes.core.tools import Tool, ToolParameter
4
+ from holmes.plugins.toolsets.grafana.base_grafana_toolset import BaseGrafanaToolset
5
+ import requests # type: ignore
6
+ import logging
7
+
8
+
9
class ListAndBuildGrafanaDashboardURLs(Tool):
    """Tool that lists Grafana dashboards and builds a URL for each one."""

    def __init__(self, toolset: BaseGrafanaToolset):
        super().__init__(
            name="list_and_build_grafana_dashboard_urls",
            description="Lists all available Grafana dashboard urls",
            parameters={
                "cluster_name": ToolParameter(
                    description="The cluster name. Defaults to None.",
                    type="string",
                    required=False,
                ),
                "namespace": ToolParameter(
                    description="The namespace for filtering dashboards.",
                    type="string",
                    required=False,
                ),
                "node_name": ToolParameter(
                    description="The node name to filter for node-related dashboards.",
                    type="string",
                    required=False,
                ),
                "pod_name": ToolParameter(
                    description="The pod name to filter dashboards.",
                    type="string",
                    required=False,
                ),
            },
        )
        self._toolset = toolset

    def _invoke(self, params: Dict) -> str:  # type: ignore
        """Query Grafana's dashboard search API and format the results.

        Args:
            params: Optional ``cluster_name``, ``namespace``, ``pod_name``
                and ``node_name`` values, added to each URL as ``var-*``
                query parameters. When ``node_name`` is given, only
                dashboards whose title contains "node" are listed.

        Returns:
            One "Title/URL" entry per dashboard, "No dashboards found." when
            the list is empty, or an error message when the request fails.
        """
        config = self._toolset._grafana_config
        url = urljoin(config.url, "/api/search?query=&type=dash-db")

        # api_key is optional: only send Authorization when it is set, so we
        # never emit a literal "Bearer None". Custom headers are applied too.
        headers: Dict[str, str] = {}
        if config.api_key:
            headers["Authorization"] = f"Bearer {config.api_key}"
        if config.headers:
            headers.update(config.headers)

        # The query string does not depend on the dashboard, so build it once.
        # we add all params since if the dashboard isnt configured for a param
        # it will ignore it if it is added
        params_dict = {
            "var-cluster": params.get("cluster_name", ""),
            "var-namespace": params.get("namespace", ""),
            "var-pod": params.get("pod_name", ""),
            "var-node": params.get("node_name", ""),
            "var-datasource": config.grafana_datasource_uid,
            "refresh": "5s",
        }
        query_string = urlencode({k: v for k, v in params_dict.items() if v})
        node_only = bool(params.get("node_name"))

        try:
            # Bound the wait so an unresponsive Grafana cannot hang the tool.
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            dashboards = response.json()
            formatted_dashboards: List[str] = []
            base_url = config.external_url or config.url
            for dash in dashboards:
                dashboard_url = urljoin(
                    base_url,
                    f"/d/{dash['uid']}/{dash['uri'].split('/')[-1]}",
                )

                # If filtering for nodes, ensure only node-related dashboards are included
                if node_only and "node" not in dash["title"].lower():
                    continue

                if query_string:
                    dashboard_url = f"{dashboard_url}?{query_string}"

                formatted_dashboards.append(
                    f"Title: {dash['title']}\nURL: {dashboard_url}\n"
                )

            return "\n".join(formatted_dashboards) or "No dashboards found."
        except requests.RequestException as e:
            logging.error(f"Error fetching dashboards: {str(e)}")
            return f"Error fetching dashboards: {str(e)}"

    def get_parameterized_one_liner(self, params: Dict) -> str:
        """Return a one-line description of this invocation."""
        return f"Lists Grafana dashboards and builds URLs with parameters: {params}"
90
+
91
+
92
class GrafanaToolset(BaseGrafanaToolset):
    """The "grafana/grafana" toolset, exposing the dashboard-listing tool."""

    def __init__(self):
        dashboard_tools = [ListAndBuildGrafanaDashboardURLs(self)]
        super().__init__(
            name="grafana/grafana",
            description="Provides tools for interacting with Grafana dashboards",
            icon_url="https://w7.pngwing.com/pngs/434/923/png-transparent-grafana-hd-logo-thumbnail.png",
            docs_url="",
            tools=dashboard_tools,
        )
@@ -0,0 +1,102 @@
1
+ from typing import Any, cast
2
+ from pydantic import BaseModel
3
+
4
+ from holmes.core.tools import CallablePrerequisite
5
+ from holmes.plugins.toolsets.grafana.common import (
6
+ GrafanaConfig,
7
+ format_log,
8
+ get_base_url,
9
+ )
10
+ from holmes.plugins.toolsets.grafana.grafana_api import grafana_health_check
11
+ from holmes.plugins.toolsets.logging_utils.logging_api import (
12
+ BasePodLoggingToolset,
13
+ FetchPodLogsParams,
14
+ PodLoggingTool,
15
+ )
16
+ from holmes.plugins.toolsets.utils import (
17
+ process_timestamps_to_rfc3339,
18
+ )
19
+
20
+ from holmes.plugins.toolsets.grafana.loki_api import (
21
+ query_loki_logs_by_label,
22
+ )
23
+ from holmes.core.tools import StructuredToolResult, ToolResultStatus
24
+
25
# Passed as default_time_span_seconds to process_timestamps_to_rfc3339 when
# resolving the start/end of a pod log query (3600 s = one hour).
DEFAULT_TIME_SPAN_SECONDS = 3600
26
+
27
+
28
class GrafanaLokiLabelsConfig(BaseModel):
    """Names of the Loki labels used to select logs by pod and namespace."""

    pod: str = "pod"  # label matched against the requested pod name
    namespace: str = "namespace"  # label matched against the requested namespace
31
+
32
+
33
class GrafanaLokiConfig(GrafanaConfig):
    """Grafana connection config extended with the Loki label names to query."""

    labels: GrafanaLokiLabelsConfig = GrafanaLokiLabelsConfig()
35
+
36
+
37
class GrafanaLokiToolset(BasePodLoggingToolset):
    """The "grafana/loki" toolset: fetches Kubernetes pod logs from Loki."""

    def __init__(self):
        super().__init__(
            name="grafana/loki",
            description="Fetches kubernetes pods logs from Loki",
            icon_url="https://grafana.com/media/docs/loki/logo-grafana-loki.png",
            docs_url="https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/grafanaloki.html",
            prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
            tools=[
                PodLoggingTool(self),
            ],
        )

    def prerequisites_callable(self, config: dict[str, Any]) -> tuple[bool, str]:
        """Parse ``config`` and check that the Loki endpoint is reachable.

        Returns:
            ``(False, <reason>)`` when the config is missing or invalid,
            otherwise the result of ``grafana_health_check``.
        """
        if not config:
            return False, "Missing Grafana Loki configuration. Check your config."

        # Report an invalid config as a failed prerequisite instead of
        # letting the validation error escape, as BaseGrafanaToolset does.
        try:
            self.config = GrafanaLokiConfig(**config)
        except Exception as e:
            return False, f"Invalid Grafana Loki configuration: {e}"

        return grafana_health_check(self.config)

    def get_example_config(self):
        """Return a placeholder ``GrafanaLokiConfig`` dumped to a plain dict."""
        example_config = GrafanaLokiConfig(
            api_key="YOUR API KEY",
            url="YOUR GRAFANA URL",
            grafana_datasource_uid="<UID of the loki datasource to use>",
        )
        return example_config.model_dump()

    @property
    def grafana_config(self) -> GrafanaLokiConfig:
        """The stored config, typed as ``GrafanaLokiConfig``."""
        return cast(GrafanaLokiConfig, self.config)

    def fetch_pod_logs(self, params: FetchPodLogsParams) -> StructuredToolResult:
        """Fetch logs for one pod from Loki.

        Resolves the time range from ``params`` (falling back to
        ``DEFAULT_TIME_SPAN_SECONDS``), queries Loki by the configured
        namespace and pod labels, and returns the lines sorted by timestamp.

        Returns:
            A SUCCESS result whose data is the newline-joined formatted logs,
            or a NO_DATA result when the query returned nothing.
        """
        (start, end) = process_timestamps_to_rfc3339(
            start_timestamp=params.start_time,
            end_timestamp=params.end_time,
            default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS,
        )

        base_url = get_base_url(self.grafana_config)
        logs = query_loki_logs_by_label(
            base_url=base_url,
            api_key=self.grafana_config.api_key,
            headers=self.grafana_config.headers,
            filter=params.filter,
            namespace=params.namespace,
            namespace_search_key=self.grafana_config.labels.namespace,
            label=self.grafana_config.labels.pod,
            label_value=params.pod_name,
            start=start,
            end=end,
            limit=params.limit or 2000,
        )
        if logs:
            logs.sort(key=lambda x: x["timestamp"])
            return StructuredToolResult(
                status=ToolResultStatus.SUCCESS,
                data="\n".join([format_log(log) for log in logs]),
                params=params.model_dump(),
            )
        else:
            return StructuredToolResult(
                status=ToolResultStatus.NO_DATA,
                params=params.model_dump(),
            )
@@ -0,0 +1,10 @@
1
+ Use Tempo when investigating latency or performance issues. Tempo provides trace information for applications running on the cluster.
2
+ Assume every application provides tempo traces.
3
+ 1. Start by identifying an initial filter to use. This can be a pod name, a deployment name or a service name
4
+ 2. Use `fetch_tempo_traces` setting the appropriate query params
5
+ - Use the min_duration filter to ensure you get traces that trigger the alert when you are investigating a performance issue
6
+ - If possible, use start and end date to narrow down your search.
7
+ - Use fetch_finding_by_id if you are provided with a finding/alert id. It will contain details about when the alert was triggered
8
+ - Use at least one of the following arguments to ensure you get relevant traces: `service_name`, `pod_name` or `deployment_name`.
9
+ 3. Look at the duration of each span in any single trace and deduce any issues.
10
+ 4. ALWAYS fetch the logs for a pod once you identify a span that is taking a long time. There may be an explanation for the slowness in the logs.