holmesgpt 0.11.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of holmesgpt might be problematic. Click here for more details.

Files changed (183) hide show
  1. holmes/.git_archival.json +7 -0
  2. holmes/__init__.py +76 -0
  3. holmes/__init__.py.bak +76 -0
  4. holmes/clients/robusta_client.py +24 -0
  5. holmes/common/env_vars.py +47 -0
  6. holmes/config.py +526 -0
  7. holmes/core/__init__.py +0 -0
  8. holmes/core/conversations.py +578 -0
  9. holmes/core/investigation.py +152 -0
  10. holmes/core/investigation_structured_output.py +264 -0
  11. holmes/core/issue.py +54 -0
  12. holmes/core/llm.py +250 -0
  13. holmes/core/models.py +157 -0
  14. holmes/core/openai_formatting.py +51 -0
  15. holmes/core/performance_timing.py +72 -0
  16. holmes/core/prompt.py +42 -0
  17. holmes/core/resource_instruction.py +17 -0
  18. holmes/core/runbooks.py +26 -0
  19. holmes/core/safeguards.py +120 -0
  20. holmes/core/supabase_dal.py +540 -0
  21. holmes/core/tool_calling_llm.py +798 -0
  22. holmes/core/tools.py +566 -0
  23. holmes/core/tools_utils/__init__.py +0 -0
  24. holmes/core/tools_utils/tool_executor.py +65 -0
  25. holmes/core/tools_utils/toolset_utils.py +52 -0
  26. holmes/core/toolset_manager.py +418 -0
  27. holmes/interactive.py +229 -0
  28. holmes/main.py +1041 -0
  29. holmes/plugins/__init__.py +0 -0
  30. holmes/plugins/destinations/__init__.py +6 -0
  31. holmes/plugins/destinations/slack/__init__.py +2 -0
  32. holmes/plugins/destinations/slack/plugin.py +163 -0
  33. holmes/plugins/interfaces.py +32 -0
  34. holmes/plugins/prompts/__init__.py +48 -0
  35. holmes/plugins/prompts/_current_date_time.jinja2 +1 -0
  36. holmes/plugins/prompts/_default_log_prompt.jinja2 +11 -0
  37. holmes/plugins/prompts/_fetch_logs.jinja2 +36 -0
  38. holmes/plugins/prompts/_general_instructions.jinja2 +86 -0
  39. holmes/plugins/prompts/_global_instructions.jinja2 +12 -0
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +13 -0
  41. holmes/plugins/prompts/_toolsets_instructions.jinja2 +56 -0
  42. holmes/plugins/prompts/generic_ask.jinja2 +36 -0
  43. holmes/plugins/prompts/generic_ask_conversation.jinja2 +32 -0
  44. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +50 -0
  45. holmes/plugins/prompts/generic_investigation.jinja2 +42 -0
  46. holmes/plugins/prompts/generic_post_processing.jinja2 +13 -0
  47. holmes/plugins/prompts/generic_ticket.jinja2 +12 -0
  48. holmes/plugins/prompts/investigation_output_format.jinja2 +32 -0
  49. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +84 -0
  50. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +39 -0
  51. holmes/plugins/runbooks/README.md +22 -0
  52. holmes/plugins/runbooks/__init__.py +100 -0
  53. holmes/plugins/runbooks/catalog.json +14 -0
  54. holmes/plugins/runbooks/jira.yaml +12 -0
  55. holmes/plugins/runbooks/kube-prometheus-stack.yaml +10 -0
  56. holmes/plugins/runbooks/networking/dns_troubleshooting_instructions.md +66 -0
  57. holmes/plugins/runbooks/upgrade/upgrade_troubleshooting_instructions.md +44 -0
  58. holmes/plugins/sources/github/__init__.py +77 -0
  59. holmes/plugins/sources/jira/__init__.py +123 -0
  60. holmes/plugins/sources/opsgenie/__init__.py +93 -0
  61. holmes/plugins/sources/pagerduty/__init__.py +147 -0
  62. holmes/plugins/sources/prometheus/__init__.py +0 -0
  63. holmes/plugins/sources/prometheus/models.py +104 -0
  64. holmes/plugins/sources/prometheus/plugin.py +154 -0
  65. holmes/plugins/toolsets/__init__.py +171 -0
  66. holmes/plugins/toolsets/aks-node-health.yaml +65 -0
  67. holmes/plugins/toolsets/aks.yaml +86 -0
  68. holmes/plugins/toolsets/argocd.yaml +70 -0
  69. holmes/plugins/toolsets/atlas_mongodb/instructions.jinja2 +8 -0
  70. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +307 -0
  71. holmes/plugins/toolsets/aws.yaml +76 -0
  72. holmes/plugins/toolsets/azure_sql/__init__.py +0 -0
  73. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +600 -0
  74. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +309 -0
  75. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +445 -0
  76. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +251 -0
  77. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +317 -0
  78. holmes/plugins/toolsets/azure_sql/azure_base_toolset.py +55 -0
  79. holmes/plugins/toolsets/azure_sql/azure_sql_instructions.jinja2 +137 -0
  80. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +183 -0
  81. holmes/plugins/toolsets/azure_sql/install.md +66 -0
  82. holmes/plugins/toolsets/azure_sql/tools/__init__.py +1 -0
  83. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +324 -0
  84. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +243 -0
  85. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +205 -0
  86. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +249 -0
  87. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +373 -0
  88. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +237 -0
  89. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +172 -0
  90. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +170 -0
  91. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +188 -0
  92. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +180 -0
  93. holmes/plugins/toolsets/azure_sql/utils.py +83 -0
  94. holmes/plugins/toolsets/bash/__init__.py +0 -0
  95. holmes/plugins/toolsets/bash/bash_instructions.jinja2 +14 -0
  96. holmes/plugins/toolsets/bash/bash_toolset.py +208 -0
  97. holmes/plugins/toolsets/bash/common/bash.py +52 -0
  98. holmes/plugins/toolsets/bash/common/config.py +14 -0
  99. holmes/plugins/toolsets/bash/common/stringify.py +25 -0
  100. holmes/plugins/toolsets/bash/common/validators.py +24 -0
  101. holmes/plugins/toolsets/bash/grep/__init__.py +52 -0
  102. holmes/plugins/toolsets/bash/kubectl/__init__.py +100 -0
  103. holmes/plugins/toolsets/bash/kubectl/constants.py +96 -0
  104. holmes/plugins/toolsets/bash/kubectl/kubectl_describe.py +66 -0
  105. holmes/plugins/toolsets/bash/kubectl/kubectl_events.py +88 -0
  106. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +108 -0
  107. holmes/plugins/toolsets/bash/kubectl/kubectl_logs.py +20 -0
  108. holmes/plugins/toolsets/bash/kubectl/kubectl_run.py +46 -0
  109. holmes/plugins/toolsets/bash/kubectl/kubectl_top.py +81 -0
  110. holmes/plugins/toolsets/bash/parse_command.py +103 -0
  111. holmes/plugins/toolsets/confluence.yaml +19 -0
  112. holmes/plugins/toolsets/consts.py +5 -0
  113. holmes/plugins/toolsets/coralogix/api.py +158 -0
  114. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +103 -0
  115. holmes/plugins/toolsets/coralogix/utils.py +181 -0
  116. holmes/plugins/toolsets/datadog.py +153 -0
  117. holmes/plugins/toolsets/docker.yaml +46 -0
  118. holmes/plugins/toolsets/git.py +756 -0
  119. holmes/plugins/toolsets/grafana/__init__.py +0 -0
  120. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +54 -0
  121. holmes/plugins/toolsets/grafana/common.py +68 -0
  122. holmes/plugins/toolsets/grafana/grafana_api.py +31 -0
  123. holmes/plugins/toolsets/grafana/loki_api.py +89 -0
  124. holmes/plugins/toolsets/grafana/tempo_api.py +124 -0
  125. holmes/plugins/toolsets/grafana/toolset_grafana.py +102 -0
  126. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +102 -0
  127. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +10 -0
  128. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +299 -0
  129. holmes/plugins/toolsets/grafana/trace_parser.py +195 -0
  130. holmes/plugins/toolsets/helm.yaml +42 -0
  131. holmes/plugins/toolsets/internet/internet.py +275 -0
  132. holmes/plugins/toolsets/internet/notion.py +137 -0
  133. holmes/plugins/toolsets/kafka.py +638 -0
  134. holmes/plugins/toolsets/kubernetes.yaml +255 -0
  135. holmes/plugins/toolsets/kubernetes_logs.py +426 -0
  136. holmes/plugins/toolsets/kubernetes_logs.yaml +42 -0
  137. holmes/plugins/toolsets/logging_utils/__init__.py +0 -0
  138. holmes/plugins/toolsets/logging_utils/logging_api.py +217 -0
  139. holmes/plugins/toolsets/logging_utils/types.py +0 -0
  140. holmes/plugins/toolsets/mcp/toolset_mcp.py +135 -0
  141. holmes/plugins/toolsets/newrelic.py +222 -0
  142. holmes/plugins/toolsets/opensearch/__init__.py +0 -0
  143. holmes/plugins/toolsets/opensearch/opensearch.py +245 -0
  144. holmes/plugins/toolsets/opensearch/opensearch_logs.py +151 -0
  145. holmes/plugins/toolsets/opensearch/opensearch_traces.py +211 -0
  146. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +12 -0
  147. holmes/plugins/toolsets/opensearch/opensearch_utils.py +166 -0
  148. holmes/plugins/toolsets/prometheus/prometheus.py +818 -0
  149. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +38 -0
  150. holmes/plugins/toolsets/rabbitmq/api.py +398 -0
  151. holmes/plugins/toolsets/rabbitmq/rabbitmq_instructions.jinja2 +37 -0
  152. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +222 -0
  153. holmes/plugins/toolsets/robusta/__init__.py +0 -0
  154. holmes/plugins/toolsets/robusta/robusta.py +235 -0
  155. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +24 -0
  156. holmes/plugins/toolsets/runbook/__init__.py +0 -0
  157. holmes/plugins/toolsets/runbook/runbook_fetcher.py +78 -0
  158. holmes/plugins/toolsets/service_discovery.py +92 -0
  159. holmes/plugins/toolsets/servicenow/install.md +37 -0
  160. holmes/plugins/toolsets/servicenow/instructions.jinja2 +3 -0
  161. holmes/plugins/toolsets/servicenow/servicenow.py +198 -0
  162. holmes/plugins/toolsets/slab.yaml +20 -0
  163. holmes/plugins/toolsets/utils.py +137 -0
  164. holmes/plugins/utils.py +14 -0
  165. holmes/utils/__init__.py +0 -0
  166. holmes/utils/cache.py +84 -0
  167. holmes/utils/cert_utils.py +40 -0
  168. holmes/utils/default_toolset_installation_guide.jinja2 +44 -0
  169. holmes/utils/definitions.py +13 -0
  170. holmes/utils/env.py +53 -0
  171. holmes/utils/file_utils.py +56 -0
  172. holmes/utils/global_instructions.py +20 -0
  173. holmes/utils/holmes_status.py +22 -0
  174. holmes/utils/holmes_sync_toolsets.py +80 -0
  175. holmes/utils/markdown_utils.py +55 -0
  176. holmes/utils/pydantic_utils.py +54 -0
  177. holmes/utils/robusta.py +10 -0
  178. holmes/utils/tags.py +97 -0
  179. holmesgpt-0.11.5.dist-info/LICENSE.txt +21 -0
  180. holmesgpt-0.11.5.dist-info/METADATA +400 -0
  181. holmesgpt-0.11.5.dist-info/RECORD +183 -0
  182. holmesgpt-0.11.5.dist-info/WHEEL +4 -0
  183. holmesgpt-0.11.5.dist-info/entry_points.txt +3 -0
File without changes
@@ -0,0 +1,6 @@
1
+ from strenum import StrEnum
2
+
3
+
4
class DestinationType(StrEnum):
    """Names of the supported destination kinds, usable as plain strings."""

    # Selects Slack delivery.
    # NOTE(review): presumably handled by the Slack destination plugin - confirm against callers.
    SLACK = "slack"
    # NOTE(review): presumably means "print to the local terminal" - confirm against callers.
    CLI = "cli"
@@ -0,0 +1,2 @@
1
+ # ruff: noqa: F401
2
+ from .plugin import SlackDestination
@@ -0,0 +1,163 @@
1
+ import logging
2
+
3
+ from slack_sdk import WebClient
4
+ from slack_sdk.errors import SlackApiError
5
+
6
+ from holmes.core.issue import Issue, IssueStatus
7
+ from holmes.core.tool_calling_llm import LLMResult
8
+ from holmes.plugins.interfaces import DestinationPlugin
9
+
10
+
11
class SlackDestination(DestinationPlugin):
    """Destination plugin that publishes an investigation result to Slack.

    The LLM answer is posted to the configured channel as a coloured
    attachment. Tool outputs, issue metadata and the raw prompt are then
    posted as replies in that message's thread.
    """

    def __init__(self, token, channel):
        """Remember the token and channel and build the Slack ``WebClient``."""
        self.token = token
        self.channel = channel
        self.client = WebClient(token=self.token)

    def send_issue(self, issue: Issue, result: LLMResult) -> None:
        """Post ``result`` for ``issue`` to the channel, followed by thread replies.

        Slack API errors are logged rather than raised; any other exception
        propagates to the caller.
        """
        status = issue.presentation_status

        # Red only while the issue is open; every other status (or none) is green.
        color = "#00FF00"
        if status == IssueStatus.OPEN:
            color = "#FF0000"

        title = f"{issue.name} - {status.value}" if status else f"{issue.name}"
        # Link the title to the issue when a URL is available.
        text = f"*<{issue.url}|{title}>*" if issue.url else f"*{title}*"

        # TODO: consider moving outside of block
        answer_block = {
            "type": "section",
            "text": {
                "type": "mrkdwn",
                "text": f":robot_face: {result.result}",
            },
        }
        blocks = [answer_block]

        key_metadata = issue.presentation_key_metadata
        if key_metadata:
            context_block = {
                "type": "context",
                "elements": [{"type": "mrkdwn", "text": key_metadata}],
            }
            blocks.append(context_block)  # type: ignore

        try:
            response = self.client.chat_postMessage(
                channel=self.channel,
                text=text,
                attachments=[{"color": color, "blocks": blocks}],
            )
            # The timestamp of the main message identifies the thread to reply in.
            thread_ts = response["ts"]
            self.__send_tool_usage(thread_ts, result)
            self.__send_issue_metadata(thread_ts, issue)
            self.__send_prompt_for_debugging(thread_ts, result)
        except SlackApiError as e:
            error_code = e.response.data["error"]
            if error_code == "channel_not_found":
                logging.error(
                    f"The channel {self.channel} is not found. Please check the value of --slack-channel"
                )
            elif error_code == "invalid_auth":
                logging.error(
                    "Unable to authenticate using the provided Slack token. Please verify the setting of --slack-token"
                )
            else:
                logging.error(f"Error sending message: {e}. message={text}")

    def __post_section_reply(self, parent_thread, text) -> None:
        """Post ``text`` as a single mrkdwn section inside ``parent_thread``."""
        self.client.chat_postMessage(
            channel=self.channel,
            thread_ts=parent_thread,
            text=text,
            blocks=[
                {
                    "type": "section",
                    "text": {"type": "mrkdwn", "text": text},
                }
            ],
        )

    def __send_tool_usage(self, parent_thread, result: LLMResult) -> None:
        """Upload each tool call's output as a file and list the links in the thread.

        Does nothing when ``result.tool_calls`` is empty.
        """
        if not result.tool_calls:
            return

        lines = ["*AI used info from alert and the following tools:*"]
        for tool in result.tool_calls:
            upload = self.client.files_upload_v2(
                content=tool.result, title=f"{tool.description}"
            )
            link = upload["file"]["permalink"]
            lines.append(f"• `<{link}|{tool.description}>`")

        self.__post_section_reply(parent_thread, "\n".join(lines))

    def __send_prompt_for_debugging(self, parent_thread, result: LLMResult) -> None:
        """Upload the prompt as a file and link it in the thread.

        Does nothing when ``result.prompt`` is empty.
        """
        if not result.prompt:
            return

        upload = self.client.files_upload_v2(content=result.prompt, title="ai-prompt")
        link = upload["file"]["permalink"]
        text = f"*🐞 DEBUG: messages with OpenAI*\n`<{link}|ai-prompt>`"

        self.__post_section_reply(parent_thread, text)

    def __send_issue_metadata(self, parent_thread, issue: Issue) -> None:
        """Upload the issue as JSON and post its metadata plus the file link in the thread.

        Does nothing when ``issue.presentation_all_metadata`` is empty.
        """
        if not issue.presentation_all_metadata:
            return

        filename = f"{issue.name}"
        upload = self.client.files_upload_v2(
            content=issue.model_dump_json(), title=filename
        )
        link = upload["file"]["permalink"]
        text = issue.presentation_all_metadata + f"\n<{link}|{filename}>\n"

        header_block = {
            "type": "section",
            "text": {
                "text": f"*{issue.source_type.capitalize()} Metadata*",
                "type": "mrkdwn",
            },
        }
        details_block = {
            "type": "context",
            "elements": [{"type": "mrkdwn", "text": text}],
        }
        self.client.chat_postMessage(
            channel=self.channel,
            thread_ts=parent_thread,
            text=text,
            unfurl_links=False,
            unfurl_media=True,
            blocks=[header_block, details_block],
        )
@@ -0,0 +1,32 @@
1
+ from typing import List, Iterable
2
+ from holmes.core.issue import Issue
3
+ from holmes.core.tool_calling_llm import LLMResult
4
+
5
+
6
+ # Sources must implement this
7
class SourcePlugin:
    """Base interface for issue sources.

    Every method here raises ``NotImplementedError``; concrete sources
    override the ones they support.
    """

    def fetch_issues(self) -> List[Issue]:
        """Return a list of issues; must be overridden by the source."""
        raise NotImplementedError

    def fetch_issue(self, id: str) -> Issue:
        """Return the single issue identified by ``id``; must be overridden."""
        raise NotImplementedError

    def stream_issues(self) -> Iterable[Issue]:
        """Optional hook: return an iterable of issues."""
        raise NotImplementedError

    def write_back_result(self, issue_id: str, result_data: LLMResult) -> None:
        """Optional hook: hand ``result_data`` for ``issue_id`` back to the source."""
        raise NotImplementedError
21
+
22
+
23
+ # Destinations must implement this
24
class DestinationPlugin:
    """Base interface for destinations that receive an issue and its result."""

    def send_issue(self, issue: Issue, result: LLMResult):
        """Deliver ``result`` for ``issue``; must be overridden by the destination."""
        raise NotImplementedError

    # TODO: possible future hooks, not implemented yet:
    # def send_grouped_issues(self, issues: List[Issue]):
    #     raise NotImplementedError()
    #
    # def group_issues()
    #     raise NotImplementedError()
@@ -0,0 +1,48 @@
1
+ import os
2
+ import os.path
3
+ from typing import Optional
4
+ from jinja2 import Environment, FileSystemLoader
5
+ from datetime import datetime, timezone
6
+
7
+ THIS_DIR = os.path.abspath(os.path.dirname(__file__))
8
+
9
+
10
def load_prompt(prompt: str) -> str:
    """Resolve a prompt reference to its text.

    ``prompt`` takes one of three forms:

    * ``builtin://<name>`` - read ``<name>`` from this package's directory
    * ``file://<path>`` - read ``<path>`` from the file system
    * anything else - treated as the literal prompt and returned unchanged

    Files are read as UTF-8. Errors from ``open`` (for example a missing
    file) propagate to the caller.
    """
    builtin_prefix = "builtin://"
    file_prefix = "file://"

    if prompt.startswith(builtin_prefix):
        path = os.path.join(THIS_DIR, prompt[len(builtin_prefix) :])
    elif prompt.startswith(file_prefix):
        path = prompt[len(file_prefix) :]
    else:
        return prompt

    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(path, encoding="utf-8") as prompt_file:
        return prompt_file.read()
25
+
26
+
27
def load_and_render_prompt(prompt: str, context: Optional[dict] = None) -> str:
    """Load a prompt and render it as a Jinja2 template.

    ``prompt`` is in the format 'builtin://' or 'file://' or a regular string;
    see load_prompt() for details.

    ``context`` is a dictionary of variables passed to the template. It is
    not modified: rendering uses a copy, to which ``now`` (current UTC time
    as a string) and ``now_timestamp_seconds`` (the same instant as integer
    epoch seconds) are added, overriding any caller-supplied values with
    those names.
    """
    prompt_as_str = load_prompt(prompt)

    # The loader lets templates {% include %} other files from this directory.
    env = Environment(
        loader=FileSystemLoader(THIS_DIR),
    )
    template = env.from_string(prompt_as_str)

    # Copy so the injected time variables never leak into the caller's dict.
    render_context = dict(context) if context is not None else {}

    now = datetime.now(timezone.utc)
    render_context.update(
        {"now": f"{now}", "now_timestamp_seconds": int(now.timestamp())}
    )

    return template.render(**render_context)
@@ -0,0 +1 @@
1
+ When querying tools, always query for the relevant time period. The current UTC date and time are {{ now }}. The current UTC timestamp in seconds is {{ now_timestamp_seconds }}.
@@ -0,0 +1,11 @@
1
+ * Use the tool `fetch_pod_logs` to access an application's logs
2
+ * Prior to fetching logs, ensure the pod exists using kubectl tools
3
+ * If you find no logs, double check that the namespace and pod names are exact. Use kubectl tools to find the right resource names and pod name.
4
+ * If you are not given the pod's namespace, look for existing pods using kubectl tools and infer the namespace that way
5
+ * If you are not given the pod's exact name, or only have an application name or a deployment name, look for related pods using kubectl commands. Ask the user if you can't infer the pod logs.
6
+ * Do fetch application logs yourself and do NOT ask users to do so
7
+ * If you have an issue id or finding id, use `fetch_finding_by_id` as it contains time information about the issue (`starts_at`, `updated_at` and `ends_at`).
8
+ ** Then, use `start_time=-300` (5 minutes before `end_time`) and `end_time=<issue start_at time>` when calling `fetch_pod_logs`.
9
+ ** If there are too many logs, or not enough, narrow or widen the timestamps
10
+ ** If looking for a specific keyword, use the `filter` argument
11
+ * If you are not provided with time information, ignore the `start_time` and `end_time`. The tool `fetch_pod_logs` will default to the latest logs.
@@ -0,0 +1,36 @@
1
+ {%- set loki_ts = toolsets | selectattr("name", "equalto", "grafana/loki") | first -%}
2
+ {%- set coralogix_ts = toolsets | selectattr("name", "equalto", "coralogix/logs") | first -%}
3
+ {%- set k8s_base_ts = toolsets | selectattr("name", "equalto", "kubernetes/logs") | selectattr("fetch_pod_logs", "defined") | first -%}
4
+ {%- set k8s_yaml_ts = toolsets | selectattr("name", "equalto", "kubernetes/logs") | rejectattr("fetch_pod_logs", "defined") | first -%}
5
+ {%- set opensearch_ts = toolsets | selectattr("name", "equalto", "opensearch/logs") | first -%}
6
+
7
+ # Logs
8
+ {% if loki_ts and loki_ts.status == "enabled" -%}
9
+ * For any logs, including for investigating kubernetes problems, use Loki
10
+ * Use the tool fetch_loki_logs_for_resource to get the logs of any kubernetes pod or node
11
+ * Use fetch_loki_logs and build a query for any other logs
12
+ * Prefer fetching loki logs and avoid using kubectl logs commands
13
+ * Before fetching logs through Loki, use `kubectl` commands to get the namespace and correct name of a resource
14
+ * If you have an issue id or finding id, use `fetch_finding_by_id` as it contains time information about the issue (`starts_at`, `updated_at` and `ends_at`).
15
+ ** Then, default to `start_timestamp=-300` (5 minutes before end_timestamp) and `end_timestamp=<issue start_at time>`.
16
+ ** If there are too many logs, or not enough, narrow or widen the timestamps
17
+ * If you are not provided with time information, ignore start_timestamp and end_timestamp. Loki will default to the latest logs.
18
+ {%- elif coralogix_ts and coralogix_ts.status == "enabled" -%}
19
+ {% include '_default_log_prompt.jinja2' %}
20
+ {%- elif k8s_base_ts and k8s_base_ts.status == "enabled" -%}
21
+ {% include '_default_log_prompt.jinja2' %}
22
+ {%- elif k8s_yaml_ts and k8s_yaml_ts.status == "enabled" -%}
23
+ * if the user wants to find a specific term in a pod's logs, use kubectl_logs_grep
24
+ * use both kubectl_previous_logs and kubectl_logs when reading logs. Treat the output of both as a single unified logs stream
25
+ * if a pod has multiple containers, make sure you fetch the logs for either all or relevant containers using one of the containers log functions like kubectl_logs_all_containers, kubectl_logs_all_containers_grep or any other.
26
+ * Check both kubectl_logs and kubectl_previous_logs because a pod restart means kubectl_logs may not have relevant logs
27
+ {%- elif opensearch_ts and opensearch_ts.status == "enabled" -%}
28
+ {% include '_default_log_prompt.jinja2' %}
29
+ {%- else -%}
30
+ * You have not been given access to tools to fetch kubernetes logs for nodes, pods, services or apps. This is likely a misconfiguration.
31
+ * If you need logs to answer questions or investigate issues, tell the user to consult the documentation and enable one of these toolsets:
32
+ ** 'kubernetes/logs'
33
+ ** 'grafana/loki'
34
+ ** 'opensearch/logs'
35
+ ** 'coralogix/logs'
36
+ {%- endif -%}
@@ -0,0 +1,86 @@
1
+ # In general
2
+
3
+ * when it can provide extra information, first run as many tools as you need to gather more information, then respond.
4
+ * if possible, do so repeatedly with different tool calls each time to gather more information.
5
+ * do not stop investigating until you are at the final root cause you are able to find.
6
+ * use the "five whys" methodology to find the root cause.
7
+ * for example, if you found a problem in microservice A that is due to an error in microservice B, look at microservice B too and find the error in that.
8
+ * if you cannot find the resource/application that the user referred to, assume they made a typo or included/excluded characters like - and.
9
+ * in this case, try to find substrings or search for the correct spellings
10
+ * always provide detailed information like exact resource names, versions, labels, etc
11
+ * even if you found the root cause, keep investigating to find other possible root causes and to gather data for the answer like exact names
12
+ * if a runbook url is present as well as a tool that can fetch it, you MUST fetch the runbook before beginning your investigation.
13
+ * if you don't know, say that the analysis was inconclusive.
14
+ * if there are multiple possible causes list them in a numbered list.
15
+ * there will often be errors in the data that are not relevant or that do not have an impact - ignore them in your conclusion if you were not able to tie them to an actual error.
16
+ * ALWAYS check the logs when checking if an app, pod, service or deployment is having issues. Something "running" and reporting healthy does not mean it is without issues.
17
+
18
+ # If investigating Kubernetes problems
19
+
20
+ * run as many kubectl commands as you need to gather more information, then respond.
21
+ * if possible, do so repeatedly on different Kubernetes objects.
22
+ * for example, for deployments first run kubectl on the deployment then a replicaset inside it, then a pod inside that.
23
+ * when investigating a pod that crashed or application errors, always run kubectl_describe and fetch the logs
24
+ * Do check both the status of the kubernetes resources and the application runtime as well, by investigating logs
25
+ * do not give an answer like "The pod is pending" as that doesn't state why the pod is pending and how to fix it.
26
+ * do not give an answer like "Pod's node affinity/selector doesn't match any available nodes" because that doesn't include data on WHICH label doesn't match
27
+ * if investigating an issue on many pods, there is no need to check more than 3 individual pods in the same deployment. pick up to a representative 3 from each deployment if relevant
28
+ * if the user says something isn't working, ALWAYS:
29
+ ** use kubectl_describe on the owner workload + individual pods and look for any transient issues they might have been referring to
30
+ ** look for misconfigured ingresses/services etc
31
+ ** check the application logs because there may be runtime issues
32
+
33
+ {% include '_toolsets_instructions.jinja2' %}
34
+
35
+ {% include '_fetch_logs.jinja2' %}
36
+
37
+ # Handling Permission Errors
38
+
39
+ If during the investigation you encounter a permissions error (e.g., `Error from server (Forbidden):`), **ALWAYS** follow these steps to ensure a thorough resolution:
40
+ 1.**Analyze the Error Message**
41
+ - Identify the missing resource, API group, and verbs from the error details.
42
+ - Never stop at reporting the error
43
+ - Proceed with an in-depth investigation.
44
+ 2.**Locate the Relevant Helm Release**
45
+ Check if Helm tools are available, if they are available always use Helm commands to help user find the release associated with the Holmes pod:
46
+ - Run `helm list -A | grep holmes` to identify the release name.
47
+ - Run `helm get values <RELEASE_NAME> -n <NAMESPACE>` to retrieve details such as `customClusterRoleRules` and `clusterName`.
48
+ If Helm tools are unavailable, skip this step.
49
+ 3. **Check for Missing Permissions**
50
+ - Check for a cluster role with <RELEASE_NAME>-holmes-cluster-role in its name and a service account with <RELEASE_NAME>-holmes-service-account in its name to troubleshoot missing permissions where release name is the name you found earlier if helm tools are available (If the exact cluster role or service account isn't found, search for similar or related names, including variations or prefixes/suffixes that might be used in the cluster.)
51
+ - Focus on identifying absent permissions that align with the error message.
52
+ 4. **Update the Configuration**
53
+ If necessary permissions are absent both in customClusterRoleRules and the cluster role mentioned previously, ALWAYS advise the user to update their configuration by modifying the `generated_values.yaml` file as follows:
54
+ ```
55
+ holmes:
56
+ customClusterRoleRules:
57
+ - apiGroups: ["<API_GROUP>"]
58
+ resources: ["<RESOURCE_1>", "<RESOURCE_2>"]
59
+ verbs: ["<VERB_1>", "<VERB_2>", "<VERB_3>"]
60
+ ```
61
+ After that, instruct them to apply the changes with:
62
+ ```
63
+ helm upgrade <RELEASE_NAME> robusta/robusta --values=generated_values.yaml --set clusterName=<YOUR_CLUSTER_NAME>
64
+ ```
65
+ 5. **Fallback Guidelines**
66
+ - If you cannot determine the release or cluster name, use placeholders `<RELEASE_NAME>` and `<YOUR_CLUSTER_NAME>`.
67
+ - While you should attempt to retrieve details using Helm commands, do **not** direct the user to execute these commands themselves.
68
+ Reminder:
69
+ * Always adhere to this process, even if Helm tools are unavailable.
70
+ * Strive for thoroughness and precision, ensuring the issue is fully addressed.
71
+
72
+ # Special cases and how to reply
73
+
74
+ * Make sure you differentiate between "I investigated and found error X caused this problem" and "I tried to investigate but while investigating I got some errors that prevented me from completing the investigation."
75
+ * As a special case of that, If a tool generates a permission error when attempting to run it, follow the Handling Permission Errors section for detailed guidance.
76
+ * That is different than - for example - fetching a pod's logs and seeing that the pod itself has permission errors. In that case, explain that permission errors are the cause of the problem and give details
77
+ * Issues are a subset of findings. When asked about an issue or a finding and you have an id, use the tool `fetch_finding_by_id`.
78
+ * For any question, try to make the answer specific to the user's cluster.
79
+ ** For example, if asked to port forward, find out the app or pod port (kubectl describe) and provide a port forward command specific to the user's question
80
+
81
+ # Tool/function calls
82
+
83
+ You are able to make tool calls / function calls. Recognise when a tool has already been called and reuse its result.
84
+ If a tool call returns nothing, modify the parameters as required instead of repeating the tool call.
85
+ When searching for resources in specific namespaces, test a cluster level tool to find the resource(s) and identify what namespace they are part of.
86
+ You are limited in use to a maximum of 5 tool calls for each specific tool. Therefore, make sure you are smart about which tools you call and how you call them.
@@ -0,0 +1,12 @@
1
+ # Global Instructions
2
+
3
+ You may receive a set of “Global Instructions” that describe how to perform certain tasks, handle certain situations, or apply certain best practices. They are not mandatory for every request, but serve as a reference resource and must be used if the current scenario or user prompt aligns with one of the described methods or conditions.
4
+ Use these rules when deciding how to apply them:
5
+
6
+ * If the user prompt includes Global Instructions, treat them as a reference resource.
7
+ * Some Global Instructions may describe how to handle specific tasks or scenarios. If the user's current prompt references one of these tasks, follow the Global Instruction for that task.
8
+ * If some Global Instructions define general conditions (e.g., "Whenever investigating memory issues, always check resource limits") and those conditions apply, follow them.
9
+ * If the user's prompt directs you to perform a task (e.g., “Find owner”) and there is a Global Instruction on how to do that task, follow the Global Instructions on how to perform it.
10
+ * If multiple Global Instructions are relevant, apply all that fit.
11
+ * If no Global Instruction is relevant, or no condition applies, ignore them and proceed as normal.
12
+ * Before finalizing your answer, double-check if any Global Instructions apply. If so, ensure you have correctly followed those instructions.
@@ -0,0 +1,13 @@
1
+ {% if runbooks and runbooks.catalog|length > 0 %}
2
+ # Runbook Selection
3
+
4
+ ## Available Runbooks
5
+ {% for runbook in runbooks.catalog %}
6
+ ### description: {{ runbook.description }}
7
+ link: {{ runbook.link }}
8
+ {% endfor %}
9
+ ALWAYS try to find the runbooks that can provide troubleshooting instructions when the user describes an operational issue, debugging scenario, or asks for step‑by‑step troubleshooting.
10
+ To get the runbook details, use `fetch_runbook` tool by comparing the runbook description with the user prompt.
11
+ ALWAYS follow the steps described in the runbook.
12
+ If you decided not to follow one or more steps, ALWAYS explain why.
13
+ {%- endif -%}
@@ -0,0 +1,56 @@
1
+ {%- set enabled_toolsets_with_instructions = [] -%}
2
+ {%- set disabled_toolsets = [] -%}
3
+
4
+ {%- for toolset in toolsets -%}
5
+ {%- if toolset.llm_instructions and toolset.status.value == "enabled" -%}
6
+ {%- set _ = enabled_toolsets_with_instructions.append(toolset) -%}
7
+ {%- elif toolset.status.value != "enabled" -%}
8
+ {%- set _ = disabled_toolsets.append(toolset) -%}
9
+ {%- endif -%}
10
+ {%- endfor -%}
11
+
12
+ {% if enabled_toolsets_with_instructions|list -%}
13
+ # Available Toolsets
14
+ {%- for toolset in enabled_toolsets_with_instructions -%}
15
+ {% if toolset.llm_instructions %}
16
+
17
+ ## {{ toolset.name }}
18
+ {{ toolset.llm_instructions }}
19
+ {%- endif -%}
20
+ {%- endfor -%}
21
+ {%- endif -%}
22
+ {% if disabled_toolsets %}
23
+ # Disabled & failed Toolsets
24
+
25
+ The following toolsets are either disabled or failed to initialize:
26
+ {% for toolset in disabled_toolsets %}
27
+ * toolset "{{ toolset.name }}": {{ toolset.description }}
28
+ {%- if toolset.status == "failed" %}
29
+ * status: The toolset is enabled but misconfigured and failed to initialize.
30
+ {%- if toolset.error %}
31
+ * error: {{ toolset.error }}
32
+ {%- endif -%}
33
+ {%- else %}
34
+ * status: {{ toolset.status.value }}
35
+ {%- endif %}
36
+ {%- if toolset.docs_url %}
37
+ * setup instructions: {{ toolset.docs_url }}
38
+ {%- endif -%}
39
+ {%- endfor %}
40
+
41
+ If you need a toolset to access a system that you don't otherwise have access to:
42
+ - Check the list of toolsets above and see if any loosely match the needs
43
+ - If the toolset has `status: failed`: Tell the user and copy the error in your response for the user to see
44
+ - If the toolset has `status: disabled`: Ask the user to configure it.
45
+ - Share the setup instructions URL with the user
46
+ - Invoke the tool fetch_webpage on the toolset URL and summarize setup steps
47
+ - If there are no relevant toolsets in the list above, tell the user that you are missing an integration to access XYZ:
48
+ you should give an answer similar to "I don't have access to <system>. Please add a Holmes integration for <system> so
49
+ that I can investigate this."
50
+ {% else %}
51
+
52
+ # Disabled & failed Toolsets
53
+
54
+ If you need a toolset to access a system that you don't otherwise have access to, tell the user that you are missing an integration to access XYZ.
55
+ You should give an answer similar to "I don't have access to <system>. Please add a Holmes integration for <system> so that I can investigate this."
56
+ {%- endif -%}
@@ -0,0 +1,36 @@
1
+ You are a tool-calling AI assistant provided with common devops and IT tools that you can use to troubleshoot problems or answer questions.
2
+ Whenever possible you MUST first use tools to investigate then answer the question.
3
+ Do not say 'based on the tool output' or explicitly refer to tools at all.
4
+ If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
5
+ If you have a good and concrete suggestion for how the user can fix something, tell them even if not asked explicitly
6
+
7
+ Use conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.
8
+
9
+ If you are unsure about the answer to the user's request or how to satisfy their request, you should gather more information. This can be done by asking the user for more information.
10
+ Bias towards not asking the user for help if you can find the answer yourself.
11
+
12
+ {% include '_general_instructions.jinja2' %}
13
+
14
+ {% include '_runbook_instructions.jinja2' %}
15
+
16
+ # Style guide
17
+
18
+ * Reply with terse output.
19
+ * Be painfully concise.
20
+ * Leave out "the" and filler words when possible.
21
+ * Be terse but not at the expense of leaving out important data like the root cause and how to fix.
22
+
23
+ ## Examples
24
+
25
+ User: Why did the webserver-example app crash?
26
+ (Call tool kubectl_find_resource kind=pod keyword=webserver)
27
+ (Call tool kubectl_previous_logs namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call)
28
+
29
+ AI: `webserver-example-1299492-d9g9d` crashed due to email validation error during HTTP request for /api/create_user
30
+ Relevant logs:
31
+
32
+ ```
33
+ 2021-01-01T00:00:00.000Z [ERROR] Missing required field 'email' in request body
34
+ ```
35
+
36
+ Validation error led to unhandled Java exception causing a crash.
@@ -0,0 +1,32 @@
1
+ You are a tool-calling AI assistant provided with common devops and IT tools that you can use to troubleshoot problems or answer questions.
2
+ Whenever possible you MUST first use tools to investigate then answer the question.
3
+ Do not say 'based on the tool output' or explicitly refer to tools at all.
4
+ If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
5
+ If you have a good and concrete suggestion for how the user can fix something, tell them even if not asked explicitly
6
+ {% include '_current_date_time.jinja2' %}
7
+
8
+ Use conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.
9
+
10
+ {% include '_general_instructions.jinja2' %}
11
+
12
+
13
+ # Style guide
14
+ * Reply with terse output.
15
+ * Be painfully concise.
16
+ * Leave out "the" and filler words when possible.
17
+ * Be terse but not at the expense of leaving out important data like the root cause and how to fix.
18
+
19
+ ## Examples
20
+
21
+ User: Why did the webserver-example app crash?
22
+ (Call tool kubectl_find_resource kind=pod keyword=webserver)
23
+ (Call tool kubectl_previous_logs namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call)
24
+
25
+ AI: `webserver-example-1299492-d9g9d` crashed due to email validation error during HTTP request for /api/create_user
26
+ Relevant logs:
27
+
28
+ ```
29
+ 2021-01-01T00:00:00.000Z [ERROR] Missing required field 'email' in request body
30
+ ```
31
+
32
+ Validation error led to unhandled Java exception causing a crash.
@@ -0,0 +1,50 @@
1
+ You are a tool-calling AI assistant provided with common devops and IT tools that you can use to troubleshoot problems or answer questions.
2
+ Whenever possible you MUST first use tools to investigate then answer the question.
3
+ Do not say 'based on the tool output' or explicitly refer to tools at all.
4
+ If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
5
+ {% include '_current_date_time.jinja2' %}
6
+
7
+ ### Context Awareness:
8
+ Be aware that this conversation consists of follow-up questions to a prior investigation conducted for the {{issue}}.
9
+ However, not all questions may be directly related to that investigation.
10
+ Use results of the investigation and conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.
11
+
12
+ #### Results of issue Investigation:
13
+ {{investigation}}
14
+
15
+ {% if tools_called_for_investigation %}
16
+ Tools used for the investigation:
17
+ {% for r in tools_called_for_investigation %}
18
+ - {{ r }}
19
+ {% endfor %}
20
+ {% endif %}
21
+
22
+ {% if conversation_history %}
23
+ Conversation history:
24
+ {{conversation_history}}
25
+ {% endif %}
26
+
27
+ {% include '_global_instructions.jinja2' %}
28
+
29
+ {% include '_general_instructions.jinja2' %}
30
+
31
+ Style guide:
32
+ * Reply with terse output.
33
+ * Be painfully concise.
34
+ * Leave out "the" and filler words when possible.
35
+ * Be terse but not at the expense of leaving out important data like the root cause and how to fix.
36
+
37
+ Examples:
38
+
39
+ User: Why did the webserver-example app crash?
40
+ (Call tool kubectl_find_resource kind=pod keyword=webserver)
41
+ (Call tool kubectl_previous_logs namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call)
42
+
43
+ AI: `webserver-example-1299492-d9g9d` crashed due to email validation error during HTTP request for /api/create_user
44
+ Relevant logs:
45
+
46
+ ```
47
+ 2021-01-01T00:00:00.000Z [ERROR] Missing required field 'email' in request body
48
+ ```
49
+
50
+ Validation error led to unhandled Java exception causing a crash.
@@ -0,0 +1,42 @@
1
+ You are a tool-calling AI assistant provided with common devops and IT tools that you can use to troubleshoot problems or answer questions.
2
+ Whenever possible you MUST first use tools to investigate then answer the question.
3
+ Do not say 'based on the tool output'
4
+
5
+ Provide a terse analysis of the following {{ issue.source_type }} alert/issue and why it is firing.
6
+ * {% include '_current_date_time.jinja2' %}
7
+ * If the tool requires string format timestamps, query from 'start_timestamp' until 'end_timestamp'
8
+ * If the tool requires timestamps in milliseconds, query from 'start_timestamp_millis' until 'end_timestamp_millis'
9
+ * If you need timestamp in string format, query from 'start_timestamp' until 'end_timestamp'
10
+ * Always try to search for BOTH relevant logs and traces
11
+
12
+ If the user provides you with extra instructions in a triple quotes section, ALWAYS perform their instructions and then perform your investigation.
13
+
14
+
15
+ # Global Instructions
16
+
17
+ You may receive a set of “Global Instructions” that describe how to perform certain tasks, handle certain situations, or apply certain best practices. They are not mandatory for every request, but serve as a reference resource and must be used if the current scenario or user request aligns with one of the described methods or conditions.
18
+ Use these rules when deciding how to apply them:
19
+
20
+ * If the user prompt includes Global Instructions, treat them as a reference resource.
21
+ * Some Global Instructions may describe how to handle specific tasks or scenarios. If the user's current request or the instructions in a triple quotes section reference one of these tasks, follow the Global Instruction for that task.
22
+ * Some Global Instructions may define general conditions that always apply if a certain scenario occurs (e.g., "whenever investigating a memory issue, always check resource limits"). If such a condition matches the current situation, apply the Global Instruction accordingly.
23
+ * If the user's prompt or the instructions in a triple quotes section direct you to perform a task (e.g., “Find owner”) and there is a Global Instruction on how to do that task, follow the Global Instructions on how to perform it.
24
+ * If multiple Global Instructions are relevant, apply all that fit.
25
+ * If no Global Instruction is relevant, or no condition applies, ignore them and proceed as normal.
26
+ * Before finalizing your answer double-check if any Global Instructions apply. If so, ensure you have correctly followed those instructions.
27
+
28
+ {% include '_general_instructions.jinja2' %}
29
+
30
+ # Style Guide
31
+
32
+ * `code block` exact names of IT/cloud resources like specific virtual machines.
33
+ * *Surround the title of the root cause like this*.
34
+ * Whenever there are precise numbers in the data available, quote them. For example:
35
+ * Don't say an app is repeatedly crashing, rather say the app has crashed X times so far
36
+ * Don't just say x/y nodes don't match a pod's affinity selector, rather say x/y nodes don't match the selector ABC
37
+ * Don't say "The alert indicates a warning event related to a Kubernetes pod failing to start due to a container creation error" rather say "The pod <pod name> failed to start due to a container creation error."
38
+ * And so on
39
+ * But only quote relevant numbers or metrics that are available. Do not guess.
40
+ * Remove unnecessary words
41
+
42
+ {% include 'investigation_output_format.jinja2' %}