holmesgpt 0.12.6__py3-none-any.whl → 0.13.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.

Potentially problematic release: this version of holmesgpt might be problematic.

Files changed (125)
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +19 -1
  3. holmes/common/env_vars.py +17 -0
  4. holmes/config.py +69 -9
  5. holmes/core/conversations.py +11 -0
  6. holmes/core/investigation.py +16 -3
  7. holmes/core/investigation_structured_output.py +12 -0
  8. holmes/core/llm.py +13 -1
  9. holmes/core/models.py +9 -1
  10. holmes/core/openai_formatting.py +72 -12
  11. holmes/core/prompt.py +13 -0
  12. holmes/core/supabase_dal.py +3 -0
  13. holmes/core/todo_manager.py +88 -0
  14. holmes/core/tool_calling_llm.py +230 -157
  15. holmes/core/tools.py +10 -1
  16. holmes/core/tools_utils/tool_executor.py +7 -2
  17. holmes/core/tools_utils/toolset_utils.py +7 -2
  18. holmes/core/toolset_manager.py +1 -5
  19. holmes/core/tracing.py +4 -3
  20. holmes/interactive.py +1 -0
  21. holmes/main.py +9 -2
  22. holmes/plugins/prompts/__init__.py +7 -1
  23. holmes/plugins/prompts/_current_date_time.jinja2 +1 -0
  24. holmes/plugins/prompts/_default_log_prompt.jinja2 +4 -2
  25. holmes/plugins/prompts/_fetch_logs.jinja2 +10 -1
  26. holmes/plugins/prompts/_general_instructions.jinja2 +14 -0
  27. holmes/plugins/prompts/_permission_errors.jinja2 +1 -1
  28. holmes/plugins/prompts/_toolsets_instructions.jinja2 +4 -4
  29. holmes/plugins/prompts/generic_ask.jinja2 +4 -3
  30. holmes/plugins/prompts/investigation_procedure.jinja2 +210 -0
  31. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -0
  32. holmes/plugins/runbooks/CLAUDE.md +85 -0
  33. holmes/plugins/runbooks/README.md +24 -0
  34. holmes/plugins/toolsets/__init__.py +19 -6
  35. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +27 -0
  36. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +2 -2
  37. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +2 -1
  38. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -1
  39. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +2 -1
  40. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +2 -1
  41. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +3 -1
  42. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +2 -1
  43. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +2 -1
  44. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +2 -1
  45. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +2 -1
  46. holmes/plugins/toolsets/bash/argocd/__init__.py +65 -0
  47. holmes/plugins/toolsets/bash/argocd/constants.py +120 -0
  48. holmes/plugins/toolsets/bash/aws/__init__.py +66 -0
  49. holmes/plugins/toolsets/bash/aws/constants.py +529 -0
  50. holmes/plugins/toolsets/bash/azure/__init__.py +56 -0
  51. holmes/plugins/toolsets/bash/azure/constants.py +339 -0
  52. holmes/plugins/toolsets/bash/bash_instructions.jinja2 +6 -7
  53. holmes/plugins/toolsets/bash/bash_toolset.py +47 -13
  54. holmes/plugins/toolsets/bash/common/bash_command.py +131 -0
  55. holmes/plugins/toolsets/bash/common/stringify.py +14 -1
  56. holmes/plugins/toolsets/bash/common/validators.py +91 -0
  57. holmes/plugins/toolsets/bash/docker/__init__.py +59 -0
  58. holmes/plugins/toolsets/bash/docker/constants.py +255 -0
  59. holmes/plugins/toolsets/bash/helm/__init__.py +61 -0
  60. holmes/plugins/toolsets/bash/helm/constants.py +92 -0
  61. holmes/plugins/toolsets/bash/kubectl/__init__.py +80 -79
  62. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -14
  63. holmes/plugins/toolsets/bash/kubectl/kubectl_describe.py +38 -56
  64. holmes/plugins/toolsets/bash/kubectl/kubectl_events.py +28 -76
  65. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +39 -99
  66. holmes/plugins/toolsets/bash/kubectl/kubectl_logs.py +34 -15
  67. holmes/plugins/toolsets/bash/kubectl/kubectl_run.py +1 -1
  68. holmes/plugins/toolsets/bash/kubectl/kubectl_top.py +38 -77
  69. holmes/plugins/toolsets/bash/parse_command.py +106 -32
  70. holmes/plugins/toolsets/bash/utilities/__init__.py +0 -0
  71. holmes/plugins/toolsets/bash/utilities/base64_util.py +12 -0
  72. holmes/plugins/toolsets/bash/utilities/cut.py +12 -0
  73. holmes/plugins/toolsets/bash/utilities/grep/__init__.py +10 -0
  74. holmes/plugins/toolsets/bash/utilities/head.py +12 -0
  75. holmes/plugins/toolsets/bash/utilities/jq.py +79 -0
  76. holmes/plugins/toolsets/bash/utilities/sed.py +164 -0
  77. holmes/plugins/toolsets/bash/utilities/sort.py +15 -0
  78. holmes/plugins/toolsets/bash/utilities/tail.py +12 -0
  79. holmes/plugins/toolsets/bash/utilities/tr.py +57 -0
  80. holmes/plugins/toolsets/bash/utilities/uniq.py +12 -0
  81. holmes/plugins/toolsets/bash/utilities/wc.py +12 -0
  82. holmes/plugins/toolsets/coralogix/api.py +6 -6
  83. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +7 -1
  84. holmes/plugins/toolsets/datadog/datadog_api.py +20 -8
  85. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +8 -1
  86. holmes/plugins/toolsets/datadog/datadog_rds_instructions.jinja2 +82 -0
  87. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +12 -5
  88. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +20 -11
  89. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +735 -0
  90. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +18 -11
  91. holmes/plugins/toolsets/git.py +15 -15
  92. holmes/plugins/toolsets/grafana/grafana_api.py +12 -1
  93. holmes/plugins/toolsets/grafana/toolset_grafana.py +5 -1
  94. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +9 -4
  95. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +12 -5
  96. holmes/plugins/toolsets/internet/internet.py +2 -1
  97. holmes/plugins/toolsets/internet/notion.py +2 -1
  98. holmes/plugins/toolsets/investigator/__init__.py +0 -0
  99. holmes/plugins/toolsets/investigator/core_investigation.py +157 -0
  100. holmes/plugins/toolsets/investigator/investigator_instructions.jinja2 +253 -0
  101. holmes/plugins/toolsets/investigator/model.py +15 -0
  102. holmes/plugins/toolsets/kafka.py +14 -7
  103. holmes/plugins/toolsets/kubernetes_logs.py +454 -25
  104. holmes/plugins/toolsets/logging_utils/logging_api.py +115 -55
  105. holmes/plugins/toolsets/mcp/toolset_mcp.py +1 -1
  106. holmes/plugins/toolsets/newrelic.py +8 -3
  107. holmes/plugins/toolsets/opensearch/opensearch.py +8 -4
  108. holmes/plugins/toolsets/opensearch/opensearch_logs.py +9 -2
  109. holmes/plugins/toolsets/opensearch/opensearch_traces.py +6 -2
  110. holmes/plugins/toolsets/prometheus/prometheus.py +179 -44
  111. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +8 -2
  112. holmes/plugins/toolsets/robusta/robusta.py +4 -4
  113. holmes/plugins/toolsets/runbook/runbook_fetcher.py +6 -5
  114. holmes/plugins/toolsets/servicenow/servicenow.py +18 -3
  115. holmes/plugins/toolsets/utils.py +8 -1
  116. holmes/utils/console/logging.py +6 -1
  117. holmes/utils/llms.py +20 -0
  118. holmes/utils/stream.py +90 -0
  119. {holmesgpt-0.12.6.dist-info → holmesgpt-0.13.1.dist-info}/METADATA +47 -34
  120. {holmesgpt-0.12.6.dist-info → holmesgpt-0.13.1.dist-info}/RECORD +123 -91
  121. holmes/plugins/toolsets/bash/grep/__init__.py +0 -52
  122. holmes/utils/robusta.py +0 -9
  123. {holmesgpt-0.12.6.dist-info → holmesgpt-0.13.1.dist-info}/LICENSE.txt +0 -0
  124. {holmesgpt-0.12.6.dist-info → holmesgpt-0.13.1.dist-info}/WHEEL +0 -0
  125. {holmesgpt-0.12.6.dist-info → holmesgpt-0.13.1.dist-info}/entry_points.txt +0 -0
holmes/core/tools_utils/toolset_utils.py CHANGED
@@ -16,12 +16,17 @@ def filter_out_default_logging_toolset(toolsets: list[Toolset]) -> list[Toolset]:
     All other types of toolsets are included as is.
     """
 
-    logging_toolsets: list[BasePodLoggingToolset] = []
+    logging_toolsets: list[Toolset] = []
     final_toolsets: list[Toolset] = []
 
     for ts in toolsets:
+        toolset_type = (
+            ts.original_toolset_type
+            if hasattr(ts, "original_toolset_type")
+            else type(ts)
+        )
         if (
-            isinstance(ts, BasePodLoggingToolset)
+            issubclass(toolset_type, BasePodLoggingToolset)
            and ts.status == ToolsetStatusEnum.ENABLED
         ):
             logging_toolsets.append(ts)
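
The switch from `isinstance` to resolving `original_toolset_type` suggests the filter must now recognize pod-logging toolsets that are wrapped by another object. A minimal sketch of why, with `KubernetesLogsToolset` and `ToolsetWrapper` as hypothetical stand-ins; the `getattr` lookup is equivalent to the `hasattr` ternary in the hunk above:

```python
class BasePodLoggingToolset:
    pass

class KubernetesLogsToolset(BasePodLoggingToolset):
    pass

class ToolsetWrapper:
    """Stand-in for a proxying toolset; note it is NOT a BasePodLoggingToolset."""
    def __init__(self, wrapped_cls):
        self.original_toolset_type = wrapped_cls  # advertises the wrapped class

wrapped = ToolsetWrapper(KubernetesLogsToolset)

# isinstance() misses the wrapper entirely:
assert not isinstance(wrapped, BasePodLoggingToolset)

# Resolving the original type, as the new code does, recovers it:
toolset_type = getattr(wrapped, "original_toolset_type", type(wrapped))
assert issubclass(toolset_type, BasePodLoggingToolset)
```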
holmes/core/toolset_manager.py CHANGED
@@ -266,11 +266,7 @@ class ToolsetManager:
                 toolset.path = cached_status.get("path", None)
             # check prerequisites for only enabled toolset when the toolset is loaded from cache. When the toolset is
             # not loaded from cache, the prerequisites are checked in the refresh_toolset_status method.
-            if (
-                toolset.enabled
-                and toolset.status == ToolsetStatusEnum.ENABLED
-                and using_cached
-            ):
+            if toolset.enabled and toolset.status == ToolsetStatusEnum.ENABLED:
                 enabled_toolsets_from_cache.append(toolset)
         self.check_toolset_prerequisites(enabled_toolsets_from_cache)
 
holmes/core/tracing.py CHANGED
@@ -91,10 +91,11 @@ class SpanType(Enum):
     """Standard span types for tracing categorization."""
 
     LLM = "llm"
-    TOOL = "tool"
-    TASK = "task"
     SCORE = "score"
+    FUNCTION = "function"
     EVAL = "eval"
+    TASK = "task"
+    TOOL = "tool"
 
 
 class DummySpan:
@@ -119,7 +120,7 @@ class DummySpan:
 class DummyTracer:
     """A no-op tracer implementation for when tracing is disabled."""
 
-    def start_experiment(self, experiment_name=None, metadata=None):
+    def start_experiment(self, experiment_name=None, additional_metadata=None):
         """No-op experiment creation."""
         return None
 
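The rename of `metadata` to `additional_metadata` on `DummyTracer.start_experiment` is a breaking change for keyword callers. A small sketch reproducing the new no-op signature from the hunk above; the argument values are purely illustrative:

```python
class DummyTracer:
    """A no-op tracer implementation for when tracing is disabled."""

    def start_experiment(self, experiment_name=None, additional_metadata=None):
        """No-op experiment creation."""
        return None

tracer = DummyTracer()
# New keyword, as of 0.13.1:
tracer.start_experiment(experiment_name="eval-run", additional_metadata={"run": 1})
# tracer.start_experiment(metadata={})  # old keyword would now raise TypeError
```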
holmes/interactive.py CHANGED
@@ -1002,6 +1002,7 @@ def run_interactive_loop(
         user_input,
         include_files,
         ai.tool_executor,
+        ai.investigation_id,
         runbooks,
         system_prompt_additions,
     )
holmes/main.py CHANGED
@@ -94,7 +94,7 @@ opt_custom_runbooks: Optional[List[Path]] = typer.Option(
     help="Path to a custom runbooks (can specify -r multiple times to add multiple runbooks)",
 )
 opt_max_steps: Optional[int] = typer.Option(
-    10,
+    40,
     "--max-steps",
     help="Advanced. Maximum number of steps the LLM can take to investigate the issue",
 )
@@ -104,6 +104,11 @@ opt_verbose: Optional[List[bool]] = typer.Option(
     "-v",
     help="Verbose output. You can pass multiple times to increase the verbosity. e.g. -v or -vv or -vvv",
 )
+opt_log_costs: bool = typer.Option(
+    False,
+    "--log-costs",
+    help="Show LLM cost information in the output",
+)
 opt_echo_request: bool = typer.Option(
     True,
     "--echo/--no-echo",
@@ -176,6 +181,7 @@ def ask(
     custom_toolsets: Optional[List[Path]] = opt_custom_toolsets,
     max_steps: Optional[int] = opt_max_steps,
     verbose: Optional[List[bool]] = opt_verbose,
+    log_costs: bool = opt_log_costs,
     # semi-common options
     destination: Optional[DestinationType] = opt_destination,
     slack_token: Optional[str] = opt_slack_token,
@@ -219,7 +225,7 @@ def ask(
     """
     Ask any question and answer using available tools
     """
-    console = init_logging(verbose)  # type: ignore
+    console = init_logging(verbose, log_costs)  # type: ignore
     # Detect and read piped input
     piped_data = None
 
@@ -302,6 +308,7 @@ def ask(
         prompt,  # type: ignore
         include_file,
         ai.tool_executor,
+        ai.investigation_id,
         config.get_runbook_catalog(),
         system_prompt_additions,
     )
holmes/plugins/prompts/__init__.py CHANGED
@@ -43,6 +43,12 @@ def load_and_render_prompt(prompt: str, context: Optional[dict] = None) -> str:
         context = {}
 
     now = datetime.now(timezone.utc)
-    context.update({"now": f"{now}", "now_timestamp_seconds": int(now.timestamp())})
+    context.update(
+        {
+            "now": f"{now}",
+            "now_timestamp_seconds": int(now.timestamp()),
+            "current_year": now.year,
+        }
+    )
 
     return template.render(**context)
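
The new `current_year` key is plain Jinja context. A minimal sketch, assuming nothing beyond stock Jinja2 (which `load_and_render_prompt` wraps), of how the three keys render in a template like `_current_date_time.jinja2`:

```python
from datetime import datetime, timezone
from jinja2 import Template

template = Template(
    "Current UTC time: {{ now }} ({{ now_timestamp_seconds }}). "
    "Assume dates without a year mean {{ current_year }}."
)

now = datetime.now(timezone.utc)
print(template.render(
    now=f"{now}",
    now_timestamp_seconds=int(now.timestamp()),
    current_year=now.year,  # the context key added in 0.13.1
))
```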
holmes/plugins/prompts/_current_date_time.jinja2 CHANGED
@@ -1 +1,2 @@
 When querying tools, always query for the relevant time period. The current UTC date and time are {{ now }}. The current UTC timestamp in seconds is {{ now_timestamp_seconds }}.
+When users mention dates without years (e.g., 'March 25th', 'last May', etc.), assume they mean the current year ({{ current_year }}) unless context suggests otherwise.
holmes/plugins/prompts/_default_log_prompt.jinja2 CHANGED
@@ -7,5 +7,7 @@
 * If you have an issue id or finding id, use `fetch_finding_by_id` as it contains time information about the issue (`starts_at`, `updated_at` and `ends_at`).
 ** Then, use `start_time=-300` (5 minutes before `end_time`) and `end_time=<issue start_at time>` when calling `fetch_pod_logs`.
 ** If there are too many logs, or not enough, narrow or widen the timestamps
-** If looking for a specific keyword, use the `filter` argument
-* If you are not provided with time information. Ignore the `start_time` and `end_time`. The tool `fetch_pod_logs` will default to the latest logs.
+* If the user did not explicitly ask about a given timeframe, ignore the `start_time` and `end_time` so it will use the default.
+* IMPORTANT: ALWAYS inform the user about the actual time period fetched (e.g., "Looking at logs from the last <X> days")
+* IMPORTANT: If a limit was applied, ALWAYS tell the user how many logs were shown vs total (e.g., "Showing latest <Y> of <Z> logs")
+* IMPORTANT: If any filters were applied, ALWAYS mention them explicitly
holmes/plugins/prompts/_fetch_logs.jinja2 CHANGED
@@ -4,8 +4,14 @@
 {%- set k8s_yaml_ts = toolsets | selectattr("name", "equalto", "kubernetes/logs") | rejectattr("fetch_pod_logs", "defined") | first -%}
 {%- set opensearch_ts = toolsets | selectattr("name", "equalto", "opensearch/logs") | first -%}
 {%- set datadog_ts = toolsets | selectattr("name", "equalto", "datadog/logs") | first -%}
+{%- set bash_ts = toolsets | selectattr("name", "equalto", "bash") | first -%}
 
 ## Logs
+
+* IMPORTANT: ALWAYS inform the user about what logs you fetched. For example: "Here are pod logs for ..."
+* IMPORTANT: If logs commands have limits mention them. For example: "Showing last 100 lines of logs:"
+* IMPORTANT: If a filter was used, mention the filter. For example: "Logs filtered for 'error':"
+
 {% if loki_ts and loki_ts.status == "enabled" -%}
 * For any logs, including for investigating kubernetes problems, use Loki
 * Use the tool fetch_loki_logs_for_resource to get the logs of any kubernetes pod or node
@@ -15,7 +21,7 @@
 * If you have an issue id or finding id, use `fetch_finding_by_id` as it contains time information about the issue (`starts_at`, `updated_at` and `ends_at`).
 ** Then, defaults to `start_timestamp=-300` (5 minutes before end_timestamp) and `end_timestamp=<issue start_at time>`.
 ** If there are too many logs, or not enough, narrow or widen the timestamps
-* If you are not provided with time information. Ignore start_timestamp and end_timestamp. Loki will default to the latest logs.
+* If you are not provided with time information, ignore start_timestamp and end_timestamp.
 {%- elif coralogix_ts and coralogix_ts.status == "enabled" -%}
 ### coralogix/logs
 #### Coralogix Logs Toolset
@@ -39,6 +45,9 @@ Tools to search and fetch logs from Kubernetes.
 * Check both kubectl_logs and kubectl_previous_logs because a pod restart means kubectl_logs may not have relevant logs
 {%- elif opensearch_ts and opensearch_ts.status == "enabled" -%}
 {% include '_default_log_prompt.jinja2' %}
+{%- elif bash_ts and bash_ts.status == "enabled" -%}
+Use the tool `run_bash_command` to run `kubectl logs` commands and fetch any relevant pod logs.
+DO NOT use `--tail` or `| tail` when calling `kubectl logs` because you may miss critical information.
 {%- else -%}
 * You have not been given access to tools to fetch kubernetes logs for nodes, pods, services or apps. This is likely a misconfiguration.
 * If you need logs to answer questions or investigate issues, tell the user to consult the documentation and enable one of these toolsets:
holmes/plugins/prompts/_general_instructions.jinja2 CHANGED
@@ -1,3 +1,5 @@
+{% include 'investigation_procedure.jinja2' %}
+
 {% include '_ai_safety.jinja2' %}
 
 # In general
@@ -49,6 +51,18 @@
 * For any question, try to make the answer specific to the user's cluster.
 ** For example, if asked to port forward, find out the app or pod port (kubectl describe) and provide a port forward command specific to the user's question
 
+# MANDATORY Task Management
+
+* You MUST use the TodoWrite tool for ANY investigation requiring multiple steps
+* Your FIRST tool call MUST be TodoWrite to create your investigation plan
+* Break down ALL complex problems into smaller, manageable tasks
+* You MUST update task status (pending → in_progress → completed) as you work through your investigation
+* The TodoWrite tool will show you a formatted task list - reference this throughout your investigation
+* Mark tasks as 'in_progress' when you start them, 'completed' when finished
+* Follow ALL tasks in your plan - don't skip any tasks
+* Use task management to ensure you don't miss important investigation steps
+* If you discover additional steps during investigation, add them to your task list using TodoWrite
+
 # Tool/function calls
 
 You are able to make tool calls / function calls. Recognise when a tool has already been called and reuse its result.
holmes/plugins/prompts/_permission_errors.jinja2 CHANGED
@@ -3,4 +3,4 @@
 If during the investigation you encounter a permissions error (e.g., `Error from server (Forbidden):`), **ALWAYS** follow these steps to ensure a thorough resolution:
 1. Analyze the Error Message: Identify the missing resource, API group, and verbs from the error details.
 2. Check which user/service account you're running with and what permissions it has
-3. Report this to the user and refer them to https://robusta-dev.github.io/holmesgpt/data-sources/permissions/
+3. Report this to the user and refer them to https://holmesgpt.dev/data-sources/permissions/
holmes/plugins/prompts/_toolsets_instructions.jinja2 CHANGED
@@ -51,14 +51,14 @@ If you need a toolset to access a system that you don't otherwise have access to
 - If the toolset has `status: disabled`: Ask the user to configure it.
   - Share the setup instructions URL with the user
 - If there are no relevant toolsets in the list above, tell the user that you are missing an integration to access XYZ:
-  You should give an answer similar to "I don't have access to <system>. To add a HolmesGPT integration for <system> you can [connect an MCP server](https://robusta-dev.github.io/holmesgpt/data-sources/remote-mcp-servers/) or add a [custom toolset](https://robusta-dev.github.io/holmesgpt/data-sources/custom-toolsets/)."
+  You should give an answer similar to "I don't have access to <system>. To add a HolmesGPT integration for <system> you can [connect an MCP server](https://holmesgpt.dev/data-sources/remote-mcp-servers/) or add a [custom toolset](https://holmesgpt.dev/data-sources/custom-toolsets/)."
 
 Likewise, if users ask about setting up or configuring integrations (e.g., "How can I give you access to ArgoCD applications?"):
 ALWAYS check if there's a disabled or failed toolset that matches what the user is asking about. If you find one:
 1. If the toolset has a specific documentation URL (toolset.docs_url), ALWAYS direct them to that URL first
 2. If no specific documentation exists, then direct them to the general Holmes documentation:
-   - For all toolset configurations: https://robusta-dev.github.io/holmesgpt/data-sources/
-   - For custom toolsets: https://robusta-dev.github.io/holmesgpt/data-sources/custom-toolsets/
-   - For remote MCP servers: https://robusta-dev.github.io/holmesgpt/data-sources/remote-mcp-servers/
+   - For all toolset configurations: https://holmesgpt.dev/data-sources/
+   - For custom toolsets: https://holmesgpt.dev/data-sources/custom-toolsets/
+   - For remote MCP servers: https://holmesgpt.dev/data-sources/remote-mcp-servers/
 
 When providing configuration guidance, always prefer the specific toolset documentation URL when available.
holmes/plugins/prompts/generic_ask.jinja2 CHANGED
@@ -4,13 +4,14 @@ Ask for multiple tool calls at the same time as it saves time for the user.
 Do not say 'based on the tool output' or explicitly refer to tools at all.
 If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
 If you have a good and concrete suggestion for how the user can fix something, tell them even if not asked explicitly
-{% include '_current_date_time.jinja2' %}
-
-Use conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.
 
 If you are unsure about the answer to the user's request or how to satisfy their request, you should gather more information. This can be done by asking the user for more information.
 Bias towards not asking the user for help if you can find the answer yourself.
 
+{% include '_current_date_time.jinja2' %}
+
+Use conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.
+
 {% include '_general_instructions.jinja2' %}
 
 {% include '_runbook_instructions.jinja2' %}
holmes/plugins/prompts/investigation_procedure.jinja2 ADDED
@@ -0,0 +1,210 @@
+{% if investigation_id %}
+# Investigation ID for this session
+Investigation id: {{ investigation_id }}
+{% endif %}
+
+CLARIFICATION REQUIREMENT: Before starting ANY investigation, if the user's question is ambiguous or lacks critical details, you MUST ask for clarification first. Do NOT create TodoWrite tasks for unclear questions.
+Only proceed with TodoWrite and investigation AFTER you have clear, specific requirements.
+
+CRITICAL: For multi-step questions, you MUST start by calling the TodoWrite tool with a `todos` parameter containing an array of task objects. Each task must have:
+- `id`: unique identifier (string)
+- `content`: specific task description (string)
+- `status`: "pending" for new tasks (string)
+
+MANDATORY Task Status Updates:
+- When starting a task: Call TodoWrite changing that task's status to "in_progress"
+- When completing a task: Call TodoWrite changing that task's status to "completed"
+
+PARALLEL EXECUTION RULES:
+- When possible, work on multiple tasks at a time. If tasks depend on one another, do them one after the other.
+- You MAY execute multiple INDEPENDENT tasks simultaneously
+- Mark multiple tasks as "in_progress" if they don't depend on each other
+- Wait for dependent tasks to complete before starting tasks that need their results
+- Always use a single TodoWrite call to update multiple task statuses
+
+DEPENDENCY ANALYSIS:
+Before marking tasks as "in_progress", determine if they are:
+- ✅ INDEPENDENT: Can run simultaneously (e.g., "Check pod A logs" + "Check pod B logs")
+- ❌ DEPENDENT: One needs results from another (e.g., "Find pod name" → "Get pod logs")
+
+PARALLEL EXECUTION EXAMPLE:
+TodoWrite(todos=[
+  {"id": "1", "content": "Check frontend pod logs", "status": "in_progress"},
+  {"id": "2", "content": "Check backend service config", "status": "in_progress"},
+  {"id": "3", "content": "Analyze network policies", "status": "in_progress"},
+  {"id": "4", "content": "Compare logs from both pods", "status": "pending"}  # Depends on 1,2
+])
+
+
+Examples:
+- Task 1: find the pod name
+  Task 2: get the pod logs
+  Execution Order: Perform Task 2 after Task 1
+- Task 1: get the pod events
+  Task 2: get the pod logs
+  Execution Order: Perform both tasks together
+
+MAXIMIZE PARALLEL TOOL CALLS:
+- When executing multiple in_progress tasks, make ALL their tool calls at once
+- Example: If tasks 1,2,3 are in_progress, call kubectl_logs + kubectl_describe + kubectl_get simultaneously
+
+# CRITICAL: TASK COMPLETION ENFORCEMENT
+
+YOU MUST COMPLETE EVERY SINGLE TASK before providing your final answer. NO EXCEPTIONS.
+
+**BEFORE providing any final answer or conclusion, you MUST:**
+
+1. **Check TodoWrite status**: Verify ALL tasks show "completed" status
+2. **If ANY task is "pending" or "in_progress"**:
+   - DO NOT provide a final answer
+   - Continue working on the next pending task
+   - Use TodoWrite to mark it "in_progress"
+   - Complete the task
+   - Mark it "completed" with TodoWrite
+3. **Only after ALL tasks are "completed"**: Proceed to verification and final answer
+
+**VIOLATION CONSEQUENCES**:
+- Providing answers with pending tasks = INVESTIGATION FAILURE
+- You MUST complete the verification task as the final step before any answer
+- Incomplete investigations are unacceptable and must be continued
+
+**Task Status Check Example:**
+Before final answer, confirm you see something like:
+[✓] completed - Task 1
+[✓] completed - Task 2
+[✓] completed - Task 3
+[✓] completed - Investigation Verification
+
+If you see ANY `[ ] pending` or `[~] in_progress` tasks, DO NOT provide final answer.
+
+Status Update Example:
+# Starting task 2:
+TodoWrite(todos=[
+  {"id": "1", "content": "Check pod status", "status": "completed"},
+  {"id": "2", "content": "Examine logs", "status": "in_progress"},
+  {"id": "3", "content": "Check resources", "status": "pending"}
+])
+
+
+{% if todo_list %}
+{{ todo_list }}
+{% endif %}
+
+# MANDATORY Multi-Phase Investigation Process
+
+For ANY question requiring investigation, you MUST follow this structured approach:
+
+## Phase 1: Initial Investigation
+1. **IMMEDIATELY START with TodoWrite**: Create initial investigation task list
+2. **Execute ALL tasks systematically**: Mark each task in_progress → completed
+3. **Complete EVERY task** in the current list before proceeding
+
+## Phase Evaluation and Continuation
+After completing ALL tasks in the current list, you MUST:
+
+1. **STOP and Evaluate**: Ask yourself these critical questions:
+   - "Do I have enough information to completely answer the user's question?"
+   - "Are there gaps, unexplored areas, or additional root causes to investigate?"
+   - "Have I followed the 'five whys' methodology to the actual root cause?"
+   - "Did my investigation reveal new questions or areas that need exploration?"
+   - "Are there any additional investigation steps I can perform, in order to provide a more accurate solution?"
+
+   If the answer to any of those questions is 'yes' - the investigation is INCOMPLETE!
+
+2. **If Investigation is INCOMPLETE**:
+   - Call TodoWrite to create a NEW task list for the next investigation phase
+   - Label it clearly: "Investigation Phase 2: [specific focus area]"
+   - Focus tasks on the specific gaps/questions discovered in the previous phase
+   - Execute ALL tasks in this new list
+   - Repeat this evaluation process
+
+3. **Continue Creating New Phases** until you can answer "YES" to:
+   - "Do I have enough information to completely answer the user's question?"
+   - "Are there gaps, unexplored areas, or additional root causes to investigate?"
+   - "Have I followed the 'five whys' methodology to the actual root cause?"
+   - "Did my investigation reveal new questions or areas that need exploration?"
+   - "Are there any additional investigation steps I can perform, in order to provide a more accurate solution?"
+   - "I have thoroughly investigated all aspects of this problem"
+   - "I can provide a complete answer with specific, actionable information"
+   - "No additional investigation would improve my answer"
+
+## MANDATORY Final Phase: Final Review
+
+**Before providing final answer, you MUST:**
+- Confirm answer addresses user question completely! This is the most important thing
+- Verify all claims are backed by tool evidence
+- Ensure actionable information is provided
+- If additional investigation steps are required, start a new investigation phase, and create a new task list to gather the missing information.
+
+## CRITICAL ENFORCEMENT RULES
+
+**ABSOLUTE REQUIREMENTS:**
+- NO final answer until the final review phase is 100% completed
+- Each investigation phase must have ALL tasks completed before evaluation
+- You MUST explicitly create new investigation phases when gaps are identified
+- Final Review phase is MANDATORY - never skip it
+
+**EXAMPLES of Phase Progression:**
+
+*Phase 1*: Initial investigation discovers pod crashes
+*Phase 2*: Deep dive into specific pod logs and resource constraints
+*Phase 3*: Investigate upstream services causing the crashes
+*Final Review Phase*: Self-critique and validate the complete solution
+
+*Phase 1*: Initial investigation - check pod health, metrics, logs, traces
+*Phase 2*: Based on data from the traces in Phase 1, investigate another workload in the cluster that seems to be the root cause of the issue. Investigate this workload as well
+*Phase 3*: Based on logs gathered in Phase 2, investigate a 3rd party managed service that seems to be the cause of the whole chain of events.
+*Final Review Phase*: Validate that the chain of events, across the different components, can lead to the investigated scenario.
+
+**VIOLATION CONSEQUENCES:**
+- Providing answers without a Final Review phase = INVESTIGATION FAILURE
+- Skipping investigation phases when gaps exist = INCOMPLETE ANALYSIS
+- Not completing all tasks in a phase = PROCESS VIOLATION
+
+# FINAL REVIEW PHASE EXECUTION GUIDE
+
+When executing the Final Review, you must:
+- Reread the original user question word-by-word
+- Compare against your proposed answer
+- Identify any aspects not addressed
+- Make sure you answer what the user asked!
+- List each claim in your answer
+- Trace each claim back to specific tool outputs
+- Flag any unsupported statements
+- Walk through your "five whys" chain
+- Verify each "why" logically follows from evidence
+- Ensure you reached the actual root cause, not just symptoms
+- Verify exact resource names are provided (not generic examples)
+- Check commands are complete and runnable
+- Ensure steps are specific to the user's environment
+- List any resource names, namespaces, configurations mentioned
+- Verify each was confirmed via tool calls
+- Flag anything assumed without verification
+- Identify potential weaknesses in your investigation
+- Consider alternative explanations not explored
+- Assess if additional investigation would strengthen the answer
+- If there are additional investigation steps that can help the user, start a new phase, and create a new task list to perform these steps
+
+
+# INVESTIGATION PHASE TRANSITION EXAMPLES
+
+**Example 1: Increased Error Rate**
+Phase 1: Check pod status, basic connectivity, logs, traces
+→ Evaluation: From traces, detected that the error is related to an upstream service
+Phase 2: Investigate the upstream service detected in Phase 1
+→ Evaluation: Found the upstream service has errors while connecting to a managed storage service.
+Phase 3: Investigate the external managed storage service found in Phase 2
+→ Evaluation: Complete - found the managed service is down due to an outage
+Verification Phase: Validate the solution addresses the original increased error rate.
+
+**Example 2: Application Performance Issue**
+Phase 1: Check application metrics, resource usage
+→ Evaluation: Found high CPU usage, but root cause unclear
+Phase 2: Investigate database connections, query performance
+→ Evaluation: Complete - found slow database queries causing the CPU spike
+Verification Phase: Confirm the analysis provides actionable database optimization steps
+
+**REMEMBER:** Each evaluation is a decision point:
+- Continue investigating (create new phase) OR
+- Proceed to verification (investigation complete)
+
+Never guess - if unsure whether the investigation is complete, create another phase.
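
The TodoWrite contract spelled out above fixes each task to an `id`, a `content` string, and a `status` cycling pending → in_progress → completed. A hedged sketch of that task shape as a Python model; the real definitions live in `holmes/core/todo_manager.py` and `holmes/plugins/toolsets/investigator/model.py`, which this does not reproduce:

```python
from dataclasses import dataclass
from enum import Enum

class TaskStatus(str, Enum):
    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"

@dataclass
class Task:
    id: str       # unique identifier
    content: str  # specific task description
    status: TaskStatus = TaskStatus.PENDING

todos = [
    Task(id="1", content="Check frontend pod logs", status=TaskStatus.IN_PROGRESS),
    Task(id="2", content="Compare logs from both pods"),  # pending until task 1 completes
]
```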
holmes/plugins/prompts/kubernetes_workload_ask.jinja2 CHANGED
@@ -6,6 +6,8 @@ If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
 If the user provides you with extra instructions in a triple single quotes section, ALWAYS perform their instructions and then perform your investigation.
 {% include '_current_date_time.jinja2' %}
 
+{% include 'investigation_procedure.jinja2' %}
+
 {% include '_ai_safety.jinja2' %}
 
 Global Instructions
holmes/plugins/runbooks/CLAUDE.md ADDED
@@ -0,0 +1,85 @@
+You are an expert in automated diagnostics and runbook creation for AI-driven troubleshooting agents. I will provide you with one or more issue descriptions or test scenarios.
+
+Your task is to generate a strictly executable runbook for an AI agent to follow. The runbook should be machine-readable but human-understandable, and must include the following sections:
+
+# Runbook Content Structure
+
+## 1. Goal
+- **Primary Objective:** Clearly define the specific category of issues this runbook addresses (e.g., "diagnose network connectivity problems", "troubleshoot pod startup failures", "investigate performance degradation").
+- **Scope:** Specify the environment, technology stack, or system components covered by this runbook.
+- **Agent Mandate:** Explicitly state that the AI agent must follow the workflow steps sequentially and systematically without deviation to ensure consistent, thorough troubleshooting.
+- **Expected Outcome:** Define what successful completion of this runbook should achieve (root cause identification, issue resolution, or escalation criteria).
+
+## 2. Workflow for [Issue Category] Diagnosis
+- Provide numbered, sequential steps the AI agent must execute in order.
+- Each step should specify:
+  - **Action:** Describe the diagnostic function conceptually (e.g., "retrieve container logs from specified pod", "check service connectivity between components", "examine resource utilization metrics")
+  - **Function Description:** Explain what the function should accomplish rather than naming specific tools (e.g., "query the cluster to list all pods in a namespace and their current status" instead of "kubectl_get_pods()")
+  - **Parameters:** What data/arguments to pass to the function (namespace, pod name, time range, etc.)
+  - **Expected Output:** What information to gather from the result (status codes, error messages, metrics, configurations)
+  - **Success/Failure Criteria:** How to interpret the output and what indicates normal vs. problematic conditions
+- Use conditional logic (IF/ELSE) when branching is required based on findings.
+- Describe functions generically so they can be mapped to available tools (e.g., "execute a command to test network connectivity" rather than "ping_host()")
+- Include verification steps to confirm each diagnostic action was successful.
+
+## 3. Synthesize Findings
+- **Data Correlation:** Describe how the AI agent should combine outputs from multiple workflow steps.
+- **Pattern Recognition:** Specify what patterns, error messages, or metrics indicate specific root causes.
+- **Prioritization Logic:** Provide criteria for ranking potential causes by likelihood or severity.
+- **Evidence Requirements:** Define what evidence is needed to confidently identify each potential root cause.
+- **Example Scenarios:** Include sample synthesis statements showing how findings should be summarized.
+
+## 4. Recommended Remediation Steps
+- **Immediate Actions:** List temporary workarounds or urgent fixes for critical issues.
+- **Permanent Solutions:** Provide step-by-step permanent remediation procedures.
+- **Verification Steps:** Define how to confirm each remediation action was successful.
+- **Documentation References:** Include links to official documentation, best practices, or vendor guidance.
+- **Escalation Criteria:** Specify when and how to escalate if remediation steps fail.
+- **Post-Remediation Monitoring:** Describe what to monitor to prevent recurrence.
+
+# File Organization Guidelines
+
+## Folder Structure
+*Category folders are used to distinguish and categorize different runbooks based on their focus area or technology domain. Each runbook must be placed into a specific category folder under `holmes/plugins/runbooks/` for better organization and discoverability. Create a new category folder if your runbook doesn't fit into existing categories.*
+
+## File Naming
+*Use consistent naming conventions for runbook files:*
+
+- Use descriptive, lowercase names with hyphens: `dns-resolution-troubleshooting.md`
+- Include the issue type or technology: `redis-connection-issues.md`
+- Avoid generic names like `troubleshooting.md` or `debug.md`
+
+### Catalog Registration
+After creating your runbook, you must add an entry to `catalog.json` in the runbooks directory to make it discoverable by AI agents.
+
+**Steps to add a new catalog entry:**
+
+1. **Open** `holmes/plugins/runbooks/catalog.json`
+2. **Add your entry** to the JSON array following this structure:
+```json
+{
+  "name": "Brief, descriptive name of the runbook",
+  "path": "category-folder/your-runbook-filename.md",
+  "description": "Clear description of what issues this runbook addresses",
+  "tags": ["relevant", "tags", "for", "search"]
+}
+```
+
+3. **Ensure proper JSON formatting** - add a comma after the previous entry if needed
+4. **Validate the JSON** is properly formatted before committing
+
+**Field Guidelines:**
+- `name`: Keep concise but descriptive (e.g., "Redis Connection Issues")
+- `path`: Always include the category folder (e.g., "database/redis-connection-issues.md")
+- `description`: Explain what specific problems this runbook solves
+- `tags`: Include technology names, issue types, and relevant keywords
+
+Example catalog entry:
+```json
+{
+  "name": "DNS Resolution Troubleshooting",
+  "path": "networking/dns-resolution-troubleshooting.md",
+  "description": "Comprehensive guide for diagnosing and resolving DNS resolution issues in Kubernetes clusters",
+  "tags": ["dns", "networking", "kubernetes", "troubleshooting"]
+}
+```
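
For the "validate the JSON" step above, a small sketch that checks `catalog.json` parses and that each entry carries the four documented fields; whether the file is a bare array or an object wrapping one is an assumption handled both ways:

```python
import json

with open("holmes/plugins/runbooks/catalog.json") as f:
    catalog = json.load(f)  # raises json.JSONDecodeError on malformed JSON

# The catalog may be a bare array, or an object wrapping one (assumption).
entries = catalog if isinstance(catalog, list) else next(iter(catalog.values()))
for entry in entries:
    missing = {"name", "path", "description", "tags"} - entry.keys()
    assert not missing, f"{entry.get('name', '?')} is missing fields: {missing}"
```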
holmes/plugins/runbooks/README.md CHANGED
@@ -20,3 +20,27 @@ This runbook is mainly used for `holmes investigate`
 
 The catalog specified in [catalog.json](catalog.json) contains a collection of runbooks written in markdown.
 During runtime, the LLM will compare the runbook description with the user question and return the best-matching runbook for investigation. It's possible no runbook is returned when there is no match.
+
+## Generating Runbooks
+
+To ensure all runbooks follow a consistent format and improve troubleshooting accuracy, contributors should use the standardized [runbook format prompt](runbook-format.prompt.md) when creating new runbooks.
+
+### Using the Runbook Format Prompt
+
+1. **Start with the Template**: Use `prompt.md` as your guide when creating new runbooks
+2. **Follow the Structure**: Ensure your runbook includes all required sections:
+   - **Goal**: Clear definition of issues addressed and agent mandate
+   - **Workflow**: Sequential diagnostic steps with detailed function descriptions
+   - **Synthesize Findings**: Logic for combining outputs and identifying root causes
+   - **Recommended Remediation Steps**: Both immediate and permanent solutions
+
+### Benefits of Using the Standard Format
+
+- **Consistency**: All runbooks follow the same structure and terminology
+- **AI Agent Compatibility**: Ensures runbooks are machine-readable and executable by AI agents
+- **Improved Accuracy**: Standardized format reduces ambiguity and improves diagnostic success rates
+- **Maintainability**: Easier to update and maintain runbooks across the project
+
+### Example Usage
+
+When creating a runbook for a new issue category (e.g., storage problems, authentication failures), provide the issue description to an LLM along with the prompt template to generate a properly formatted runbook that follows the established patterns.