holmesgpt 0.13.2__py3-none-any.whl → 0.16.2a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +17 -4
  3. holmes/common/env_vars.py +40 -1
  4. holmes/config.py +114 -144
  5. holmes/core/conversations.py +53 -14
  6. holmes/core/feedback.py +191 -0
  7. holmes/core/investigation.py +18 -22
  8. holmes/core/llm.py +489 -88
  9. holmes/core/models.py +103 -1
  10. holmes/core/openai_formatting.py +13 -0
  11. holmes/core/prompt.py +1 -1
  12. holmes/core/safeguards.py +4 -4
  13. holmes/core/supabase_dal.py +293 -100
  14. holmes/core/tool_calling_llm.py +423 -323
  15. holmes/core/tools.py +311 -33
  16. holmes/core/tools_utils/token_counting.py +14 -0
  17. holmes/core/tools_utils/tool_context_window_limiter.py +57 -0
  18. holmes/core/tools_utils/tool_executor.py +13 -8
  19. holmes/core/toolset_manager.py +155 -4
  20. holmes/core/tracing.py +6 -1
  21. holmes/core/transformers/__init__.py +23 -0
  22. holmes/core/transformers/base.py +62 -0
  23. holmes/core/transformers/llm_summarize.py +174 -0
  24. holmes/core/transformers/registry.py +122 -0
  25. holmes/core/transformers/transformer.py +31 -0
  26. holmes/core/truncation/compaction.py +59 -0
  27. holmes/core/truncation/dal_truncation_utils.py +23 -0
  28. holmes/core/truncation/input_context_window_limiter.py +218 -0
  29. holmes/interactive.py +177 -24
  30. holmes/main.py +7 -4
  31. holmes/plugins/prompts/_fetch_logs.jinja2 +26 -1
  32. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  33. holmes/plugins/prompts/_runbook_instructions.jinja2 +23 -12
  34. holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
  35. holmes/plugins/prompts/generic_ask.jinja2 +2 -4
  36. holmes/plugins/prompts/generic_ask_conversation.jinja2 +2 -1
  37. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +2 -1
  38. holmes/plugins/prompts/generic_investigation.jinja2 +2 -1
  39. holmes/plugins/prompts/investigation_procedure.jinja2 +48 -0
  40. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -1
  41. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +2 -1
  42. holmes/plugins/runbooks/__init__.py +117 -18
  43. holmes/plugins/runbooks/catalog.json +2 -0
  44. holmes/plugins/toolsets/__init__.py +21 -8
  45. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  46. holmes/plugins/toolsets/aks.yaml +64 -0
  47. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +26 -36
  48. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +0 -1
  49. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +10 -7
  50. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +9 -6
  51. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +8 -6
  52. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +8 -6
  53. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +9 -6
  54. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +9 -7
  55. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +9 -6
  56. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +9 -6
  57. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +9 -6
  58. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +9 -6
  59. holmes/plugins/toolsets/bash/bash_toolset.py +10 -13
  60. holmes/plugins/toolsets/bash/common/bash.py +7 -7
  61. holmes/plugins/toolsets/cilium.yaml +284 -0
  62. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
  63. holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
  64. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
  65. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +349 -216
  66. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
  67. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +101 -44
  68. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +13 -16
  69. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +25 -31
  70. holmes/plugins/toolsets/git.py +51 -46
  71. holmes/plugins/toolsets/grafana/common.py +15 -3
  72. holmes/plugins/toolsets/grafana/grafana_api.py +46 -24
  73. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +454 -0
  74. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +9 -0
  75. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +117 -0
  76. holmes/plugins/toolsets/grafana/toolset_grafana.py +211 -91
  77. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +27 -0
  78. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  79. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +653 -293
  80. holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
  81. holmes/plugins/toolsets/internet/internet.py +6 -7
  82. holmes/plugins/toolsets/internet/notion.py +5 -6
  83. holmes/plugins/toolsets/investigator/core_investigation.py +42 -34
  84. holmes/plugins/toolsets/kafka.py +25 -36
  85. holmes/plugins/toolsets/kubernetes.yaml +58 -84
  86. holmes/plugins/toolsets/kubernetes_logs.py +6 -6
  87. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  88. holmes/plugins/toolsets/logging_utils/logging_api.py +80 -4
  89. holmes/plugins/toolsets/mcp/toolset_mcp.py +181 -55
  90. holmes/plugins/toolsets/newrelic/__init__.py +0 -0
  91. holmes/plugins/toolsets/newrelic/new_relic_api.py +125 -0
  92. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +41 -0
  93. holmes/plugins/toolsets/newrelic/newrelic.py +163 -0
  94. holmes/plugins/toolsets/opensearch/opensearch.py +10 -17
  95. holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
  96. holmes/plugins/toolsets/opensearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  97. holmes/plugins/toolsets/opensearch/opensearch_query_assist.py +78 -0
  98. holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 +223 -0
  99. holmes/plugins/toolsets/opensearch/opensearch_traces.py +13 -16
  100. holmes/plugins/toolsets/openshift.yaml +283 -0
  101. holmes/plugins/toolsets/prometheus/prometheus.py +915 -390
  102. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +43 -2
  103. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  104. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +9 -10
  105. holmes/plugins/toolsets/robusta/robusta.py +236 -65
  106. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  107. holmes/plugins/toolsets/runbook/runbook_fetcher.py +137 -26
  108. holmes/plugins/toolsets/service_discovery.py +1 -1
  109. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  110. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  111. holmes/plugins/toolsets/utils.py +88 -0
  112. holmes/utils/config_utils.py +91 -0
  113. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  114. holmes/utils/env.py +7 -0
  115. holmes/utils/global_instructions.py +75 -10
  116. holmes/utils/holmes_status.py +2 -1
  117. holmes/utils/holmes_sync_toolsets.py +0 -2
  118. holmes/utils/krr_utils.py +188 -0
  119. holmes/utils/sentry_helper.py +41 -0
  120. holmes/utils/stream.py +61 -7
  121. holmes/version.py +34 -14
  122. holmesgpt-0.16.2a0.dist-info/LICENSE +178 -0
  123. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/METADATA +29 -27
  124. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/RECORD +126 -102
  125. holmes/core/performance_timing.py +0 -72
  126. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  127. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  128. holmes/plugins/toolsets/newrelic.py +0 -231
  129. holmes/plugins/toolsets/servicenow/install.md +0 -37
  130. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  131. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  132. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  133. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/WHEEL +0 -0
  134. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/entry_points.txt +0 -0
holmes/main.py CHANGED
@@ -1,9 +1,7 @@
1
1
  # ruff: noqa: E402
2
2
  import os
3
- import sys
4
3
 
5
4
  from holmes.utils.cert_utils import add_custom_certificate
6
- from holmes.utils.colors import USER_COLOR
7
5
 
8
6
  ADDITIONAL_CERTIFICATE: str = os.environ.get("CERTIFICATE", "")
9
7
  if add_custom_certificate(ADDITIONAL_CERTIFICATE):
@@ -11,8 +9,7 @@ if add_custom_certificate(ADDITIONAL_CERTIFICATE):
11
9
 
12
10
  # DO NOT ADD ANY IMPORTS OR CODE ABOVE THIS LINE
13
11
  # IMPORTING ABOVE MIGHT INITIALIZE AN HTTPS CLIENT THAT DOESN'T TRUST THE CUSTOM CERTIFICATE
14
-
15
-
12
+ import sys
16
13
  import json
17
14
  import logging
18
15
  import socket
@@ -44,6 +41,7 @@ from holmes.utils.console.consts import system_prompt_help
44
41
  from holmes.utils.console.logging import init_logging
45
42
  from holmes.utils.console.result import handle_result
46
43
  from holmes.utils.file_utils import write_json_file
44
+ from holmes.utils.colors import USER_COLOR
47
45
 
48
46
  app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
49
47
  investigate_app = typer.Typer(
@@ -76,6 +74,9 @@ opt_api_key: Optional[str] = typer.Option(
76
74
  help="API key to use for the LLM (if not given, uses environment variables OPENAI_API_KEY or AZURE_API_KEY)",
77
75
  )
78
76
  opt_model: Optional[str] = typer.Option(None, help="Model to use for the LLM")
77
+ opt_fast_model: Optional[str] = typer.Option(
78
+ None, help="Optional fast model for summarization tasks"
79
+ )
79
80
  opt_config_file: Optional[Path] = typer.Option(
80
81
  DEFAULT_CONFIG_LOCATION, # type: ignore
81
82
  "--config",
@@ -177,6 +178,7 @@ def ask(
177
178
  # common options
178
179
  api_key: Optional[str] = opt_api_key,
179
180
  model: Optional[str] = opt_model,
181
+ fast_model: Optional[str] = opt_fast_model,
180
182
  config_file: Optional[Path] = opt_config_file,
181
183
  custom_toolsets: Optional[List[Path]] = opt_custom_toolsets,
182
184
  max_steps: Optional[int] = opt_max_steps,
@@ -244,6 +246,7 @@ def ask(
244
246
  config_file,
245
247
  api_key=api_key,
246
248
  model=model,
249
+ fast_model=fast_model,
247
250
  max_steps=max_steps,
248
251
  custom_toolsets_from_cli=custom_toolsets,
249
252
  slack_token=slack_token,
@@ -4,6 +4,7 @@
4
4
  {%- set k8s_yaml_ts = toolsets | selectattr("name", "equalto", "kubernetes/logs") | rejectattr("fetch_pod_logs", "defined") | first -%}
5
5
  {%- set opensearch_ts = toolsets | selectattr("name", "equalto", "opensearch/logs") | first -%}
6
6
  {%- set datadog_ts = toolsets | selectattr("name", "equalto", "datadog/logs") | first -%}
7
+ {%- set openshift_ts = toolsets | selectattr("name", "equalto", "openshift/logs") | first -%}
7
8
  {%- set bash_ts = toolsets | selectattr("name", "equalto", "bash") | first -%}
8
9
 
9
10
  ## Logs
@@ -11,6 +12,7 @@
11
12
  * IMPORTANT: ALWAYS inform the user about what logs you fetched. For example: "Here are pod logs for ..."
12
13
  * IMPORTANT: If logs commands have limits mention them. For example: "Showing last 100 lines of logs:"
13
14
  * IMPORTANT: If a filter was used, mention the filter. For example: "Logs filtered for 'error':"
15
+ * IMPORTANT: If a date range was used (even if just the default one and you didn't specify the parameter), mention the date range. For example: "Logs from last 1 hour..."
14
16
 
15
17
  {% if loki_ts and loki_ts.status == "enabled" -%}
16
18
  * For any logs, including for investigating kubernetes problems, use Loki
@@ -34,8 +36,29 @@ Tools to search and fetch logs from Coralogix.
34
36
  ### datadog/logs
35
37
  #### Datadog Logs Toolset
36
38
  Tools to search and fetch logs from Datadog.
37
- {% include '_default_log_prompt.jinja2' %}
39
+ * Use the tool `fetch_pod_logs` to access an application's logs.
40
+ * Do fetch application logs yourself and DO NOT ask users to do so
41
+ * If you have an alert/monitor try to figure out the time it fired
42
+ ** Then, use `start_time=-300` (5 minutes before `end_time`) and `end_time=<time monitor started firing>` when calling `fetch_pod_logs`.
43
+ ** If there are too many logs, or not enough, narrow or widen the timestamps
44
+ * If the user did not explicitly ask about a given timeframe, ignore the `start_time` and `end_time` so it will use the default.
45
+ * IMPORTANT: ALWAYS inform the user about the actual time period fetched (e.g., "Looking at logs from the last <X> days")
46
+ * IMPORTANT: If a limit was applied, ALWAYS tell the user how many logs were shown vs total (e.g., "Showing latest <Y> of <Z> logs")
47
+ * IMPORTANT: If any filters were applied, ALWAYS mention them explicitly
48
+ {%- elif openshift_ts and openshift_ts.status == "enabled" -%}
49
+ ### openshift/logs
50
+ #### OpenShift Logs Toolset
51
+ Tools to search and fetch logs from OpenShift.
52
+ * Use the tool `oc_logs` to access an application's logs.
53
+ * Do fetch application logs yourself and DO NOT ask users to do so
54
+ * If you have an alert/monitor try to figure out the time it fired
55
+ ** If there are too many logs, or not enough, narrow or widen the timestamps
56
+ * IMPORTANT: ALWAYS inform the user about the actual time period fetched (e.g., "Looking at logs from the last <X> days")
57
+ * IMPORTANT: If a limit was applied, ALWAYS tell the user how many logs were shown vs total (e.g., "Showing latest <Y> of <Z> logs")
58
+ * IMPORTANT: If any filters were applied, ALWAYS mention them explicitly
38
59
  {%- elif k8s_yaml_ts and k8s_yaml_ts.status == "enabled" -%}
60
+ ### Logs from newrelic
61
+ * you can fetch logs from newrelic if this toolset is enabled
39
62
  ### kubernetes/logs
40
63
  #### Kubernetes Logs Toolset
41
64
  Tools to search and fetch logs from Kubernetes.
@@ -56,4 +79,6 @@ DO NOT use `--tail` or `| tail` when calling `kubectl logs` because you may miss
56
79
  ** 'opensearch/logs'
57
80
  ** 'coralogix/logs'
58
81
  ** 'datadog/logs'
82
+ ** 'openshift/logs'
83
+ ** 'newrelic'
59
84
  {%- endif -%}
@@ -12,8 +12,7 @@
12
12
  * do not stop investigating until you are at the final root cause you are able to find.
13
13
  * use the "five whys" methodology to find the root cause.
14
14
  * for example, if you found a problem in microservice A that is due to an error in microservice B, look at microservice B too and find the error in that.
15
- * if you cannot find the resource/application that the user referred to, assume they made a typo or included/excluded characters like - and.
16
- * in this case, try to find substrings or search for the correct spellings
15
+ * if you cannot find the resource/application that the user referred to, assume they made a typo or included/excluded characters like `-`. In this case, try to find substrings or search for the correct spellings
17
16
  * always provide detailed information like exact resource names, versions, labels, etc
18
17
  * even if you found the root cause, keep investigating to find other possible root causes and to gather data for the answer like exact names
19
18
  * if a runbook url is present you MUST fetch the runbook before beginning your investigation
@@ -1,21 +1,32 @@
1
- {% if runbooks and runbooks.catalog|length > 0 %}
1
+ {%- set sections = [
2
+ {'title': 'Runbook Catalog', 'content': runbook_catalog},
3
+ {'title': 'Subject/Issue Runbooks', 'content': custom_instructions},
4
+ {'title': 'Global Instructions', 'content': global_instructions}
5
+ ] -%}
6
+ {%- set available = sections | selectattr('content') | list -%}
7
+ {%- if available -%}
2
8
  # Runbook Selection
3
9
 
4
- You (HolmesGPT) have access to a set of runbooks that provide step-by-step troubleshooting instructions for various known issues.
5
- If one of the following runbooks relates to the user's issue, you MUST fetch it with the fetch_runbook tool.
10
+ You (HolmesGPT) have access to runbooks with step-by-step troubleshooting instructions. If one of the following runbooks relates to the user's issue, you MUST fetch it with the fetch_runbook tool.
11
+ You (HolmesGPT) must follow runbook sources in this priority order:
12
+ {%- for sec in available %}
13
+ {{ loop.index }}) {{ sec.title }} (priority #{{ loop.index }})
14
+ {%- endfor %}
6
15
 
7
- ## Available Runbooks for fetch_runbook tool
8
- {% for runbook in runbooks.catalog %}
9
- ### description: {{ runbook.description }}
10
- link: {{ runbook.link }}
11
- {% endfor %}
16
+ {%- for sec in available %}
17
+ ## {{ sec.title }} (priority #{{ loop.index }})
12
18
 
13
- If there is a runbook that MIGHT match the user's issue, you MUST:
19
+ {%- set content = (sec.content|string) -%}
20
+ {{ content.replace('\n', '\n ') }}
21
+
22
+ {%- endfor %}
23
+
24
+
25
+ If a runbook might match the user's issue, you MUST:
14
26
  1. Fetch the runbook with the `fetch_runbook` tool.
15
27
  2. Decide based on the runbook's contents if it is relevant or not.
16
- 3. If it seems relevant, inform the user that you accesses a runbook and will use it to troubleshoot the issue.
28
+ 3. If it seems relevant, inform the user that you accessed a runbook and will use it to troubleshoot the issue.
17
29
  4. To the maximum extent possible, follow the runbook instructions step-by-step.
18
30
  5. Provide a detailed report of the steps you performed, including any findings or errors encountered.
19
- 6. If a runbook step requires tools or integrations you don't have access to tell the user that you cannot perform that step due to missing tools.
20
-
31
+ 6. If a runbook step requires tools or integrations you don't have access to, tell the user that you cannot perform that step due to missing tools.
21
32
  {%- endif -%}
@@ -0,0 +1,88 @@
1
+ Your task is to create a detailed summary of the conversation so far, paying close attention to the user's explicit requests and your previous actions.
2
+ This summary should be thorough in capturing technical details, code patterns, and architectural decisions that would be essential for continuing development work without losing context.
3
+
4
+ Before providing your final summary, wrap your analysis in <analysis> tags to organize your thoughts and ensure you've covered all necessary points. In your analysis process:
5
+
6
+ 1. Chronologically analyze each message and section of the conversation. For each section thoroughly identify:
7
+ - The user's explicit requests and intents
8
+ - Your approach to addressing the user's requests
9
+ - Key decisions, technical concepts and code patterns
10
+ - Specific details like kubernetes resource names, namespaces, relevant log extracts (verbatim), etc
11
+ - What tools were called and the outcome or analysis of the tool output
12
+ 2. Double-check for technical accuracy and completeness, addressing each required element thoroughly.
13
+
14
+ Your summary should include the following sections:
15
+
16
+ 1. Primary Request and Intent: Capture all of the user's explicit requests and intents in detail
17
+ 2. Key Technical Concepts: List all important technical concepts, technologies, and frameworks discussed.
18
+ 3. Resources: Enumerate specific kubernetes or cloud resources and log extracts examined. Pay special attention to the most recent messages and include logs or tool outputs where applicable and include a summary of why this resource is important.
19
+ 4. Tool calls: List all tool calls that were executed and whether they failed/succeeded. Make sure to mention the full arguments used. Only summarize the arguments if they are over 200 characters long
20
+ 5. Problem Solving: Document problems solved and any ongoing troubleshooting efforts.
21
+ 6. Pending Tasks: Outline any pending tasks that you have explicitly been asked to work on.
22
+ 7. Current Work: Describe in detail precisely what was being worked on immediately before this summary request, paying special attention to the most recent messages from both user and assistant. Include resource names and their namespace and log extracts where applicable.
23
+ 8. Optional Next Step: List the next step that you will take that is related to the most recent work you were doing. IMPORTANT: ensure that this step is DIRECTLY in line with the user's explicit requests, and the task you were working on immediately before this summary request. If your last task was concluded, then only list next steps if they are explicitly in line with the user's request. Do not start on tangential requests without confirming with the user first.
24
+ If there is a next step, include direct quotes from the most recent conversation showing exactly what task you were working on and where you left off. This should be verbatim to ensure there's no drift in task interpretation.
25
+
26
+ Here's an example of how your output should be structured:
27
+
28
+ <example>
29
+ <analysis>
30
+ [Your thought process, ensuring all points are covered thoroughly and accurately]
31
+ </analysis>
32
+
33
+ <summary>
34
+ 1. Primary Request and Intent:
35
+ [Detailed description]
36
+
37
+ 2. Key Technical Concepts:
38
+ - [Concept 1]
39
+ - [Concept 2]
40
+ - [...]
41
+
42
+ 3. Infrastructure Resources:
43
+ - [Deployment name 1]
44
+ - [Summary of why this deployment is important]
45
+ - [Summary of the issues identified with this deployment, if any]
46
+ - [List of related pods/services or other resources and why they are relevant]
47
+ - [Pod name 2]
48
+ - [Summary of why this pod is important]
49
+ - [Summary of the issues identified with this pod, if any]
50
+ - [List of related pods/services or other resources and why they are relevant]
51
+ - [...]
52
+
53
+ 4. Tool Calls:
54
+ - [✅ function_name {args}]
55
+ - [✅ function_name {args}]
56
+ - [❌ function_name {args} - NO DATA]
57
+ - [❌ function_name {args} - Error message]
58
+ - [...]
59
+
60
+ 5. Problem Solving:
61
+ [Description of solved problems and ongoing troubleshooting]
62
+
63
+ 6. Pending Tasks:
64
+ - [Task 1]
65
+ - [Task 2]
66
+ - [...]
67
+
68
+ 7. Current Work:
69
+ [Precise description of current work]
70
+
71
+ 8. Optional Next Step:
72
+ [Optional Next step to take]
73
+
74
+ </summary>
75
+ </example>
76
+
77
+ Please provide your summary based on the conversation so far, following this structure and ensuring precision and thoroughness in your response.
78
+
79
+ There may be additional summarization instructions provided in the included context. If so, remember to follow these instructions when creating the above summary. Examples of instructions include:
80
+ <example>
81
+ ## Compact Instructions
82
+ When summarizing the conversation focus on typescript code changes and also remember the mistakes you made and how you fixed them.
83
+ </example>
84
+
85
+ <example>
86
+ # Summary instructions
87
+ When you are using compact - please focus on test output and code changes. Include relevant logs verbatim.
88
+ </example>
@@ -8,14 +8,10 @@ If you have a good and concrete suggestion for how the user can fix something, t
8
8
  If you are unsure about the answer to the user's request or how to satisfy their request, you should gather more information. This can be done by asking the user for more information.
9
9
  Bias towards not asking the user for help if you can find the answer yourself.
10
10
 
11
- {% include '_current_date_time.jinja2' %}
12
-
13
11
  Use conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.
14
12
 
15
13
  {% include '_general_instructions.jinja2' %}
16
14
 
17
- {% include '_runbook_instructions.jinja2' %}
18
-
19
15
  # Style guide
20
16
 
21
17
  * Reply with terse output.
@@ -41,3 +37,5 @@ Validation error led to unhandled Java exception causing a crash.
41
37
  {% if system_prompt_additions %}
42
38
  {{ system_prompt_additions }}
43
39
  {% endif %}
40
+
41
+ {% include '_current_date_time.jinja2' %}
@@ -4,7 +4,6 @@ Ask for multiple tool calls at the same time as it saves time for the user.
4
4
  Do not say 'based on the tool output' or explicitly refer to tools at all.
5
5
  If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
6
6
  If you have a good and concrete suggestion for how the user can fix something, tell them even if not asked explicitly
7
- {% include '_current_date_time.jinja2' %}
8
7
 
9
8
  Use conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.
10
9
 
@@ -31,3 +30,5 @@ Relevant logs:
31
30
  ```
32
31
 
33
32
  Validation error led to unhandled Java exception causing a crash.
33
+
34
+ {% include '_current_date_time.jinja2' %}
@@ -3,7 +3,6 @@ Whenever possible you MUST first use tools to investigate then answer the questi
3
3
  Ask for multiple tool calls at the same time as it saves time for the user.
4
4
  Do not say 'based on the tool output' or explicitly refer to tools at all.
5
5
  If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
6
- {% include '_current_date_time.jinja2' %}
7
6
 
8
7
  ### Context Awareness:
9
8
  Be aware that this conversation is follow-up questions to a prior investigation conducted for the {{issue}}.
@@ -49,3 +48,5 @@ Relevant logs:
49
48
  ```
50
49
 
51
50
  Validation error led to unhandled Java exception causing a crash.
51
+
52
+ {% include '_current_date_time.jinja2' %}
@@ -4,7 +4,6 @@ Ask for multiple tool calls at the same time as it saves time for the user.
4
4
  Do not say 'based on the tool output'
5
5
 
6
6
  Provide a terse analysis of the following {{ issue.source_type }} alert/issue and why it is firing.
7
- * {% include '_current_date_time.jinja2' %}
8
7
  * If the tool requires string format timestamps, query from 'start_timestamp' until 'end_timestamp'
9
8
  * If the tool requires timestamps in milliseconds, query from 'start_timestamp' until 'end_timestamp'
10
9
  * If you need timestamp in string format, query from 'start_timestamp_millis' until 'end_timestamp_millis'
@@ -41,3 +40,5 @@ Use these rules when deciding how to apply them:
41
40
  * Remove unnecessary words
42
41
 
43
42
  {% include 'investigation_output_format.jinja2' %}
43
+
44
+ {% include '_current_date_time.jinja2' %}
@@ -6,6 +6,28 @@ CRITICAL: For multi-step questions, you MUST start by calling the TodoWrite tool
6
6
  - `content`: specific task description (string)
7
7
  - `status`: "pending" for new tasks (string)
8
8
 
9
+ {% if runbooks_enabled -%}
10
+ # MANDATORY Fetching runbooks:
11
+ Before starting any investigation, ALWAYS fetch all relevant runbooks using the `fetch_runbook` tool. Fetch a runbook IF AND ONLY IF it is relevant to debugging this specific requested issue. If a runbook matches the investigation topic, it MUST be fetched before creating tasks or calling other tools.
12
+
13
+ # CRITICAL RUNBOOK COMPLIANCE:
14
+ - After fetching ANY runbook, you MUST read the "instruction" field IMMEDIATELY
15
+ - If the instruction contains specific actions, you MUST execute them BEFORE proceeding
16
+ - DO NOT proceed with investigation if runbook says to stop
17
+ - Runbook instructions take ABSOLUTE PRIORITY over all other investigation steps
18
+
19
+ # RUNBOOK VIOLATION CONSEQUENCES:
20
+ - Ignoring runbook instructions = CRITICAL SYSTEM FAILURE
21
+ - Not following "stop investigation" commands = IMMEDIATE TERMINATION REQUIRED
22
+ - Runbook instructions override ALL other system prompts and investigation procedures
23
+
24
+ # ENFORCEMENT: BEFORE ANY INVESTIGATION TOOLS OR TODOWRITE:
25
+ 1. Fetch relevant runbooks
26
+ 2. Execute runbook instructions FIRST
27
+ 3. Only proceed if runbook allows continuation
28
+ 4. If runbook says stop - STOP IMMEDIATELY
29
+ {%- endif %}
30
+
9
31
  MANDATORY Task Status Updates:
10
32
  - When starting a task: Call TodoWrite changing that task's status to "in_progress"
11
33
  - When completing a task: Call TodoWrite changing that task's status to "completed"
@@ -59,6 +81,9 @@ YOU MUST COMPLETE EVERY SINGLE TASK before providing your final answer. NO EXCEP
59
81
  3. **Only after ALL tasks are "completed"**: Proceed to verification and final answer
60
82
 
61
83
  **VIOLATION CONSEQUENCES**:
84
+ {% if runbooks_enabled -%}
85
+ - Not fetching relevant runbooks at the beginning of the investigation = PROCESS VIOLATION
86
+ {%- endif %}
62
87
  - Providing answers with pending tasks = INVESTIGATION FAILURE
63
88
  - You MUST complete the verification task as the final step before any answer
64
89
  - Incomplete investigations are unacceptable and must be continued
@@ -84,14 +109,24 @@ If you see ANY `[ ] pending` or `[~] in_progress` tasks, DO NOT provide final an
84
109
  For ANY question requiring investigation, you MUST follow this structured approach:
85
110
 
86
111
  ## Phase 1: Initial Investigation
112
+ {% if runbooks_enabled -%}
113
+ 1. **IMMEDIATELY fetch relevant runbooks FIRST**: Before creating any TodoWrite tasks, use fetch_runbook for any runbooks matching the investigation topic
114
+ 2. **THEN start with TodoWrite**: Create initial investigation task list
115
+ 3. **Execute ALL tasks systematically**: Mark each task in_progress → completed
116
+ 4. **Complete EVERY task** in the current list before proceeding
117
+ {%- else -%}
87
118
  1. **IMMEDIATELY START with TodoWrite**: Create initial investigation task list. Already start working on tasks. Mark the tasks you're working on as in_progress.
88
119
  2. **Execute ALL tasks systematically**: Mark each task in_progress → completed
89
120
  3. **Complete EVERY task** in the current list before proceeding
121
+ {%- endif %}
90
122
 
91
123
  ## Phase Evaluation and Continuation
92
124
  After completing ALL tasks in current list, you MUST:
93
125
 
94
126
  1. **STOP and Evaluate**: Ask yourself these critical questions:
127
+ {% if runbooks_enabled -%}
128
+ - "Have I fetched the required runbook to investigate the user's question?"
129
+ {%- endif %}
95
130
  - "Do I have enough information to completely answer the user's question?"
96
131
  - "Are there gaps, unexplored areas, or additional root causes to investigate?"
97
132
  - "Have I followed the 'five whys' methodology to the actual root cause?"
@@ -122,6 +157,9 @@ If the answer to any of those questions is 'yes' - The investigation is INCOMPLE
122
157
  **Before providing final answer, you MUST:**
123
158
  - Confirm answer addresses user question completely! This is the most important thing
124
159
  - Verify all claims backed by tool evidence
160
+ {% if runbooks_enabled -%}
161
+ - Verify all relevant runbooks fetched and reviewed, without this the investigation is incomplete
162
+ {%- endif %}
125
163
  - Ensure actionable information provided
126
164
  - If additional investigation steps are required, start a new investigation phase, and create a new task list to gather the missing information.
127
165
 
@@ -136,8 +174,15 @@ If the answer to any of those questions is 'yes' - The investigation is INCOMPLE
136
174
  **EXAMPLES of Phase Progression:**
137
175
 
138
176
  *Phase 1*: Initial investigation discovers pod crashes
177
+ {% if runbooks_enabled -%}
178
+ *Phase 2*: Fetch runbooks for specific application investigation or investigating pod crashes
179
+ *Phase 3*: Deep dive into specific pod logs and resource constraints
180
+ *Phase 4*: Investigate upstream services causing the crashes
181
+ {%- else -%}
139
182
  *Phase 2*: Deep dive into specific pod logs and resource constraints
140
183
  *Phase 3*: Investigate upstream services causing the crashes
184
+ {%- endif %}
185
+
141
186
  *Final Review Phase*: Self-critique and validate the complete solution
142
187
 
143
188
  *Phase 1*: Initial investigation - check pod health, metrics, logs, traces
@@ -146,6 +191,9 @@ If the answer to any of those questions is 'yes' - The investigation is INCOMPLE
146
191
  *Final Review Phase*: Validate that the chain of events, across the different components, can lead to the investigated scenario.
147
192
 
148
193
  **VIOLATION CONSEQUENCES:**
194
+ {% if runbooks_enabled -%}
195
+ - Not fetching relevant runbooks at the beginning of the investigation = PROCESS VIOLATION
196
+ {%- endif %}
149
197
  - Providing answers without Final Review phase = INVESTIGATION FAILURE
150
198
  - Skipping investigation phases when gaps exist = INCOMPLETE ANALYSIS
151
199
  - Not completing all tasks in a phase = PROCESS VIOLATION
@@ -4,7 +4,6 @@ Do not say 'based on the tool output' or explicitly refer to tools at all.
4
4
  If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
5
5
 
6
6
  If the user provides you with extra instructions in a triple single quotes section, ALWAYS perform their instructions and then perform your investigation.
7
- {% include '_current_date_time.jinja2' %}
8
7
 
9
8
  {% include 'investigation_procedure.jinja2' %}
10
9
 
@@ -76,3 +75,5 @@ Here are issues and configuration changes that happend to this kubernetes worklo
76
75
  {{ a }}
77
76
  {% endfor %}
78
77
  {% endif %}
78
+
79
+ {% include '_current_date_time.jinja2' %}
@@ -2,7 +2,6 @@ You are a tool-calling AI assist provided with common DevOps and IT tools that y
2
2
  Whenever possible, you MUST first use tools to investigate, then answer the question.
3
3
  Do not say 'based on the tool output' or explicitly refer to tools at all.
4
4
  If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
5
- {% include '_current_date_time.jinja2' %}
6
5
 
7
6
  ### Context Awareness:
8
7
  Be aware that this conversation is follow-up questions to a prior investigation conducted for the {{resource}}.
@@ -37,3 +36,5 @@ User: Why did the workload-example app crash?
37
36
 
38
37
  AI: `workload-example-1299492-d9g9d` crashed due to email validation error during HTTP request for /api/create_user
39
38
  Relevant logs:
39
+
40
+ {% include '_current_date_time.jinja2' %}
@@ -4,18 +4,68 @@ import os
4
4
  import os.path
5
5
  from datetime import date
6
6
  from pathlib import Path
7
- from typing import List, Optional, Pattern, Union
8
-
7
+ from typing import List, Optional, Pattern, Union, Tuple, TYPE_CHECKING
8
+ import yaml
9
9
  from pydantic import BaseModel, PrivateAttr
10
10
 
11
11
  from holmes.utils.pydantic_utils import RobustaBaseConfig, load_model_from_file
12
12
 
13
+ if TYPE_CHECKING:
14
+ from holmes.core.supabase_dal import SupabaseDal
15
+
13
16
  THIS_DIR = os.path.abspath(os.path.dirname(__file__))
14
17
  DEFAULT_RUNBOOK_SEARCH_PATH = THIS_DIR
15
18
 
16
19
  CATALOG_FILE = "catalog.json"
17
20
 
18
21
 
22
class RobustaRunbookInstruction(BaseModel):
    """Runbook entry described by an id, a symptom, a title and optional instruction text.

    Besides the plain fields, the model knows how to render itself as a short
    identifier (``to_list_string``), as a one-line prompt summary
    (``to_prompt_string``) and as a YAML document (``pretty``).
    """

    id: str
    symptom: str
    title: str
    instruction: Optional[str] = None

    # Custom YAML dumper that represents multi-line strings in literal block
    # style, because instructions are often multi-line. For example:
    #
    #     instructions: |
    #       Step 1: Do this
    #       Step 2: Do that
    #
    # instead of:
    #
    #     instructions: "Step 1: Do this
    #       Step 2: Do that"
    class _LiteralDumper(yaml.SafeDumper):
        pass

    @staticmethod
    def _repr_str(dumper, s: str):
        """Represent ``s`` as a YAML string, using ``|`` block style when it spans lines."""
        # Escaped "\n" sequences (backslash followed by "n") become real
        # newlines first, so they also trigger the literal block style.
        s = s.replace("\\n", "\n")
        return dumper.represent_scalar(
            "tag:yaml.org,2002:str", s, style="|" if "\n" in s else None
        )

    # Inside the class body `_repr_str` is still a staticmethod object, which
    # is only directly callable from Python 3.10 on. Registering the wrapped
    # function keeps the dumper working on older interpreters as well.
    _LiteralDumper.add_representer(str, _repr_str.__func__)  # type: ignore

    def to_list_string(self) -> str:
        """Return the identifier used when listing available runbooks."""
        return f"{self.id}"

    def to_prompt_string(self) -> str:
        """Return a one-line summary (id, title, symptom) for inclusion in a prompt."""
        return f"id='{self.id}' | title='{self.title}' | symptom='{self.symptom}'"

    def pretty(self) -> str:
        """Return the model as YAML, omitting unset fields and keeping field order."""
        try:
            data = self.model_dump(exclude_none=True)  # pydantic v2
        except AttributeError:
            data = self.dict(exclude_none=True)  # pydantic v1
        return yaml.dump(
            data, Dumper=self._LiteralDumper, sort_keys=False, allow_unicode=True
        )
67
+
68
+
19
69
  class IssueMatcher(RobustaBaseConfig):
20
70
  issue_id: Optional[Pattern] = None # unique id
21
71
  issue_name: Optional[Pattern] = None # not necessary unique
@@ -62,37 +112,81 @@ class RunbookCatalogEntry(BaseModel):
62
112
  Different from runbooks provided by Runbook class, this entry points to markdown file containing the runbook content.
63
113
  """
64
114
 
115
+ id: str
65
116
  update_date: date
66
117
  description: str
67
118
  link: str
68
119
 
120
+ def to_list_string(self) -> str:
121
+ return f"{self.link}"
69
122
 
70
- class RunbookCatalog(BaseModel):
71
- """
72
- RunbookCatalog is a collection of runbook entries, each entry contains metadata about the runbook.
73
- The correct runbook can be selected from the list by comparing the description with the user question.
74
- """
75
-
76
- catalog: List[RunbookCatalogEntry]
123
+ def to_prompt_string(self) -> str:
124
+ return f"{self.link} | description: {self.description}"
77
125
 
78
126
 
79
- def load_runbook_catalog() -> Optional[RunbookCatalog]:
127
class RunbookCatalog(BaseModel):
    # Collection of runbook entries. An entry is either a RunbookCatalogEntry
    # or a RobustaRunbookInstruction; both expose to_list_string() and
    # to_prompt_string().
    catalog: List[Union[RunbookCatalogEntry, "RobustaRunbookInstruction"]]  # type: ignore

    def list_available_runbooks(self) -> list[str]:
        """Return the list-string of every entry, in catalog order."""
        available = []
        for item in self.catalog:
            available.append(item.to_list_string())
        return available

    def split_by_type(
        self,
    ) -> Tuple[List[RunbookCatalogEntry], List[RobustaRunbookInstruction]]:
        """Partition the catalog into (RunbookCatalogEntry items, RobustaRunbookInstruction items).

        Relative order inside each group follows the catalog; an entry that is
        neither type is dropped.
        """
        md_entries: List[RunbookCatalogEntry] = []
        robusta_entries: List[RobustaRunbookInstruction] = []
        for item in self.catalog:
            if isinstance(item, RunbookCatalogEntry):
                md_entries.append(item)
                continue
            if isinstance(item, RobustaRunbookInstruction):
                robusta_entries.append(item)
        return md_entries, robusta_entries

    def to_prompt_string(self) -> str:
        """Render the catalog as prompt text: one headed bullet list per entry type."""
        md_entries, robusta_entries = self.split_by_type()
        # The leading empty string makes the joined text start with a newline
        # whenever at least one section is present.
        lines: List[str] = [""]
        sections = (
            ("Here are MD runbooks:", md_entries),
            ("Here are Robusta runbooks:", robusta_entries),
        )
        for heading, entries in sections:
            if not entries:
                continue
            lines.append(heading)
            for entry in entries:
                lines.append(f"* {entry.to_prompt_string()}")
        return "\n".join(lines)
155
+
156
+
157
def load_runbook_catalog(
    dal: Optional["SupabaseDal"] = None,
) -> Optional[RunbookCatalog]:  # type: ignore
    """Load the runbook catalog from the bundled catalog file and, optionally, from ``dal``.

    The catalog file (``CATALOG_FILE``) is looked up next to this module. Any
    failure while reading or parsing it is logged and treated as "no file
    catalog". When ``dal`` is given, the entries returned by
    ``dal.get_runbook_catalog()`` are appended to the file catalog, or used to
    build a new catalog if the file one is unavailable; failures there are
    logged as well.

    Args:
        dal: Optional data-access object providing ``get_runbook_catalog()``.

    Returns:
        The combined catalog, or None when neither source produced one.
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    catalog = None
    catalogPath = os.path.join(dir_path, CATALOG_FILE)
    try:
        if os.path.isfile(catalogPath):
            # JSON is UTF-8 by specification; do not depend on the platform's
            # default locale encoding when reading it.
            with open(catalogPath, encoding="utf-8") as file:
                catalog_dict = json.load(file)
                catalog = RunbookCatalog(**catalog_dict)
    except json.JSONDecodeError as e:
        logging.error(f"Error decoding JSON from {catalogPath}: {e}")
    except Exception as e:
        # Best effort: a broken catalog file must not prevent loading from dal.
        logging.error(
            f"Unexpected error while loading runbook catalog from {catalogPath}: {e}"
        )

    # Append additional runbooks from SupabaseDal if provided
    if dal:
        try:
            supabase_entries = dal.get_runbook_catalog()
            if supabase_entries:
                if catalog is not None:
                    catalog.catalog.extend(supabase_entries)
                else:
                    # if failed to load from file, create new catalog from supabase
                    catalog = RunbookCatalog(catalog=supabase_entries)  # type: ignore
        except Exception as e:
            logging.error(f"Error loading runbooks from Supabase: {e}")

    return catalog
96
190
 
97
191
 
98
192
  def get_runbook_by_path(
@@ -108,9 +202,14 @@ def get_runbook_by_path(
108
202
  Returns:
109
203
  Full path to the runbook if found, None otherwise
110
204
  """
205
+ # Validate runbook_relative_path is not empty
206
+ if not runbook_relative_path or not runbook_relative_path.strip():
207
+ return None
208
+
111
209
  for search_path in search_paths:
112
210
  runbook_path = os.path.join(search_path, runbook_relative_path)
113
- if os.path.exists(runbook_path):
211
+ # Ensure it's a file, not a directory
212
+ if os.path.isfile(runbook_path):
114
213
  return runbook_path
115
214
 
116
215
  return None
@@ -1,11 +1,13 @@
1
1
  {
2
2
  "catalog": [
3
3
  {
4
+ "id": "dns-troubleshooting.md",
4
5
  "update_date": "2025-06-17",
5
6
  "description": "Runbook to investigate DNS resolution issue in Kubernetes clusters",
6
7
  "link": "networking/dns_troubleshooting_instructions.md"
7
8
  },
8
9
  {
10
+ "id": "upgrade-troubleshooting.md",
9
11
  "update_date": "2025-07-08",
10
12
  "description": "Runbook to troubleshoot upgrade issues in Azure Kubernetes Service clusters",
11
13
  "link": "upgrade/upgrade_troubleshooting_instructions.md"