holmesgpt 0.13.2__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. holmes/__init__.py +3 -5
  2. holmes/clients/robusta_client.py +20 -6
  3. holmes/common/env_vars.py +58 -3
  4. holmes/common/openshift.py +1 -1
  5. holmes/config.py +123 -148
  6. holmes/core/conversations.py +71 -15
  7. holmes/core/feedback.py +191 -0
  8. holmes/core/investigation.py +31 -39
  9. holmes/core/investigation_structured_output.py +3 -3
  10. holmes/core/issue.py +1 -1
  11. holmes/core/llm.py +508 -88
  12. holmes/core/models.py +108 -4
  13. holmes/core/openai_formatting.py +14 -1
  14. holmes/core/prompt.py +48 -3
  15. holmes/core/runbooks.py +1 -0
  16. holmes/core/safeguards.py +8 -6
  17. holmes/core/supabase_dal.py +295 -100
  18. holmes/core/tool_calling_llm.py +489 -428
  19. holmes/core/tools.py +325 -56
  20. holmes/core/tools_utils/token_counting.py +21 -0
  21. holmes/core/tools_utils/tool_context_window_limiter.py +40 -0
  22. holmes/core/tools_utils/tool_executor.py +0 -13
  23. holmes/core/tools_utils/toolset_utils.py +1 -0
  24. holmes/core/toolset_manager.py +191 -5
  25. holmes/core/tracing.py +19 -3
  26. holmes/core/transformers/__init__.py +23 -0
  27. holmes/core/transformers/base.py +63 -0
  28. holmes/core/transformers/llm_summarize.py +175 -0
  29. holmes/core/transformers/registry.py +123 -0
  30. holmes/core/transformers/transformer.py +32 -0
  31. holmes/core/truncation/compaction.py +94 -0
  32. holmes/core/truncation/dal_truncation_utils.py +23 -0
  33. holmes/core/truncation/input_context_window_limiter.py +219 -0
  34. holmes/interactive.py +228 -31
  35. holmes/main.py +23 -40
  36. holmes/plugins/interfaces.py +2 -1
  37. holmes/plugins/prompts/__init__.py +2 -1
  38. holmes/plugins/prompts/_fetch_logs.jinja2 +31 -6
  39. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +24 -12
  41. holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
  42. holmes/plugins/prompts/conversation_history_compaction.jinja2 +89 -0
  43. holmes/plugins/prompts/generic_ask.jinja2 +0 -4
  44. holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -1
  45. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -1
  46. holmes/plugins/prompts/generic_investigation.jinja2 +0 -1
  47. holmes/plugins/prompts/investigation_procedure.jinja2 +50 -1
  48. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -1
  49. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -1
  50. holmes/plugins/runbooks/__init__.py +145 -17
  51. holmes/plugins/runbooks/catalog.json +2 -0
  52. holmes/plugins/sources/github/__init__.py +4 -2
  53. holmes/plugins/sources/prometheus/models.py +1 -0
  54. holmes/plugins/toolsets/__init__.py +44 -27
  55. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  56. holmes/plugins/toolsets/aks.yaml +64 -0
  57. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +38 -47
  58. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
  59. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  60. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
  61. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
  62. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
  63. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -13
  64. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +15 -12
  65. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +15 -12
  66. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +11 -11
  67. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +11 -9
  68. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +15 -12
  69. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +15 -15
  70. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +11 -8
  71. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +11 -8
  72. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +11 -8
  73. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +11 -8
  74. holmes/plugins/toolsets/azure_sql/utils.py +0 -32
  75. holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
  76. holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
  77. holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
  78. holmes/plugins/toolsets/bash/bash_toolset.py +11 -15
  79. holmes/plugins/toolsets/bash/common/bash.py +23 -13
  80. holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
  81. holmes/plugins/toolsets/bash/common/stringify.py +1 -1
  82. holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
  83. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
  84. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
  85. holmes/plugins/toolsets/bash/parse_command.py +12 -13
  86. holmes/plugins/toolsets/cilium.yaml +284 -0
  87. holmes/plugins/toolsets/connectivity_check.py +124 -0
  88. holmes/plugins/toolsets/coralogix/api.py +132 -119
  89. holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
  90. holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
  91. holmes/plugins/toolsets/coralogix/utils.py +15 -79
  92. holmes/plugins/toolsets/datadog/datadog_api.py +525 -26
  93. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +55 -11
  94. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
  95. holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
  96. holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
  97. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
  98. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +417 -241
  99. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +234 -214
  100. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +167 -79
  101. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +374 -363
  102. holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
  103. holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
  104. holmes/plugins/toolsets/elasticsearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  105. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist.py +78 -0
  106. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist_instructions.jinja2 +223 -0
  107. holmes/plugins/toolsets/git.py +54 -50
  108. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
  109. holmes/plugins/toolsets/grafana/common.py +13 -29
  110. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +455 -0
  111. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +25 -0
  112. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +191 -0
  113. holmes/plugins/toolsets/grafana/loki_api.py +4 -0
  114. holmes/plugins/toolsets/grafana/toolset_grafana.py +293 -89
  115. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +49 -0
  116. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  117. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +820 -292
  118. holmes/plugins/toolsets/grafana/trace_parser.py +4 -3
  119. holmes/plugins/toolsets/internet/internet.py +15 -16
  120. holmes/plugins/toolsets/internet/notion.py +9 -11
  121. holmes/plugins/toolsets/investigator/core_investigation.py +44 -36
  122. holmes/plugins/toolsets/investigator/model.py +3 -1
  123. holmes/plugins/toolsets/json_filter_mixin.py +134 -0
  124. holmes/plugins/toolsets/kafka.py +36 -42
  125. holmes/plugins/toolsets/kubernetes.yaml +317 -113
  126. holmes/plugins/toolsets/kubernetes_logs.py +9 -9
  127. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  128. holmes/plugins/toolsets/logging_utils/logging_api.py +94 -8
  129. holmes/plugins/toolsets/mcp/toolset_mcp.py +218 -64
  130. holmes/plugins/toolsets/newrelic/new_relic_api.py +165 -0
  131. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +65 -0
  132. holmes/plugins/toolsets/newrelic/newrelic.py +320 -0
  133. holmes/plugins/toolsets/openshift.yaml +283 -0
  134. holmes/plugins/toolsets/prometheus/prometheus.py +1202 -421
  135. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +54 -5
  136. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  137. holmes/plugins/toolsets/rabbitmq/api.py +23 -4
  138. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +13 -14
  139. holmes/plugins/toolsets/robusta/robusta.py +239 -68
  140. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  141. holmes/plugins/toolsets/runbook/runbook_fetcher.py +157 -27
  142. holmes/plugins/toolsets/service_discovery.py +1 -1
  143. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  144. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  145. holmes/plugins/toolsets/utils.py +88 -0
  146. holmes/utils/config_utils.py +91 -0
  147. holmes/utils/connection_utils.py +31 -0
  148. holmes/utils/console/result.py +10 -0
  149. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  150. holmes/utils/env.py +7 -0
  151. holmes/utils/file_utils.py +2 -1
  152. holmes/utils/global_instructions.py +60 -11
  153. holmes/utils/holmes_status.py +6 -4
  154. holmes/utils/holmes_sync_toolsets.py +0 -2
  155. holmes/utils/krr_utils.py +188 -0
  156. holmes/utils/log.py +15 -0
  157. holmes/utils/markdown_utils.py +2 -3
  158. holmes/utils/memory_limit.py +58 -0
  159. holmes/utils/sentry_helper.py +64 -0
  160. holmes/utils/stream.py +69 -8
  161. holmes/utils/tags.py +4 -3
  162. holmes/version.py +37 -15
  163. holmesgpt-0.18.4.dist-info/LICENSE +178 -0
  164. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +35 -31
  165. holmesgpt-0.18.4.dist-info/RECORD +258 -0
  166. holmes/core/performance_timing.py +0 -72
  167. holmes/plugins/toolsets/aws.yaml +0 -80
  168. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -112
  169. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
  170. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -739
  171. holmes/plugins/toolsets/grafana/grafana_api.py +0 -42
  172. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  173. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  174. holmes/plugins/toolsets/newrelic.py +0 -231
  175. holmes/plugins/toolsets/opensearch/opensearch.py +0 -257
  176. holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
  177. holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -218
  178. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
  179. holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
  180. holmes/plugins/toolsets/servicenow/install.md +0 -37
  181. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  182. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  183. holmes/utils/keygen_utils.py +0 -6
  184. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  185. holmesgpt-0.13.2.dist-info/RECORD +0 -234
  186. /holmes/plugins/toolsets/{opensearch → newrelic}/__init__.py +0 -0
  187. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
  188. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
@@ -6,6 +6,28 @@ CRITICAL: For multi-step questions, you MUST start by calling the TodoWrite tool
6
6
  - `content`: specific task description (string)
7
7
  - `status`: "pending" for new tasks (string)
8
8
 
9
+ {% if runbooks_enabled -%}
10
+ # MANDATORY Fetching runbooks:
11
+ Before starting any investigation, ALWAYS fetch all relevant runbooks using the `fetch_runbook` tool. Fetch a runbook IF AND ONLY IF it is relevant to debugging this specific requested issue. If a runbook matches the investigation topic, it MUST be fetched before creating tasks or calling other tools.
12
+
13
+ # CRITICAL RUNBOOK COMPLIANCE:
14
+ - After fetching ANY runbook, you MUST read the "instruction" field IMMEDIATELY
15
+ - If the instruction contains specific actions, you MUST execute them BEFORE proceeding
16
+ - DO NOT proceed with investigation if runbook says to stop
17
+ - Runbook instructions take ABSOLUTE PRIORITY over all other investigation steps
18
+
19
+ # RUNBOOK VIOLATION CONSEQUENCES:
20
+ - Ignoring runbook instructions = CRITICAL SYSTEM FAILURE
21
+ - Not following "stop investigation" commands = IMMEDIATE TERMINATION REQUIRED
22
+ - Runbook instructions override ALL other system prompts and investigation procedures
23
+
24
+ # ENFORCEMENT: BEFORE ANY INVESTIGATION TOOLS OR TODOWRITE:
25
+ 1. Fetch relevant runbooks
26
+ 2. Execute runbook instructions FIRST
27
+ 3. Only proceed if runbook allows continuation
28
+ 4. If runbook says stop - STOP IMMEDIATELY
29
+ {%- endif %}
30
+
9
31
  MANDATORY Task Status Updates:
10
32
  - When starting a task: Call TodoWrite changing that task's status to "in_progress"
11
33
  - When completing a task: Call TodoWrite changing that task's status to "completed"
@@ -59,6 +81,9 @@ YOU MUST COMPLETE EVERY SINGLE TASK before providing your final answer. NO EXCEP
59
81
  3. **Only after ALL tasks are "completed"**: Proceed to verification and final answer
60
82
 
61
83
  **VIOLATION CONSEQUENCES**:
84
+ {% if runbooks_enabled -%}
85
+ - Not fetching relevant runbooks at the beginning of the investigation = PROCESS VIOLATION
86
+ {%- endif %}
62
87
  - Providing answers with pending tasks = INVESTIGATION FAILURE
63
88
  - You MUST complete the verification task as the final step before any answer
64
89
  - Incomplete investigations are unacceptable and must be continued
@@ -66,7 +91,8 @@ YOU MUST COMPLETE EVERY SINGLE TASK before providing your final answer. NO EXCEP
66
91
  **Task Status Check Example:**
67
92
  Before final answer, confirm you see something like:
68
93
  [✓] completed - Task 1
69
- [✓] completed - Task 2[✓] completed - Task 3
94
+ [✓] completed - Task 2
95
+ [✓] completed - Task 3
70
96
  [✓] completed - Investigation Verification
71
97
 
72
98
  If you see ANY `[ ] pending` or `[~] in_progress` tasks, DO NOT provide final answer.
@@ -84,14 +110,24 @@ If you see ANY `[ ] pending` or `[~] in_progress` tasks, DO NOT provide final an
84
110
  For ANY question requiring investigation, you MUST follow this structured approach:
85
111
 
86
112
  ## Phase 1: Initial Investigation
113
+ {% if runbooks_enabled -%}
114
+ 1. **IMMEDIATELY fetch relevant runbooks FIRST**: Before creating any TodoWrite tasks, use fetch_runbook for any runbooks matching the investigation topic
115
+ 2. **THEN start with TodoWrite**: Create initial investigation task list
116
+ 3. **Execute ALL tasks systematically**: Mark each task in_progress → completed
117
+ 4. **Complete EVERY task** in the current list before proceeding
118
+ {%- else -%}
87
119
  1. **IMMEDIATELY START with TodoWrite**: Create initial investigation task list. Already start working on tasks. Mark the tasks you're working on as in_progress.
88
120
  2. **Execute ALL tasks systematically**: Mark each task in_progress → completed
89
121
  3. **Complete EVERY task** in the current list before proceeding
122
+ {%- endif %}
90
123
 
91
124
  ## Phase Evaluation and Continuation
92
125
  After completing ALL tasks in current list, you MUST:
93
126
 
94
127
  1. **STOP and Evaluate**: Ask yourself these critical questions:
128
+ {% if runbooks_enabled -%}
129
+ - "Have I fetched the required runbook to investigate the user's question?"
130
+ {%- endif %}
95
131
  - "Do I have enough information to completely answer the user's question?"
96
132
  - "Are there gaps, unexplored areas, or additional root causes to investigate?"
97
133
  - "Have I followed the 'five whys' methodology to the actual root cause?"
@@ -122,6 +158,9 @@ If the answer to any of those questions is 'yes' - The investigation is INCOMPLE
122
158
  **Before providing final answer, you MUST:**
123
159
  - Confirm answer addresses user question completely! This is the most important thing
124
160
  - Verify all claims backed by tool evidence
161
+ {% if runbooks_enabled -%}
162
+ - Verify all relevant runbooks have been fetched and reviewed; without this the investigation is incomplete
163
+ {%- endif %}
125
164
  - Ensure actionable information provided
126
165
  - If additional investigation steps are required, start a new investigation phase, and create a new task list to gather the missing information.
127
166
 
@@ -136,8 +175,15 @@ If the answer to any of those questions is 'yes' - The investigation is INCOMPLE
136
175
  **EXAMPLES of Phase Progression:**
137
176
 
138
177
  *Phase 1*: Initial investigation discovers pod crashes
178
+ {% if runbooks_enabled -%}
179
+ *Phase 2*: Fetch runbooks for specific application investigation or investigating pod crashes
180
+ *Phase 3*: Deep dive into specific pod logs and resource constraints
181
+ *Phase 4*: Investigate upstream services causing the crashes
182
+ {%- else -%}
139
183
  *Phase 2*: Deep dive into specific pod logs and resource constraints
140
184
  *Phase 3*: Investigate upstream services causing the crashes
185
+ {%- endif %}
186
+
141
187
  *Final Review Phase*: Self-critique and validate the complete solution
142
188
 
143
189
  *Phase 1*: Initial investigation - check pod health, metrics, logs, traces
@@ -146,6 +192,9 @@ If the answer to any of those questions is 'yes' - The investigation is INCOMPLE
146
192
  *Final Review Phase*: Validate that the chain of events, across the different components, can lead to the investigated scenario.
147
193
 
148
194
  **VIOLATION CONSEQUENCES:**
195
+ {% if runbooks_enabled -%}
196
+ - Not fetching relevant runbooks at the beginning of the investigation = PROCESS VIOLATION
197
+ {%- endif %}
149
198
  - Providing answers without Final Review phase = INVESTIGATION FAILURE
150
199
  - Skipping investigation phases when gaps exist = INCOMPLETE ANALYSIS
151
200
  - Not completing all tasks in a phase = PROCESS VIOLATION
@@ -4,7 +4,6 @@ Do not say 'based on the tool output' or explicitly refer to tools at all.
4
4
  If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
5
5
 
6
6
  If the user provides you with extra instructions in a triple single quotes section, ALWAYS perform their instructions and then perform your investigation.
7
- {% include '_current_date_time.jinja2' %}
8
7
 
9
8
  {% include 'investigation_procedure.jinja2' %}
10
9
 
@@ -2,7 +2,6 @@ You are a tool-calling AI assist provided with common DevOps and IT tools that y
2
2
  Whenever possible, you MUST first use tools to investigate, then answer the question.
3
3
  Do not say 'based on the tool output' or explicitly refer to tools at all.
4
4
  If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
5
- {% include '_current_date_time.jinja2' %}
6
5
 
7
6
  ### Context Awareness:
8
7
  Be aware that this conversation is follow-up questions to a prior investigation conducted for the {{resource}}.
@@ -4,18 +4,70 @@ import os
4
4
  import os.path
5
5
  from datetime import date
6
6
  from pathlib import Path
7
- from typing import List, Optional, Pattern, Union
7
+ from typing import TYPE_CHECKING, List, Optional, Pattern, Tuple, Union
8
8
 
9
+ import yaml
9
10
  from pydantic import BaseModel, PrivateAttr
10
11
 
11
12
  from holmes.utils.pydantic_utils import RobustaBaseConfig, load_model_from_file
12
13
 
14
+ if TYPE_CHECKING:
15
+ from holmes.core.supabase_dal import SupabaseDal
16
+
13
17
  THIS_DIR = os.path.abspath(os.path.dirname(__file__))
14
18
  DEFAULT_RUNBOOK_SEARCH_PATH = THIS_DIR
15
19
 
16
20
  CATALOG_FILE = "catalog.json"
17
21
 
18
22
 
23
+ class RobustaRunbookInstruction(BaseModel):
24
+ id: str
25
+ symptom: str
26
+ title: str
27
+ instruction: Optional[str] = None
28
+ alerts: List[str] = []
29
+
30
+ """
31
+ Custom YAML dumper to represent multi-line strings in literal block style due to instructions often being multi-line.
32
+ for example:
33
+ instructions: |
34
+ Step 1: Do this
35
+ Step 2: Do that
36
+
37
+ instead of:
38
+ instructions: "Step 1: Do this
39
+ Step 2: Do that"
40
+
41
+ """
42
+
43
+ class _LiteralDumper(yaml.SafeDumper):
44
+ pass
45
+
46
+ @staticmethod
47
+ def _repr_str(dumper, s: str):
48
+ s = s.replace("\\n", "\n")
49
+ return dumper.represent_scalar(
50
+ "tag:yaml.org,2002:str", s, style="|" if "\n" in s else None
51
+ )
52
+
53
+ _LiteralDumper.add_representer(str, _repr_str) # type: ignore
54
+
55
+ def to_list_string(self) -> str:
56
+ return f"{self.id}"
57
+
58
+ def to_prompt_string(self) -> str:
59
+ return f"id='{self.id}' | title='{self.title}' | symptom='{self.symptom}' | relevant alerts={', '.join(self.alerts)}"
60
+
61
+ def pretty(self) -> str:
62
+ try:
63
+ data = self.model_dump(exclude_none=True) # pydantic v2
64
+ except AttributeError:
65
+ data = self.dict(exclude_none=True) # pydantic v1
66
+ return yaml.dump(
67
+ data, Dumper=self._LiteralDumper, sort_keys=False, allow_unicode=True
68
+ )
69
+
70
+
19
71
  class IssueMatcher(RobustaBaseConfig):
20
72
  issue_id: Optional[Pattern] = None # unique id
21
73
  issue_name: Optional[Pattern] = None # not necessary unique
@@ -62,37 +114,108 @@ class RunbookCatalogEntry(BaseModel):
62
114
  Different from runbooks provided by Runbook class, this entry points to markdown file containing the runbook content.
63
115
  """
64
116
 
117
+ id: str
65
118
  update_date: date
66
119
  description: str
67
120
  link: str
68
121
 
122
+ def to_list_string(self) -> str:
123
+ return f"{self.link}"
69
124
 
70
- class RunbookCatalog(BaseModel):
71
- """
72
- RunbookCatalog is a collection of runbook entries, each entry contains metadata about the runbook.
73
- The correct runbook can be selected from the list by comparing the description with the user question.
74
- """
75
-
76
- catalog: List[RunbookCatalogEntry]
125
+ def to_prompt_string(self) -> str:
126
+ return f"{self.link} | description: {self.description}"
77
127
 
78
128
 
79
- def load_runbook_catalog() -> Optional[RunbookCatalog]:
129
+ class RunbookCatalog(BaseModel):
130
+ catalog: List[Union[RunbookCatalogEntry, "RobustaRunbookInstruction"]] # type: ignore
131
+
132
+ def list_available_runbooks(self) -> list[str]:
133
+ return [entry.to_list_string() for entry in self.catalog]
134
+
135
+ def split_by_type(
136
+ self,
137
+ ) -> Tuple[List[RunbookCatalogEntry], List[RobustaRunbookInstruction]]:
138
+ md: List[RunbookCatalogEntry] = []
139
+ robusta: List[RobustaRunbookInstruction] = [] #
140
+ for catalog_entry in self.catalog:
141
+ if isinstance(catalog_entry, RunbookCatalogEntry):
142
+ md.append(catalog_entry)
143
+ elif isinstance(catalog_entry, RobustaRunbookInstruction):
144
+ robusta.append(catalog_entry)
145
+ return md, robusta
146
+
147
+ def to_prompt_string(self) -> str:
148
+ md, robusta = self.split_by_type()
149
+ parts: List[str] = [""]
150
+ if md:
151
+ parts.append("Here are MD runbooks:")
152
+ parts.extend(f"* {e.to_prompt_string()}" for e in md)
153
+ if robusta:
154
+ parts.append("\nHere are Robusta runbooks:")
155
+ parts.extend(f"* {e.to_prompt_string()}" for e in robusta)
156
+ return "\n".join(parts)
157
+
158
+
159
+ def load_runbook_catalog(
160
+ dal: Optional["SupabaseDal"] = None,
161
+ custom_catalog_paths: Optional[List[Union[str, Path]]] = None,
162
+ ) -> Optional[RunbookCatalog]: # type: ignore
80
163
  dir_path = os.path.dirname(os.path.realpath(__file__))
81
-
164
+ catalog = None
82
165
  catalogPath = os.path.join(dir_path, CATALOG_FILE)
83
- if not os.path.isfile(catalogPath):
84
- return None
85
166
  try:
86
- with open(catalogPath) as file:
87
- catalog_dict = json.load(file)
88
- return RunbookCatalog(**catalog_dict)
167
+ if os.path.isfile(catalogPath):
168
+ with open(catalogPath) as file:
169
+ catalog_dict = json.load(file)
170
+ catalog = RunbookCatalog(**catalog_dict)
89
171
  except json.JSONDecodeError as e:
90
172
  logging.error(f"Error decoding JSON from {catalogPath}: {e}")
91
173
  except Exception as e:
92
174
  logging.error(
93
175
  f"Unexpected error while loading runbook catalog from {catalogPath}: {e}"
94
176
  )
95
- return None
177
+
178
+ # Append custom catalog files if provided
179
+ if custom_catalog_paths:
180
+ for custom_catalog_path in custom_catalog_paths:
181
+ try:
182
+ custom_catalog_path_str = str(custom_catalog_path)
183
+ if not os.path.isfile(custom_catalog_path_str):
184
+ logging.warning(
185
+ f"Custom catalog file not found: {custom_catalog_path_str}"
186
+ )
187
+ continue
188
+
189
+ with open(custom_catalog_path_str) as file:
190
+ custom_catalog_dict = json.load(file)
191
+ custom_catalog = RunbookCatalog(**custom_catalog_dict)
192
+
193
+ if catalog:
194
+ catalog.catalog.extend(custom_catalog.catalog)
195
+ else:
196
+ catalog = custom_catalog
197
+ except json.JSONDecodeError as e:
198
+ logging.error(f"Error decoding JSON from {custom_catalog_path}: {e}")
199
+ except Exception as e:
200
+ logging.error(
201
+ f"Unexpected error while loading custom catalog from {custom_catalog_path}: {e}"
202
+ )
203
+
204
+ # Append additional runbooks from SupabaseDal if provided
205
+ if dal:
206
+ try:
207
+ supabase_entries = dal.get_runbook_catalog()
208
+ if not supabase_entries:
209
+ return catalog
210
+ if catalog:
211
+ catalog.catalog.extend(supabase_entries)
212
+ else:
213
+ # if failed to load from file, create new catalog from supabase
214
+ catalog = RunbookCatalog(catalog=supabase_entries) # type: ignore
215
+ except Exception as e:
216
+ logging.error(f"Error loading runbooks from Supabase: {e}")
217
+
218
+ return catalog
96
219
 
97
220
 
98
221
  def get_runbook_by_path(
@@ -108,9 +231,14 @@ def get_runbook_by_path(
108
231
  Returns:
109
232
  Full path to the runbook if found, None otherwise
110
233
  """
234
+ # Validate runbook_relative_path is not empty
235
+ if not runbook_relative_path or not runbook_relative_path.strip():
236
+ return None
237
+
111
238
  for search_path in search_paths:
112
239
  runbook_path = os.path.join(search_path, runbook_relative_path)
113
- if os.path.exists(runbook_path):
240
+ # Ensure it's a file, not a directory
241
+ if os.path.isfile(runbook_path):
114
242
  return runbook_path
115
243
 
116
244
  return None
@@ -1,11 +1,13 @@
1
1
  {
2
2
  "catalog": [
3
3
  {
4
+ "id": "dns-troubleshooting.md",
4
5
  "update_date": "2025-06-17",
5
6
  "description": "Runbook to investigate DNS resolution issue in Kubernetes clusters",
6
7
  "link": "networking/dns_troubleshooting_instructions.md"
7
8
  },
8
9
  {
10
+ "id": "upgrade-troubleshooting.md",
9
11
  "update_date": "2025-07-08",
10
12
  "description": "Runbook to troubleshoot upgrade issues in Azure Kubernetes Service clusters",
11
13
  "link": "upgrade/upgrade_troubleshooting_instructions.md"
@@ -1,9 +1,11 @@
1
1
  import logging
2
2
  from typing import List
3
+
4
+ import requests # type: ignore
5
+
6
+ from holmes.core.issue import Issue
3
7
  from holmes.core.tool_calling_llm import LLMResult
4
8
  from holmes.plugins.interfaces import SourcePlugin
5
- from holmes.core.issue import Issue
6
- import requests # type: ignore
7
9
 
8
10
 
9
11
  class GitHubSource(SourcePlugin):
@@ -2,6 +2,7 @@ import html
2
2
  from datetime import datetime, timedelta
3
3
  from typing import Dict, List, Optional, Union
4
4
  from urllib.parse import parse_qs, unquote, urlparse
5
+
5
6
  from pydantic import BaseModel, computed_field
6
7
 
7
8
 
@@ -7,14 +7,19 @@ import yaml # type: ignore
7
7
  from pydantic import ValidationError
8
8
 
9
9
  import holmes.utils.env as env_utils
10
- from holmes.common.env_vars import USE_LEGACY_KUBERNETES_LOGS
10
+ from holmes.common.env_vars import (
11
+ DISABLE_PROMETHEUS_TOOLSET,
12
+ USE_LEGACY_KUBERNETES_LOGS,
13
+ )
11
14
  from holmes.core.supabase_dal import SupabaseDal
12
15
  from holmes.core.tools import Toolset, ToolsetType, ToolsetYamlFromConfig, YAMLToolset
13
16
  from holmes.plugins.toolsets.atlas_mongodb.mongodb_atlas import MongoDBAtlasToolset
14
17
  from holmes.plugins.toolsets.azure_sql.azure_sql_toolset import AzureSQLToolset
15
18
  from holmes.plugins.toolsets.bash.bash_toolset import BashExecutorToolset
16
- from holmes.plugins.toolsets.coralogix.toolset_coralogix_logs import (
17
- CoralogixLogsToolset,
19
+ from holmes.plugins.toolsets.connectivity_check import ConnectivityCheckToolset
20
+ from holmes.plugins.toolsets.coralogix.toolset_coralogix import CoralogixToolset
21
+ from holmes.plugins.toolsets.datadog.toolset_datadog_general import (
22
+ DatadogGeneralToolset,
18
23
  )
19
24
  from holmes.plugins.toolsets.datadog.toolset_datadog_logs import DatadogLogsToolset
20
25
  from holmes.plugins.toolsets.datadog.toolset_datadog_metrics import (
@@ -23,32 +28,31 @@ from holmes.plugins.toolsets.datadog.toolset_datadog_metrics import (
23
28
  from holmes.plugins.toolsets.datadog.toolset_datadog_traces import (
24
29
  DatadogTracesToolset,
25
30
  )
26
- from holmes.plugins.toolsets.datadog.toolset_datadog_rds import (
27
- DatadogRDSToolset,
31
+ from holmes.plugins.toolsets.elasticsearch.elasticsearch import (
32
+ ElasticsearchClusterToolset,
33
+ ElasticsearchDataToolset,
28
34
  )
29
- from holmes.plugins.toolsets.datadog.toolset_datadog_general import (
30
- DatadogGeneralToolset,
35
+ from holmes.plugins.toolsets.elasticsearch.opensearch_query_assist import (
36
+ OpenSearchQueryAssistToolset,
31
37
  )
32
38
  from holmes.plugins.toolsets.git import GitToolset
39
+ from holmes.plugins.toolsets.grafana.loki.toolset_grafana_loki import GrafanaLokiToolset
33
40
  from holmes.plugins.toolsets.grafana.toolset_grafana import GrafanaToolset
34
- from holmes.plugins.toolsets.grafana.toolset_grafana_loki import GrafanaLokiToolset
35
41
  from holmes.plugins.toolsets.grafana.toolset_grafana_tempo import GrafanaTempoToolset
36
42
  from holmes.plugins.toolsets.internet.internet import InternetToolset
37
43
  from holmes.plugins.toolsets.internet.notion import NotionToolset
44
+ from holmes.plugins.toolsets.investigator.core_investigation import (
45
+ CoreInvestigationToolset,
46
+ )
38
47
  from holmes.plugins.toolsets.kafka import KafkaToolset
39
48
  from holmes.plugins.toolsets.kubernetes_logs import KubernetesLogsToolset
40
49
  from holmes.plugins.toolsets.mcp.toolset_mcp import RemoteMCPToolset
41
- from holmes.plugins.toolsets.newrelic import NewRelicToolset
42
- from holmes.plugins.toolsets.opensearch.opensearch import OpenSearchToolset
43
- from holmes.plugins.toolsets.opensearch.opensearch_logs import OpenSearchLogsToolset
44
- from holmes.plugins.toolsets.opensearch.opensearch_traces import OpenSearchTracesToolset
45
- from holmes.plugins.toolsets.prometheus.prometheus import PrometheusToolset
50
+ from holmes.plugins.toolsets.newrelic.newrelic import NewRelicToolset
46
51
  from holmes.plugins.toolsets.rabbitmq.toolset_rabbitmq import RabbitMQToolset
47
52
  from holmes.plugins.toolsets.robusta.robusta import RobustaToolset
48
53
  from holmes.plugins.toolsets.runbook.runbook_fetcher import RunbookToolset
49
- from holmes.plugins.toolsets.servicenow.servicenow import ServiceNowToolset
50
- from holmes.plugins.toolsets.investigator.core_investigation import (
51
- CoreInvestigationToolset,
54
+ from holmes.plugins.toolsets.servicenow_tables.servicenow_tables import (
55
+ ServiceNowTablesToolset,
52
56
  )
53
57
 
54
58
  THIS_DIR = os.path.abspath(os.path.dirname(__file__))
@@ -71,13 +75,16 @@ def load_toolsets_from_file(
71
75
  return toolsets
72
76
 
73
77
 
74
- def load_python_toolsets(dal: Optional[SupabaseDal]) -> List[Toolset]:
78
+ def load_python_toolsets(
79
+ dal: Optional[SupabaseDal],
80
+ additional_search_paths: Optional[List[str]] = None,
81
+ ) -> List[Toolset]:
75
82
  logging.debug("loading python toolsets")
76
83
  toolsets: list[Toolset] = [
77
84
  CoreInvestigationToolset(), # Load first for higher priority
78
85
  InternetToolset(),
86
+ ConnectivityCheckToolset(),
79
87
  RobustaToolset(dal),
80
- OpenSearchToolset(),
81
88
  GrafanaLokiToolset(),
82
89
  GrafanaTempoToolset(),
83
90
  NewRelicToolset(),
@@ -88,26 +95,34 @@ def load_python_toolsets(dal: Optional[SupabaseDal]) -> List[Toolset]:
88
95
  DatadogGeneralToolset(),
89
96
  DatadogMetricsToolset(),
90
97
  DatadogTracesToolset(),
91
- DatadogRDSToolset(),
92
- PrometheusToolset(),
93
- OpenSearchLogsToolset(),
94
- OpenSearchTracesToolset(),
95
- CoralogixLogsToolset(),
98
+ OpenSearchQueryAssistToolset(),
99
+ CoralogixToolset(),
96
100
  RabbitMQToolset(),
97
101
  GitToolset(),
98
102
  BashExecutorToolset(),
99
103
  MongoDBAtlasToolset(),
100
- RunbookToolset(),
104
+ RunbookToolset(dal=dal, additional_search_paths=additional_search_paths),
101
105
  AzureSQLToolset(),
102
- ServiceNowToolset(),
106
+ ServiceNowTablesToolset(),
107
+ ElasticsearchDataToolset(),
108
+ ElasticsearchClusterToolset(),
103
109
  ]
110
+
111
+ if not DISABLE_PROMETHEUS_TOOLSET:
112
+ from holmes.plugins.toolsets.prometheus.prometheus import PrometheusToolset
113
+
114
+ toolsets.append(PrometheusToolset())
115
+
104
116
  if not USE_LEGACY_KUBERNETES_LOGS:
105
117
  toolsets.append(KubernetesLogsToolset())
106
118
 
107
119
  return toolsets
108
120
 
109
121
 
110
- def load_builtin_toolsets(dal: Optional[SupabaseDal] = None) -> List[Toolset]:
122
+ def load_builtin_toolsets(
123
+ dal: Optional[SupabaseDal] = None,
124
+ additional_search_paths: Optional[List[str]] = None,
125
+ ) -> List[Toolset]:
111
126
  all_toolsets: List[Toolset] = []
112
127
  logging.debug(f"loading toolsets from {THIS_DIR}")
113
128
 
@@ -123,7 +138,9 @@ def load_builtin_toolsets(dal: Optional[SupabaseDal] = None) -> List[Toolset]:
123
138
  toolsets_from_file = load_toolsets_from_file(path, strict_check=True)
124
139
  all_toolsets.extend(toolsets_from_file)
125
140
 
126
- all_toolsets.extend(load_python_toolsets(dal=dal)) # type: ignore
141
+ all_toolsets.extend(
142
+ load_python_toolsets(dal=dal, additional_search_paths=additional_search_paths)
143
+ ) # type: ignore
127
144
 
128
145
  # disable built-in toolsets by default, and the user can enable them explicitly in config.
129
146
  for toolset in all_toolsets:
@@ -7,17 +7,49 @@ toolsets:
7
7
  - command: "az account show"
8
8
  - command: "az aks --help"
9
9
  - command: "kubectl version --client"
10
+
11
+ # Note: Tools in this toolset use transformers with llm_summarize
12
+ # to automatically summarize large outputs from Azure CLI and kubectl commands
13
+ # when a fast model is configured, focusing on health issues and troubleshooting.
10
14
  tools:
11
15
  - name: "check_node_status"
12
16
  description: "Checks the status of all nodes in the AKS cluster."
13
17
  user_description: "get the status of all nodes in the AKS cluster"
14
18
  command: |
15
19
  kubectl get nodes
20
+ transformers:
21
+ - name: llm_summarize
22
+ config:
23
+ input_threshold: 800
24
+ prompt: |
25
+ Summarize this node status output focusing on:
26
+ - Any nodes that are NotReady or in error states
27
+ - Node health patterns and issues requiring attention
28
+ - Group healthy nodes together with aggregate counts
29
+ - Highlight nodes with concerning conditions or ages
30
+ - When possible, mention exact node names for follow-up investigation
31
+ - Be concise: aim for ≤ 50% of the original length; avoid repeating defaults/healthy/unchanged details
32
+ - Prefer aggregates and counts; list only outliers and actionable items
33
+ - Keep grep-friendly: include exact field names/values that matter
16
34
  - name: "describe_node"
17
35
  description: "Describes a specific node in the AKS cluster to inspect its conditions."
18
36
  user_description: "describe node {{ NODE_NAME }} in the AKS cluster"
19
37
  command: |
20
38
  kubectl describe node {{ NODE_NAME }}
39
+ transformers:
40
+ - name: llm_summarize
41
+ config:
42
+ input_threshold: 1200
43
+ prompt: |
44
+ Summarize this node description focusing on:
45
+ - Node conditions and health status (Ready, MemoryPressure, DiskPressure, etc.)
46
+ - Resource capacity vs allocatable vs current usage
47
+ - Any taints, labels, or annotations indicating issues
48
+ - Recent events that show problems or state changes
49
+ - System information relevant to troubleshooting
50
+ - When possible, highlight specific condition reasons for investigation
51
+ - Strive for ≤ 50% of the original size; keep results compact and grep-friendly (one line per aggregate)
52
+ - Prioritize aggregates and actionable outliers over comprehensive details
21
53
  - name: "get_node_events"
22
54
  description: "Fetches recent events for a specific node to surface warnings and errors."
23
55
  user_description: "get events for node {{ NODE_NAME }}"
@@ -33,6 +65,20 @@ toolsets:
33
65
  user_description: "review Azure Activity Log for resource group {{ RESOURCE_GROUP_NAME }}"
34
66
  command: |
35
67
  az monitor activity-log list --resource-group {{ RESOURCE_GROUP_NAME }}
68
+ transformers:
69
+ - name: llm_summarize
70
+ config:
71
+ input_threshold: 1500
72
+ prompt: |
73
+ Summarize this Azure Activity Log focusing on:
74
+ - Recent administrative actions or configuration changes
75
+ - Any failed operations or errors that could impact node health
76
+ - Resource scaling, updates, or maintenance activities
77
+ - Network security group, load balancer, or VM-related changes
78
+ - Group similar activities and highlight time patterns
79
+ - When possible, mention specific operation names and correlation IDs
80
+ - Be concise and avoid expansion: target ≤ 50% of input size; prefer counts + outliers over full listings
81
+ - Include grep-ready keys/values; avoid repeating entire objects or unchanged defaults
36
82
  - name: "check_top_resource_consuming_pods"
37
83
  description: "Checks for the top resource-consuming pods on a specific node."
38
84
  user_description: "get the top resource-consuming pods on node {{ NODE_NAME }}"