holmesgpt 0.13.2__py3-none-any.whl → 0.16.2a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- holmes/__init__.py +1 -1
- holmes/clients/robusta_client.py +17 -4
- holmes/common/env_vars.py +40 -1
- holmes/config.py +114 -144
- holmes/core/conversations.py +53 -14
- holmes/core/feedback.py +191 -0
- holmes/core/investigation.py +18 -22
- holmes/core/llm.py +489 -88
- holmes/core/models.py +103 -1
- holmes/core/openai_formatting.py +13 -0
- holmes/core/prompt.py +1 -1
- holmes/core/safeguards.py +4 -4
- holmes/core/supabase_dal.py +293 -100
- holmes/core/tool_calling_llm.py +423 -323
- holmes/core/tools.py +311 -33
- holmes/core/tools_utils/token_counting.py +14 -0
- holmes/core/tools_utils/tool_context_window_limiter.py +57 -0
- holmes/core/tools_utils/tool_executor.py +13 -8
- holmes/core/toolset_manager.py +155 -4
- holmes/core/tracing.py +6 -1
- holmes/core/transformers/__init__.py +23 -0
- holmes/core/transformers/base.py +62 -0
- holmes/core/transformers/llm_summarize.py +174 -0
- holmes/core/transformers/registry.py +122 -0
- holmes/core/transformers/transformer.py +31 -0
- holmes/core/truncation/compaction.py +59 -0
- holmes/core/truncation/dal_truncation_utils.py +23 -0
- holmes/core/truncation/input_context_window_limiter.py +218 -0
- holmes/interactive.py +177 -24
- holmes/main.py +7 -4
- holmes/plugins/prompts/_fetch_logs.jinja2 +26 -1
- holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
- holmes/plugins/prompts/_runbook_instructions.jinja2 +23 -12
- holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
- holmes/plugins/prompts/generic_ask.jinja2 +2 -4
- holmes/plugins/prompts/generic_ask_conversation.jinja2 +2 -1
- holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +2 -1
- holmes/plugins/prompts/generic_investigation.jinja2 +2 -1
- holmes/plugins/prompts/investigation_procedure.jinja2 +48 -0
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -1
- holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +2 -1
- holmes/plugins/runbooks/__init__.py +117 -18
- holmes/plugins/runbooks/catalog.json +2 -0
- holmes/plugins/toolsets/__init__.py +21 -8
- holmes/plugins/toolsets/aks-node-health.yaml +46 -0
- holmes/plugins/toolsets/aks.yaml +64 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +26 -36
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +0 -1
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +10 -7
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +8 -6
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +8 -6
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +9 -7
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +9 -6
- holmes/plugins/toolsets/bash/bash_toolset.py +10 -13
- holmes/plugins/toolsets/bash/common/bash.py +7 -7
- holmes/plugins/toolsets/cilium.yaml +284 -0
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
- holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
- holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +349 -216
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +101 -44
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +13 -16
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +25 -31
- holmes/plugins/toolsets/git.py +51 -46
- holmes/plugins/toolsets/grafana/common.py +15 -3
- holmes/plugins/toolsets/grafana/grafana_api.py +46 -24
- holmes/plugins/toolsets/grafana/grafana_tempo_api.py +454 -0
- holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +9 -0
- holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +117 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +211 -91
- holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +27 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +653 -293
- holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
- holmes/plugins/toolsets/internet/internet.py +6 -7
- holmes/plugins/toolsets/internet/notion.py +5 -6
- holmes/plugins/toolsets/investigator/core_investigation.py +42 -34
- holmes/plugins/toolsets/kafka.py +25 -36
- holmes/plugins/toolsets/kubernetes.yaml +58 -84
- holmes/plugins/toolsets/kubernetes_logs.py +6 -6
- holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
- holmes/plugins/toolsets/logging_utils/logging_api.py +80 -4
- holmes/plugins/toolsets/mcp/toolset_mcp.py +181 -55
- holmes/plugins/toolsets/newrelic/__init__.py +0 -0
- holmes/plugins/toolsets/newrelic/new_relic_api.py +125 -0
- holmes/plugins/toolsets/newrelic/newrelic.jinja2 +41 -0
- holmes/plugins/toolsets/newrelic/newrelic.py +163 -0
- holmes/plugins/toolsets/opensearch/opensearch.py +10 -17
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
- holmes/plugins/toolsets/opensearch/opensearch_ppl_query_docs.jinja2 +1616 -0
- holmes/plugins/toolsets/opensearch/opensearch_query_assist.py +78 -0
- holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 +223 -0
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +13 -16
- holmes/plugins/toolsets/openshift.yaml +283 -0
- holmes/plugins/toolsets/prometheus/prometheus.py +915 -390
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +43 -2
- holmes/plugins/toolsets/prometheus/utils.py +28 -0
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +9 -10
- holmes/plugins/toolsets/robusta/robusta.py +236 -65
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +137 -26
- holmes/plugins/toolsets/service_discovery.py +1 -1
- holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
- holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
- holmes/plugins/toolsets/utils.py +88 -0
- holmes/utils/config_utils.py +91 -0
- holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
- holmes/utils/env.py +7 -0
- holmes/utils/global_instructions.py +75 -10
- holmes/utils/holmes_status.py +2 -1
- holmes/utils/holmes_sync_toolsets.py +0 -2
- holmes/utils/krr_utils.py +188 -0
- holmes/utils/sentry_helper.py +41 -0
- holmes/utils/stream.py +61 -7
- holmes/version.py +34 -14
- holmesgpt-0.16.2a0.dist-info/LICENSE +178 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/METADATA +29 -27
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/RECORD +126 -102
- holmes/core/performance_timing.py +0 -72
- holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
- holmes/plugins/toolsets/newrelic.py +0 -231
- holmes/plugins/toolsets/servicenow/install.md +0 -37
- holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
- holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
- holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/WHEEL +0 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/entry_points.txt +0 -0
holmes/main.py
CHANGED
|
@@ -1,9 +1,7 @@
|
|
|
1
1
|
# ruff: noqa: E402
|
|
2
2
|
import os
|
|
3
|
-
import sys
|
|
4
3
|
|
|
5
4
|
from holmes.utils.cert_utils import add_custom_certificate
|
|
6
|
-
from holmes.utils.colors import USER_COLOR
|
|
7
5
|
|
|
8
6
|
ADDITIONAL_CERTIFICATE: str = os.environ.get("CERTIFICATE", "")
|
|
9
7
|
if add_custom_certificate(ADDITIONAL_CERTIFICATE):
|
|
@@ -11,8 +9,7 @@ if add_custom_certificate(ADDITIONAL_CERTIFICATE):
|
|
|
11
9
|
|
|
12
10
|
# DO NOT ADD ANY IMPORTS OR CODE ABOVE THIS LINE
|
|
13
11
|
# IMPORTING ABOVE MIGHT INITIALIZE AN HTTPS CLIENT THAT DOESN'T TRUST THE CUSTOM CERTIFICATE
|
|
14
|
-
|
|
15
|
-
|
|
12
|
+
import sys
|
|
16
13
|
import json
|
|
17
14
|
import logging
|
|
18
15
|
import socket
|
|
@@ -44,6 +41,7 @@ from holmes.utils.console.consts import system_prompt_help
|
|
|
44
41
|
from holmes.utils.console.logging import init_logging
|
|
45
42
|
from holmes.utils.console.result import handle_result
|
|
46
43
|
from holmes.utils.file_utils import write_json_file
|
|
44
|
+
from holmes.utils.colors import USER_COLOR
|
|
47
45
|
|
|
48
46
|
app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
|
|
49
47
|
investigate_app = typer.Typer(
|
|
@@ -76,6 +74,9 @@ opt_api_key: Optional[str] = typer.Option(
|
|
|
76
74
|
help="API key to use for the LLM (if not given, uses environment variables OPENAI_API_KEY or AZURE_API_KEY)",
|
|
77
75
|
)
|
|
78
76
|
opt_model: Optional[str] = typer.Option(None, help="Model to use for the LLM")
|
|
77
|
+
opt_fast_model: Optional[str] = typer.Option(
|
|
78
|
+
None, help="Optional fast model for summarization tasks"
|
|
79
|
+
)
|
|
79
80
|
opt_config_file: Optional[Path] = typer.Option(
|
|
80
81
|
DEFAULT_CONFIG_LOCATION, # type: ignore
|
|
81
82
|
"--config",
|
|
@@ -177,6 +178,7 @@ def ask(
|
|
|
177
178
|
# common options
|
|
178
179
|
api_key: Optional[str] = opt_api_key,
|
|
179
180
|
model: Optional[str] = opt_model,
|
|
181
|
+
fast_model: Optional[str] = opt_fast_model,
|
|
180
182
|
config_file: Optional[Path] = opt_config_file,
|
|
181
183
|
custom_toolsets: Optional[List[Path]] = opt_custom_toolsets,
|
|
182
184
|
max_steps: Optional[int] = opt_max_steps,
|
|
@@ -244,6 +246,7 @@ def ask(
|
|
|
244
246
|
config_file,
|
|
245
247
|
api_key=api_key,
|
|
246
248
|
model=model,
|
|
249
|
+
fast_model=fast_model,
|
|
247
250
|
max_steps=max_steps,
|
|
248
251
|
custom_toolsets_from_cli=custom_toolsets,
|
|
249
252
|
slack_token=slack_token,
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
{%- set k8s_yaml_ts = toolsets | selectattr("name", "equalto", "kubernetes/logs") | rejectattr("fetch_pod_logs", "defined") | first -%}
|
|
5
5
|
{%- set opensearch_ts = toolsets | selectattr("name", "equalto", "opensearch/logs") | first -%}
|
|
6
6
|
{%- set datadog_ts = toolsets | selectattr("name", "equalto", "datadog/logs") | first -%}
|
|
7
|
+
{%- set openshift_ts = toolsets | selectattr("name", "equalto", "openshift/logs") | first -%}
|
|
7
8
|
{%- set bash_ts = toolsets | selectattr("name", "equalto", "bash") | first -%}
|
|
8
9
|
|
|
9
10
|
## Logs
|
|
@@ -11,6 +12,7 @@
|
|
|
11
12
|
* IMPORTANT: ALWAYS inform the user about what logs you fetched. For example: "Here are pod logs for ..."
|
|
12
13
|
* IMPORTANT: If logs commands have limits mention them. For example: "Showing last 100 lines of logs:"
|
|
13
14
|
* IMPORTANT: If a filter was used, mention the filter. For example: "Logs filtered for 'error':"
|
|
15
|
+
* IMPORTANT: If a date range was used (even if just the default one and you didn't specify the parameter), mention the date range. For example: "Logs from last 1 hour..."
|
|
14
16
|
|
|
15
17
|
{% if loki_ts and loki_ts.status == "enabled" -%}
|
|
16
18
|
* For any logs, including for investigating kubernetes problems, use Loki
|
|
@@ -34,8 +36,29 @@ Tools to search and fetch logs from Coralogix.
|
|
|
34
36
|
### datadog/logs
|
|
35
37
|
#### Datadog Logs Toolset
|
|
36
38
|
Tools to search and fetch logs from Datadog.
|
|
37
|
-
|
|
39
|
+
* Use the tool `fetch_pod_logs` to access an application's logs.
|
|
40
|
+
* Do fetch application logs yourself and DO NOT ask users to do so
|
|
41
|
+
* If you have an alert/monitor try to figure out the time it fired
|
|
42
|
+
** Then, use `start_time=-300` (5 minutes before `end_time`) and `end_time=<time monitor started firing>` when calling `fetch_pod_logs`.
|
|
43
|
+
** If there are too many logs, or not enough, narrow or widen the timestamps
|
|
44
|
+
* If the user did not explicitly ask about a given timeframe, ignore the `start_time` and `end_time` so it will use the default.
|
|
45
|
+
* IMPORTANT: ALWAYS inform the user about the actual time period fetched (e.g., "Looking at logs from the last <X> days")
|
|
46
|
+
* IMPORTANT: If a limit was applied, ALWAYS tell the user how many logs were shown vs total (e.g., "Showing latest <Y> of <Z> logs")
|
|
47
|
+
* IMPORTANT: If any filters were applied, ALWAYS mention them explicitly
|
|
48
|
+
{%- elif openshift_ts and openshift_ts.status == "enabled" -%}
|
|
49
|
+
### openshift/logs
|
|
50
|
+
#### OpenShift Logs Toolset
|
|
51
|
+
Tools to search and fetch logs from OpenShift.
|
|
52
|
+
* Use the tool `oc_logs` to access an application's logs.
|
|
53
|
+
* Do fetch application logs yourself and DO NOT ask users to do so
|
|
54
|
+
* If you have an alert/monitor try to figure out the time it fired
|
|
55
|
+
** If there are too many logs, or not enough, narrow or widen the timestamps
|
|
56
|
+
* IMPORTANT: ALWAYS inform the user about the actual time period fetched (e.g., "Looking at logs from the last <X> days")
|
|
57
|
+
* IMPORTANT: If a limit was applied, ALWAYS tell the user how many logs were shown vs total (e.g., "Showing latest <Y> of <Z> logs")
|
|
58
|
+
* IMPORTANT: If any filters were applied, ALWAYS mention them explicitly
|
|
38
59
|
{%- elif k8s_yaml_ts and k8s_yaml_ts.status == "enabled" -%}
|
|
60
|
+
### Logs from newrelic
|
|
61
|
+
* you can fetch logs from newrelic if this toolset is enabled
|
|
39
62
|
### kubernetes/logs
|
|
40
63
|
#### Kubernetes Logs Toolset
|
|
41
64
|
Tools to search and fetch logs from Kubernetes.
|
|
@@ -56,4 +79,6 @@ DO NOT use `--tail` or `| tail` when calling `kubectl logs` because you may miss
|
|
|
56
79
|
** 'opensearch/logs'
|
|
57
80
|
** 'coralogix/logs'
|
|
58
81
|
** 'datadog/logs'
|
|
82
|
+
** 'openshift/logs'
|
|
83
|
+
** 'newrelic'
|
|
59
84
|
{%- endif -%}
|
|
@@ -12,8 +12,7 @@
|
|
|
12
12
|
* do not stop investigating until you are at the final root cause you are able to find.
|
|
13
13
|
* use the "five whys" methodology to find the root cause.
|
|
14
14
|
* for example, if you found a problem in microservice A that is due to an error in microservice B, look at microservice B too and find the error in that.
|
|
15
|
-
* if you cannot find the resource/application that the user referred to, assume they made a typo or included/excluded characters like - and
|
|
16
|
-
* in this case, try to find substrings or search for the correct spellings
|
|
15
|
+
* if you cannot find the resource/application that the user referred to, assume they made a typo or included/excluded characters like - and _; in this case, try to find substrings or search for the correct spellings
|
|
17
16
|
* always provide detailed information like exact resource names, versions, labels, etc
|
|
18
17
|
* even if you found the root cause, keep investigating to find other possible root causes and to gather data for the answer like exact names
|
|
19
18
|
* if a runbook url is present you MUST fetch the runbook before beginning your investigation
|
|
@@ -1,21 +1,32 @@
|
|
|
1
|
-
{
|
|
1
|
+
{%- set sections = [
|
|
2
|
+
{'title': 'Runbook Catalog', 'content': runbook_catalog},
|
|
3
|
+
{'title': 'Subject/Issue Runbooks', 'content': custom_instructions},
|
|
4
|
+
{'title': 'Global Instructions', 'content': global_instructions}
|
|
5
|
+
] -%}
|
|
6
|
+
{%- set available = sections | selectattr('content') | list -%}
|
|
7
|
+
{%- if available -%}
|
|
2
8
|
# Runbook Selection
|
|
3
9
|
|
|
4
|
-
You (HolmesGPT) have access to
|
|
5
|
-
|
|
10
|
+
You (HolmesGPT) have access to runbooks with step-by-step troubleshooting instructions. If one of the following runbooks relates to the user's issue, you MUST fetch it with the fetch_runbook tool.
|
|
11
|
+
You (HolmesGPT) must follow runbook sources in this priority order:
|
|
12
|
+
{%- for sec in available %}
|
|
13
|
+
{{ loop.index }}) {{ sec.title }} (priority #{{ loop.index }})
|
|
14
|
+
{%- endfor %}
|
|
6
15
|
|
|
7
|
-
|
|
8
|
-
{
|
|
9
|
-
### description: {{ runbook.description }}
|
|
10
|
-
link: {{ runbook.link }}
|
|
11
|
-
{% endfor %}
|
|
16
|
+
{%- for sec in available %}
|
|
17
|
+
## {{ sec.title }} (priority #{{ loop.index }})
|
|
12
18
|
|
|
13
|
-
|
|
19
|
+
{%- set content = (sec.content|string) -%}
|
|
20
|
+
{{ content.replace('\n', '\n ') }}
|
|
21
|
+
|
|
22
|
+
{%- endfor %}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
If a runbook might match the user's issue, you MUST:
|
|
14
26
|
1. Fetch the runbook with the `fetch_runbook` tool.
|
|
15
27
|
2. Decide based on the runbook's contents if it is relevant or not.
|
|
16
|
-
3. If it seems relevant, inform the user that you
|
|
28
|
+
3. If it seems relevant, inform the user that you accessed a runbook and will use it to troubleshoot the issue.
|
|
17
29
|
4. To the maximum extent possible, follow the runbook instructions step-by-step.
|
|
18
30
|
5. Provide a detailed report of the steps you performed, including any findings or errors encountered.
|
|
19
|
-
6. If a runbook step requires tools or integrations you don't have access to tell the user that you cannot perform that step due to missing tools.
|
|
20
|
-
|
|
31
|
+
6. If a runbook step requires tools or integrations you don't have access to, tell the user that you cannot perform that step due to missing tools.
|
|
21
32
|
{%- endif -%}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
Your task is to create a detailed summary of the conversation so far, paying close attention to the user's explicit requests and your previous actions.
|
|
2
|
+
This summary should be thorough in capturing technical details, code patterns, and architectural decisions that would be essential for continuing development work without losing context.
|
|
3
|
+
|
|
4
|
+
Before providing your final summary, wrap your analysis in <analysis> tags to organize your thoughts and ensure you've covered all necessary points. In your analysis process:
|
|
5
|
+
|
|
6
|
+
1. Chronologically analyze each message and section of the conversation. For each section thoroughly identify:
|
|
7
|
+
- The user's explicit requests and intents
|
|
8
|
+
- Your approach to addressing the user's requests
|
|
9
|
+
- Key decisions, technical concepts and code patterns
|
|
10
|
+
- Specific details like kubernetes resource names, namespaces, relevant log extracts (verbatim), etc
|
|
11
|
+
- What tools were called and the outcome or analysis of the tool output
|
|
12
|
+
2. Double-check for technical accuracy and completeness, addressing each required element thoroughly.
|
|
13
|
+
|
|
14
|
+
Your summary should include the following sections:
|
|
15
|
+
|
|
16
|
+
1. Primary Request and Intent: Capture all of the user's explicit requests and intents in detail
|
|
17
|
+
2. Key Technical Concepts: List all important technical concepts, technologies, and frameworks discussed.
|
|
18
|
+
3. Resources: Enumerate specific kubernetes or cloud resources and log extracts examined. Pay special attention to the most recent messages and include logs or tool outputs where applicable and include a summary of why this resource is important.
|
|
19
|
+
4. Tool calls: List all tool calls that were executed and whether they failed/succeeded. Make sure to mention the full arguments used. Only summarize the arguments if they are over 200 characters long
|
|
20
|
+
5. Problem Solving: Document problems solved and any ongoing troubleshooting efforts.
|
|
21
|
+
6. Pending Tasks: Outline any pending tasks that you have explicitly been asked to work on.
|
|
22
|
+
7. Current Work: Describe in detail precisely what was being worked on immediately before this summary request, paying special attention to the most recent messages from both user and assistant. Include resource names and their namespace and log extracts where applicable.
|
|
23
|
+
8. Optional Next Step: List the next step that you will take that is related to the most recent work you were doing. IMPORTANT: ensure that this step is DIRECTLY in line with the user's explicit requests, and the task you were working on immediately before this summary request. If your last task was concluded, then only list next steps if they are explicitly in line with the user's request. Do not start on tangential requests without confirming with the user first.
|
|
24
|
+
If there is a next step, include direct quotes from the most recent conversation showing exactly what task you were working on and where you left off. This should be verbatim to ensure there's no drift in task interpretation.
|
|
25
|
+
|
|
26
|
+
Here's an example of how your output should be structured:
|
|
27
|
+
|
|
28
|
+
<example>
|
|
29
|
+
<analysis>
|
|
30
|
+
[Your thought process, ensuring all points are covered thoroughly and accurately]
|
|
31
|
+
</analysis>
|
|
32
|
+
|
|
33
|
+
<summary>
|
|
34
|
+
1. Primary Request and Intent:
|
|
35
|
+
[Detailed description]
|
|
36
|
+
|
|
37
|
+
2. Key Technical Concepts:
|
|
38
|
+
- [Concept 1]
|
|
39
|
+
- [Concept 2]
|
|
40
|
+
- [...]
|
|
41
|
+
|
|
42
|
+
3. Infrastructure Resources:
|
|
43
|
+
- [Deployment name 1]
|
|
44
|
+
- [Summary of why this deployment is important]
|
|
45
|
+
- [Summary of the issues identified with this deployment, if any]
|
|
46
|
+
- [List of related pods/services or other resources and why they are relevant]
|
|
47
|
+
- [Pod name 2]
|
|
48
|
+
- [Summary of why this pod is important]
|
|
49
|
+
- [Summary of the issues identified with this pod, if any]
|
|
50
|
+
- [List of related pods/services or other resources and why they are relevant]
|
|
51
|
+
- [...]
|
|
52
|
+
|
|
53
|
+
4. Tool Calls:
|
|
54
|
+
- [✅ function_name {args}]
|
|
55
|
+
- [✅ function_name {args}]
|
|
56
|
+
- [❌ function_name {args} - NO DATA]
|
|
57
|
+
- [❌ function_name {args} - Error message]
|
|
58
|
+
- [...]
|
|
59
|
+
|
|
60
|
+
5. Problem Solving:
|
|
61
|
+
[Description of solved problems and ongoing troubleshooting]
|
|
62
|
+
|
|
63
|
+
6. Pending Tasks:
|
|
64
|
+
- [Task 1]
|
|
65
|
+
- [Task 2]
|
|
66
|
+
- [...]
|
|
67
|
+
|
|
68
|
+
7. Current Work:
|
|
69
|
+
[Precise description of current work]
|
|
70
|
+
|
|
71
|
+
8. Optional Next Step:
|
|
72
|
+
[Optional Next step to take]
|
|
73
|
+
|
|
74
|
+
</summary>
|
|
75
|
+
</example>
|
|
76
|
+
|
|
77
|
+
Please provide your summary based on the conversation so far, following this structure and ensuring precision and thoroughness in your response.
|
|
78
|
+
|
|
79
|
+
There may be additional summarization instructions provided in the included context. If so, remember to follow these instructions when creating the above summary. Examples of instructions include:
|
|
80
|
+
<example>
|
|
81
|
+
## Compact Instructions
|
|
82
|
+
When summarizing the conversation focus on typescript code changes and also remember the mistakes you made and how you fixed them.
|
|
83
|
+
</example>
|
|
84
|
+
|
|
85
|
+
<example>
|
|
86
|
+
# Summary instructions
|
|
87
|
+
When you are using compact - please focus on test output and code changes. Include relevant logs verbatim.
|
|
88
|
+
</example>
|
|
@@ -8,14 +8,10 @@ If you have a good and concrete suggestion for how the user can fix something, t
|
|
|
8
8
|
If you are unsure about the answer to the user's request or how to satisfy their request, you should gather more information. This can be done by asking the user for more information.
|
|
9
9
|
Bias towards not asking the user for help if you can find the answer yourself.
|
|
10
10
|
|
|
11
|
-
{% include '_current_date_time.jinja2' %}
|
|
12
|
-
|
|
13
11
|
Use conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.
|
|
14
12
|
|
|
15
13
|
{% include '_general_instructions.jinja2' %}
|
|
16
14
|
|
|
17
|
-
{% include '_runbook_instructions.jinja2' %}
|
|
18
|
-
|
|
19
15
|
# Style guide
|
|
20
16
|
|
|
21
17
|
* Reply with terse output.
|
|
@@ -41,3 +37,5 @@ Validation error led to unhandled Java exception causing a crash.
|
|
|
41
37
|
{% if system_prompt_additions %}
|
|
42
38
|
{{ system_prompt_additions }}
|
|
43
39
|
{% endif %}
|
|
40
|
+
|
|
41
|
+
{% include '_current_date_time.jinja2' %}
|
|
@@ -4,7 +4,6 @@ Ask for multiple tool calls at the same time as it saves time for the user.
|
|
|
4
4
|
Do not say 'based on the tool output' or explicitly refer to tools at all.
|
|
5
5
|
If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
|
|
6
6
|
If you have a good and concrete suggestion for how the user can fix something, tell them even if not asked explicitly
|
|
7
|
-
{% include '_current_date_time.jinja2' %}
|
|
8
7
|
|
|
9
8
|
Use conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.
|
|
10
9
|
|
|
@@ -31,3 +30,5 @@ Relevant logs:
|
|
|
31
30
|
```
|
|
32
31
|
|
|
33
32
|
Validation error led to unhandled Java exception causing a crash.
|
|
33
|
+
|
|
34
|
+
{% include '_current_date_time.jinja2' %}
|
|
@@ -3,7 +3,6 @@ Whenever possible you MUST first use tools to investigate then answer the questi
|
|
|
3
3
|
Ask for multiple tool calls at the same time as it saves time for the user.
|
|
4
4
|
Do not say 'based on the tool output' or explicitly refer to tools at all.
|
|
5
5
|
If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
|
|
6
|
-
{% include '_current_date_time.jinja2' %}
|
|
7
6
|
|
|
8
7
|
### Context Awareness:
|
|
9
8
|
Be aware that this conversation is follow-up questions to a prior investigation conducted for the {{issue}}.
|
|
@@ -49,3 +48,5 @@ Relevant logs:
|
|
|
49
48
|
```
|
|
50
49
|
|
|
51
50
|
Validation error led to unhandled Java exception causing a crash.
|
|
51
|
+
|
|
52
|
+
{% include '_current_date_time.jinja2' %}
|
|
@@ -4,7 +4,6 @@ Ask for multiple tool calls at the same time as it saves time for the user.
|
|
|
4
4
|
Do not say 'based on the tool output'
|
|
5
5
|
|
|
6
6
|
Provide a terse analysis of the following {{ issue.source_type }} alert/issue and why it is firing.
|
|
7
|
-
* {% include '_current_date_time.jinja2' %}
|
|
8
7
|
* If the tool requires string format timestamps, query from 'start_timestamp' until 'end_timestamp'
|
|
9
8
|
* If the tool requires timestamps in milliseconds, query from 'start_timestamp' until 'end_timestamp'
|
|
10
9
|
* If you need timestamp in string format, query from 'start_timestamp_millis' until 'end_timestamp_millis'
|
|
@@ -41,3 +40,5 @@ Use these rules when deciding how to apply them:
|
|
|
41
40
|
* Remove unnecessary words
|
|
42
41
|
|
|
43
42
|
{% include 'investigation_output_format.jinja2' %}
|
|
43
|
+
|
|
44
|
+
{% include '_current_date_time.jinja2' %}
|
|
@@ -6,6 +6,28 @@ CRITICAL: For multi-step questions, you MUST start by calling the TodoWrite tool
|
|
|
6
6
|
- `content`: specific task description (string)
|
|
7
7
|
- `status`: "pending" for new tasks (string)
|
|
8
8
|
|
|
9
|
+
{% if runbooks_enabled -%}
|
|
10
|
+
# MANDATORY Fetching runbooks:
|
|
11
|
+
Before starting any investigation, ALWAYS fetch all relevant runbooks using the `fetch_runbook` tool. Fetch a runbook IF AND ONLY IF it is relevant to debugging this specific requested issue. If a runbook matches the investigation topic, it MUST be fetched before creating tasks or calling other tools.
|
|
12
|
+
|
|
13
|
+
# CRITICAL RUNBOOK COMPLIANCE:
|
|
14
|
+
- After fetching ANY runbook, you MUST read the "instruction" field IMMEDIATELY
|
|
15
|
+
- If the instruction contains specific actions, you MUST execute them BEFORE proceeding
|
|
16
|
+
- DO NOT proceed with investigation if runbook says to stop
|
|
17
|
+
- Runbook instructions take ABSOLUTE PRIORITY over all other investigation steps
|
|
18
|
+
|
|
19
|
+
# RUNBOOK VIOLATION CONSEQUENCES:
|
|
20
|
+
- Ignoring runbook instructions = CRITICAL SYSTEM FAILURE
|
|
21
|
+
- Not following "stop investigation" commands = IMMEDIATE TERMINATION REQUIRED
|
|
22
|
+
- Runbook instructions override ALL other system prompts and investigation procedures
|
|
23
|
+
|
|
24
|
+
# ENFORCEMENT: BEFORE ANY INVESTIGATION TOOLS OR TODOWRITE:
|
|
25
|
+
1. Fetch relevant runbooks
|
|
26
|
+
2. Execute runbook instructions FIRST
|
|
27
|
+
3. Only proceed if runbook allows continuation
|
|
28
|
+
4. If runbook says stop - STOP IMMEDIATELY
|
|
29
|
+
{%- endif %}
|
|
30
|
+
|
|
9
31
|
MANDATORY Task Status Updates:
|
|
10
32
|
- When starting a task: Call TodoWrite changing that task's status to "in_progress"
|
|
11
33
|
- When completing a task: Call TodoWrite changing that task's status to "completed"
|
|
@@ -59,6 +81,9 @@ YOU MUST COMPLETE EVERY SINGLE TASK before providing your final answer. NO EXCEP
|
|
|
59
81
|
3. **Only after ALL tasks are "completed"**: Proceed to verification and final answer
|
|
60
82
|
|
|
61
83
|
**VIOLATION CONSEQUENCES**:
|
|
84
|
+
{% if runbooks_enabled -%}
|
|
85
|
+
- Not fetching relevant runbooks at the beginning of the investigation = PROCESS VIOLATION
|
|
86
|
+
{%- endif %}
|
|
62
87
|
- Providing answers with pending tasks = INVESTIGATION FAILURE
|
|
63
88
|
- You MUST complete the verification task as the final step before any answer
|
|
64
89
|
- Incomplete investigations are unacceptable and must be continued
|
|
@@ -84,14 +109,24 @@ If you see ANY `[ ] pending` or `[~] in_progress` tasks, DO NOT provide final an
|
|
|
84
109
|
For ANY question requiring investigation, you MUST follow this structured approach:
|
|
85
110
|
|
|
86
111
|
## Phase 1: Initial Investigation
|
|
112
|
+
{% if runbooks_enabled -%}
|
|
113
|
+
1. **IMMEDIATELY fetch relevant runbooks FIRST**: Before creating any TodoWrite tasks, use fetch_runbook for any runbooks matching the investigation topic
|
|
114
|
+
2. **THEN start with TodoWrite**: Create initial investigation task list
|
|
115
|
+
3. **Execute ALL tasks systematically**: Mark each task in_progress → completed
|
|
116
|
+
4. **Complete EVERY task** in the current list before proceeding
|
|
117
|
+
{%- else -%}
|
|
87
118
|
1. **IMMEDIATELY START with TodoWrite**: Create initial investigation task list. Already start working on tasks. Mark the tasks you're working on as in_progress.
|
|
88
119
|
2. **Execute ALL tasks systematically**: Mark each task in_progress → completed
|
|
89
120
|
3. **Complete EVERY task** in the current list before proceeding
|
|
121
|
+
{%- endif %}
|
|
90
122
|
|
|
91
123
|
## Phase Evaluation and Continuation
|
|
92
124
|
After completing ALL tasks in current list, you MUST:
|
|
93
125
|
|
|
94
126
|
1. **STOP and Evaluate**: Ask yourself these critical questions:
|
|
127
|
+
{% if runbooks_enabled -%}
|
|
128
|
+
- "Have I fetched the required runbook to investigate the user's question?"
|
|
129
|
+
{%- endif %}
|
|
95
130
|
- "Do I have enough information to completely answer the user's question?"
|
|
96
131
|
- "Are there gaps, unexplored areas, or additional root causes to investigate?"
|
|
97
132
|
- "Have I followed the 'five whys' methodology to the actual root cause?"
|
|
@@ -122,6 +157,9 @@ If the answer to any of those questions is 'yes' - The investigation is INCOMPLE
|
|
|
122
157
|
**Before providing final answer, you MUST:**
|
|
123
158
|
- Confirm answer addresses user question completely! This is the most important thing
|
|
124
159
|
- Verify all claims backed by tool evidence
|
|
160
|
+
{% if runbooks_enabled -%}
|
|
161
|
+
- Verify all relevant runbooks were fetched and reviewed; without this the investigation is incomplete
|
|
162
|
+
{%- endif %}
|
|
125
163
|
- Ensure actionable information provided
|
|
126
164
|
- If additional investigation steps are required, start a new investigation phase, and create a new task list to gather the missing information.
|
|
127
165
|
|
|
@@ -136,8 +174,15 @@ If the answer to any of those questions is 'yes' - The investigation is INCOMPLE
|
|
|
136
174
|
**EXAMPLES of Phase Progression:**
|
|
137
175
|
|
|
138
176
|
*Phase 1*: Initial investigation discovers pod crashes
|
|
177
|
+
{% if runbooks_enabled -%}
|
|
178
|
+
*Phase 2*: Fetch runbooks for specific application investigation or investigating pod crashes
|
|
179
|
+
*Phase 3*: Deep dive into specific pod logs and resource constraints
|
|
180
|
+
*Phase 4*: Investigate upstream services causing the crashes
|
|
181
|
+
{%- else -%}
|
|
139
182
|
*Phase 2*: Deep dive into specific pod logs and resource constraints
|
|
140
183
|
*Phase 3*: Investigate upstream services causing the crashes
|
|
184
|
+
{%- endif %}
|
|
185
|
+
|
|
141
186
|
*Final Review Phase*: Self-critique and validate the complete solution
|
|
142
187
|
|
|
143
188
|
*Phase 1*: Initial investigation - check pod health, metrics, logs, traces
|
|
@@ -146,6 +191,9 @@ If the answer to any of those questions is 'yes' - The investigation is INCOMPLE
|
|
|
146
191
|
*Final Review Phase*: Validate that the chain of events, across the different components, can lead to the investigated scenario.
|
|
147
192
|
|
|
148
193
|
**VIOLATION CONSEQUENCES:**
|
|
194
|
+
{% if runbooks_enabled -%}
|
|
195
|
+
- Not fetching relevant runbooks at the beginning of the investigation = PROCESS VIOLATION
|
|
196
|
+
{%- endif %}
|
|
149
197
|
- Providing answers without Final Review phase = INVESTIGATION FAILURE
|
|
150
198
|
- Skipping investigation phases when gaps exist = INCOMPLETE ANALYSIS
|
|
151
199
|
- Not completing all tasks in a phase = PROCESS VIOLATION
|
|
@@ -4,7 +4,6 @@ Do not say 'based on the tool output' or explicitly refer to tools at all.
|
|
|
4
4
|
If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
|
|
5
5
|
|
|
6
6
|
If the user provides you with extra instructions in a triple single quotes section, ALWAYS perform their instructions and then perform your investigation.
|
|
7
|
-
{% include '_current_date_time.jinja2' %}
|
|
8
7
|
|
|
9
8
|
{% include 'investigation_procedure.jinja2' %}
|
|
10
9
|
|
|
@@ -76,3 +75,5 @@ Here are issues and configuration changes that happened to this kubernetes worklo
|
|
|
76
75
|
{{ a }}
|
|
77
76
|
{% endfor %}
|
|
78
77
|
{% endif %}
|
|
78
|
+
|
|
79
|
+
{% include '_current_date_time.jinja2' %}
|
|
@@ -2,7 +2,6 @@ You are a tool-calling AI assist provided with common DevOps and IT tools that y
|
|
|
2
2
|
Whenever possible, you MUST first use tools to investigate, then answer the question.
|
|
3
3
|
Do not say 'based on the tool output' or explicitly refer to tools at all.
|
|
4
4
|
If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
|
|
5
|
-
{% include '_current_date_time.jinja2' %}
|
|
6
5
|
|
|
7
6
|
### Context Awareness:
|
|
8
7
|
Be aware that this conversation is follow-up questions to a prior investigation conducted for the {{resource}}.
|
|
@@ -37,3 +36,5 @@ User: Why did the workload-example app crash?
|
|
|
37
36
|
|
|
38
37
|
AI: `workload-example-1299492-d9g9d` crashed due to email validation error during HTTP request for /api/create_user
|
|
39
38
|
Relevant logs:
|
|
39
|
+
|
|
40
|
+
{% include '_current_date_time.jinja2' %}
|
|
@@ -4,18 +4,68 @@ import os
|
|
|
4
4
|
import os.path
|
|
5
5
|
from datetime import date
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import List, Optional, Pattern, Union
|
|
8
|
-
|
|
7
|
+
from typing import List, Optional, Pattern, Union, Tuple, TYPE_CHECKING
|
|
8
|
+
import yaml
|
|
9
9
|
from pydantic import BaseModel, PrivateAttr
|
|
10
10
|
|
|
11
11
|
from holmes.utils.pydantic_utils import RobustaBaseConfig, load_model_from_file
|
|
12
12
|
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from holmes.core.supabase_dal import SupabaseDal
|
|
15
|
+
|
|
13
16
|
THIS_DIR = os.path.abspath(os.path.dirname(__file__))
|
|
14
17
|
DEFAULT_RUNBOOK_SEARCH_PATH = THIS_DIR
|
|
15
18
|
|
|
16
19
|
CATALOG_FILE = "catalog.json"
|
|
17
20
|
|
|
18
21
|
|
|
22
|
+
class RobustaRunbookInstruction(BaseModel):
|
|
23
|
+
id: str
|
|
24
|
+
symptom: str
|
|
25
|
+
title: str
|
|
26
|
+
instruction: Optional[str] = None
|
|
27
|
+
|
|
28
|
+
"""
|
|
29
|
+
Custom YAML dumper that represents multi-line strings in literal block style, because instructions are often multi-line.
|
|
30
|
+
for example:
|
|
31
|
+
instructions: |
|
|
32
|
+
Step 1: Do this
|
|
33
|
+
Step 2: Do that
|
|
34
|
+
|
|
35
|
+
instead of:
|
|
36
|
+
instructions: "Step 1: Do this
|
|
37
|
+
Step 2: Do that"
|
|
38
|
+
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
class _LiteralDumper(yaml.SafeDumper):
|
|
42
|
+
pass
|
|
43
|
+
|
|
44
|
+
@staticmethod
|
|
45
|
+
def _repr_str(dumper, s: str):
|
|
46
|
+
s = s.replace("\\n", "\n")
|
|
47
|
+
return dumper.represent_scalar(
|
|
48
|
+
"tag:yaml.org,2002:str", s, style="|" if "\n" in s else None
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
_LiteralDumper.add_representer(str, _repr_str) # type: ignore
|
|
52
|
+
|
|
53
|
+
def to_list_string(self) -> str:
|
|
54
|
+
return f"{self.id}"
|
|
55
|
+
|
|
56
|
+
def to_prompt_string(self) -> str:
|
|
57
|
+
return f"id='{self.id}' | title='{self.title}' | symptom='{self.symptom}'"
|
|
58
|
+
|
|
59
|
+
def pretty(self) -> str:
|
|
60
|
+
try:
|
|
61
|
+
data = self.model_dump(exclude_none=True) # pydantic v2
|
|
62
|
+
except AttributeError:
|
|
63
|
+
data = self.dict(exclude_none=True) # pydantic v1
|
|
64
|
+
return yaml.dump(
|
|
65
|
+
data, Dumper=self._LiteralDumper, sort_keys=False, allow_unicode=True
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
|
|
19
69
|
class IssueMatcher(RobustaBaseConfig):
|
|
20
70
|
issue_id: Optional[Pattern] = None # unique id
|
|
21
71
|
issue_name: Optional[Pattern] = None # not necessarily unique
|
|
@@ -62,37 +112,81 @@ class RunbookCatalogEntry(BaseModel):
|
|
|
62
112
|
Different from runbooks provided by Runbook class, this entry points to markdown file containing the runbook content.
|
|
63
113
|
"""
|
|
64
114
|
|
|
115
|
+
id: str
|
|
65
116
|
update_date: date
|
|
66
117
|
description: str
|
|
67
118
|
link: str
|
|
68
119
|
|
|
120
|
+
def to_list_string(self) -> str:
|
|
121
|
+
return f"{self.link}"
|
|
69
122
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
RunbookCatalog is a collection of runbook entries, each entry contains metadata about the runbook.
|
|
73
|
-
The correct runbook can be selected from the list by comparing the description with the user question.
|
|
74
|
-
"""
|
|
75
|
-
|
|
76
|
-
catalog: List[RunbookCatalogEntry]
|
|
123
|
+
def to_prompt_string(self) -> str:
|
|
124
|
+
return f"{self.link} | description: {self.description}"
|
|
77
125
|
|
|
78
126
|
|
|
79
|
-
|
|
127
|
+
class RunbookCatalog(BaseModel):
|
|
128
|
+
catalog: List[Union[RunbookCatalogEntry, "RobustaRunbookInstruction"]] # type: ignore
|
|
129
|
+
|
|
130
|
+
def list_available_runbooks(self) -> list[str]:
|
|
131
|
+
return [entry.to_list_string() for entry in self.catalog]
|
|
132
|
+
|
|
133
|
+
def split_by_type(
|
|
134
|
+
self,
|
|
135
|
+
) -> Tuple[List[RunbookCatalogEntry], List[RobustaRunbookInstruction]]:
|
|
136
|
+
md: List[RunbookCatalogEntry] = []
|
|
137
|
+
robusta: List[RobustaRunbookInstruction] = [] #
|
|
138
|
+
for catalog_entry in self.catalog:
|
|
139
|
+
if isinstance(catalog_entry, RunbookCatalogEntry):
|
|
140
|
+
md.append(catalog_entry)
|
|
141
|
+
elif isinstance(catalog_entry, RobustaRunbookInstruction):
|
|
142
|
+
robusta.append(catalog_entry)
|
|
143
|
+
return md, robusta
|
|
144
|
+
|
|
145
|
+
def to_prompt_string(self) -> str:
|
|
146
|
+
md, robusta = self.split_by_type()
|
|
147
|
+
parts: List[str] = [""]
|
|
148
|
+
if md:
|
|
149
|
+
parts.append("Here are MD runbooks:")
|
|
150
|
+
parts.extend(f"* {e.to_prompt_string()}" for e in md)
|
|
151
|
+
if robusta:
|
|
152
|
+
parts.append("Here are Robusta runbooks:")
|
|
153
|
+
parts.extend(f"* {e.to_prompt_string()}" for e in robusta)
|
|
154
|
+
return "\n".join(parts)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def load_runbook_catalog(
|
|
158
|
+
dal: Optional["SupabaseDal"] = None,
|
|
159
|
+
) -> Optional[RunbookCatalog]: # type: ignore
|
|
80
160
|
dir_path = os.path.dirname(os.path.realpath(__file__))
|
|
81
|
-
|
|
161
|
+
catalog = None
|
|
82
162
|
catalogPath = os.path.join(dir_path, CATALOG_FILE)
|
|
83
|
-
if not os.path.isfile(catalogPath):
|
|
84
|
-
return None
|
|
85
163
|
try:
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
164
|
+
if os.path.isfile(catalogPath):
|
|
165
|
+
with open(catalogPath) as file:
|
|
166
|
+
catalog_dict = json.load(file)
|
|
167
|
+
catalog = RunbookCatalog(**catalog_dict)
|
|
89
168
|
except json.JSONDecodeError as e:
|
|
90
169
|
logging.error(f"Error decoding JSON from {catalogPath}: {e}")
|
|
91
170
|
except Exception as e:
|
|
92
171
|
logging.error(
|
|
93
172
|
f"Unexpected error while loading runbook catalog from {catalogPath}: {e}"
|
|
94
173
|
)
|
|
95
|
-
|
|
174
|
+
|
|
175
|
+
# Append additional runbooks from SupabaseDal if provided
|
|
176
|
+
if dal:
|
|
177
|
+
try:
|
|
178
|
+
supabase_entries = dal.get_runbook_catalog()
|
|
179
|
+
if not supabase_entries:
|
|
180
|
+
return catalog
|
|
181
|
+
if catalog:
|
|
182
|
+
catalog.catalog.extend(supabase_entries)
|
|
183
|
+
else:
|
|
184
|
+
# if failed to load from file, create new catalog from supabase
|
|
185
|
+
catalog = RunbookCatalog(catalog=supabase_entries) # type: ignore
|
|
186
|
+
except Exception as e:
|
|
187
|
+
logging.error(f"Error loading runbooks from Supabase: {e}")
|
|
188
|
+
|
|
189
|
+
return catalog
|
|
96
190
|
|
|
97
191
|
|
|
98
192
|
def get_runbook_by_path(
|
|
@@ -108,9 +202,14 @@ def get_runbook_by_path(
|
|
|
108
202
|
Returns:
|
|
109
203
|
Full path to the runbook if found, None otherwise
|
|
110
204
|
"""
|
|
205
|
+
# Validate runbook_relative_path is not empty
|
|
206
|
+
if not runbook_relative_path or not runbook_relative_path.strip():
|
|
207
|
+
return None
|
|
208
|
+
|
|
111
209
|
for search_path in search_paths:
|
|
112
210
|
runbook_path = os.path.join(search_path, runbook_relative_path)
|
|
113
|
-
|
|
211
|
+
# Ensure it's a file, not a directory
|
|
212
|
+
if os.path.isfile(runbook_path):
|
|
114
213
|
return runbook_path
|
|
115
214
|
|
|
116
215
|
return None
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
{
|
|
2
2
|
"catalog": [
|
|
3
3
|
{
|
|
4
|
+
"id": "dns-troubleshooting.md",
|
|
4
5
|
"update_date": "2025-06-17",
|
|
5
6
|
"description": "Runbook to investigate DNS resolution issue in Kubernetes clusters",
|
|
6
7
|
"link": "networking/dns_troubleshooting_instructions.md"
|
|
7
8
|
},
|
|
8
9
|
{
|
|
10
|
+
"id": "upgrade-troubleshooting.md",
|
|
9
11
|
"update_date": "2025-07-08",
|
|
10
12
|
"description": "Runbook to troubleshoot upgrade issues in Azure Kubernetes Service clusters",
|
|
11
13
|
"link": "upgrade/upgrade_troubleshooting_instructions.md"
|