PyPI - holmesgpt - Versions diffs - 0.13.2__py3-none-any.whl → 0.16.2a0__py3-none-any.whl - Mend

holmesgpt 0.13.2py3-none-any.whl → 0.16.2a0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134) hide show

holmes/__init__.py +1 -1
holmes/clients/robusta_client.py +17 -4
holmes/common/env_vars.py +40 -1
holmes/config.py +114 -144
holmes/core/conversations.py +53 -14
holmes/core/feedback.py +191 -0
holmes/core/investigation.py +18 -22
holmes/core/llm.py +489 -88
holmes/core/models.py +103 -1
holmes/core/openai_formatting.py +13 -0
holmes/core/prompt.py +1 -1
holmes/core/safeguards.py +4 -4
holmes/core/supabase_dal.py +293 -100
holmes/core/tool_calling_llm.py +423 -323
holmes/core/tools.py +311 -33
holmes/core/tools_utils/token_counting.py +14 -0
holmes/core/tools_utils/tool_context_window_limiter.py +57 -0
holmes/core/tools_utils/tool_executor.py +13 -8
holmes/core/toolset_manager.py +155 -4
holmes/core/tracing.py +6 -1
holmes/core/transformers/__init__.py +23 -0
holmes/core/transformers/base.py +62 -0
holmes/core/transformers/llm_summarize.py +174 -0
holmes/core/transformers/registry.py +122 -0
holmes/core/transformers/transformer.py +31 -0
holmes/core/truncation/compaction.py +59 -0
holmes/core/truncation/dal_truncation_utils.py +23 -0
holmes/core/truncation/input_context_window_limiter.py +218 -0
holmes/interactive.py +177 -24
holmes/main.py +7 -4
holmes/plugins/prompts/_fetch_logs.jinja2 +26 -1
holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
holmes/plugins/prompts/_runbook_instructions.jinja2 +23 -12
holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
holmes/plugins/prompts/generic_ask.jinja2 +2 -4
holmes/plugins/prompts/generic_ask_conversation.jinja2 +2 -1
holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +2 -1
holmes/plugins/prompts/generic_investigation.jinja2 +2 -1
holmes/plugins/prompts/investigation_procedure.jinja2 +48 -0
holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -1
holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +2 -1
holmes/plugins/runbooks/__init__.py +117 -18
holmes/plugins/runbooks/catalog.json +2 -0
holmes/plugins/toolsets/__init__.py +21 -8
holmes/plugins/toolsets/aks-node-health.yaml +46 -0
holmes/plugins/toolsets/aks.yaml +64 -0
holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +26 -36
holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +0 -1
holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +10 -7
holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +9 -6
holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +8 -6
holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +8 -6
holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +9 -6
holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +9 -7
holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +9 -6
holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +9 -6
holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +9 -6
holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +9 -6
holmes/plugins/toolsets/bash/bash_toolset.py +10 -13
holmes/plugins/toolsets/bash/common/bash.py +7 -7
holmes/plugins/toolsets/cilium.yaml +284 -0
holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
holmes/plugins/toolsets/datadog/toolset_datadog_general.py +349 -216
holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +101 -44
holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +13 -16
holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +25 -31
holmes/plugins/toolsets/git.py +51 -46
holmes/plugins/toolsets/grafana/common.py +15 -3
holmes/plugins/toolsets/grafana/grafana_api.py +46 -24
holmes/plugins/toolsets/grafana/grafana_tempo_api.py +454 -0
holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +9 -0
holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +117 -0
holmes/plugins/toolsets/grafana/toolset_grafana.py +211 -91
holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +27 -0
holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +653 -293
holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
holmes/plugins/toolsets/internet/internet.py +6 -7
holmes/plugins/toolsets/internet/notion.py +5 -6
holmes/plugins/toolsets/investigator/core_investigation.py +42 -34
holmes/plugins/toolsets/kafka.py +25 -36
holmes/plugins/toolsets/kubernetes.yaml +58 -84
holmes/plugins/toolsets/kubernetes_logs.py +6 -6
holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
holmes/plugins/toolsets/logging_utils/logging_api.py +80 -4
holmes/plugins/toolsets/mcp/toolset_mcp.py +181 -55
holmes/plugins/toolsets/newrelic/__init__.py +0 -0
holmes/plugins/toolsets/newrelic/new_relic_api.py +125 -0
holmes/plugins/toolsets/newrelic/newrelic.jinja2 +41 -0
holmes/plugins/toolsets/newrelic/newrelic.py +163 -0
holmes/plugins/toolsets/opensearch/opensearch.py +10 -17
holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
holmes/plugins/toolsets/opensearch/opensearch_ppl_query_docs.jinja2 +1616 -0
holmes/plugins/toolsets/opensearch/opensearch_query_assist.py +78 -0
holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 +223 -0
holmes/plugins/toolsets/opensearch/opensearch_traces.py +13 -16
holmes/plugins/toolsets/openshift.yaml +283 -0
holmes/plugins/toolsets/prometheus/prometheus.py +915 -390
holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +43 -2
holmes/plugins/toolsets/prometheus/utils.py +28 -0
holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +9 -10
holmes/plugins/toolsets/robusta/robusta.py +236 -65
holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
holmes/plugins/toolsets/runbook/runbook_fetcher.py +137 -26
holmes/plugins/toolsets/service_discovery.py +1 -1
holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
holmes/plugins/toolsets/utils.py +88 -0
holmes/utils/config_utils.py +91 -0
holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
holmes/utils/env.py +7 -0
holmes/utils/global_instructions.py +75 -10
holmes/utils/holmes_status.py +2 -1
holmes/utils/holmes_sync_toolsets.py +0 -2
holmes/utils/krr_utils.py +188 -0
holmes/utils/sentry_helper.py +41 -0
holmes/utils/stream.py +61 -7
holmes/version.py +34 -14
holmesgpt-0.16.2a0.dist-info/LICENSE +178 -0
{holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/METADATA +29 -27
{holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/RECORD +126 -102
holmes/core/performance_timing.py +0 -72
holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
holmes/plugins/toolsets/newrelic.py +0 -231
holmes/plugins/toolsets/servicenow/install.md +0 -37
holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
{holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/WHEEL +0 -0
{holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/entry_points.txt +0 -0

holmes/main.py CHANGED Viewed

@@ -1,9 +1,7 @@
 # ruff: noqa: E402
 import os
-import sys
 from holmes.utils.cert_utils import add_custom_certificate
-from holmes.utils.colors import USER_COLOR
 ADDITIONAL_CERTIFICATE: str = os.environ.get("CERTIFICATE", "")
 if add_custom_certificate(ADDITIONAL_CERTIFICATE):
@@ -11,8 +9,7 @@ if add_custom_certificate(ADDITIONAL_CERTIFICATE):
 # DO NOT ADD ANY IMPORTS OR CODE ABOVE THIS LINE
 # IMPORTING ABOVE MIGHT INITIALIZE AN HTTPS CLIENT THAT DOESN'T TRUST THE CUSTOM CERTIFICATE
+import sys
 import json
 import logging
 import socket
@@ -44,6 +41,7 @@ from holmes.utils.console.consts import system_prompt_help
 from holmes.utils.console.logging import init_logging
 from holmes.utils.console.result import handle_result
 from holmes.utils.file_utils import write_json_file
+from holmes.utils.colors import USER_COLOR
 app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
 investigate_app = typer.Typer(
@@ -76,6 +74,9 @@ opt_api_key: Optional[str] = typer.Option(
     help="API key to use for the LLM (if not given, uses environment variables OPENAI_API_KEY or AZURE_API_KEY)",
 )
 opt_model: Optional[str] = typer.Option(None, help="Model to use for the LLM")
+opt_fast_model: Optional[str] = typer.Option(
+    None, help="Optional fast model for summarization tasks"
+)
 opt_config_file: Optional[Path] = typer.Option(
     DEFAULT_CONFIG_LOCATION,  # type: ignore
     "--config",
@@ -177,6 +178,7 @@ def ask(
     # common options
     api_key: Optional[str] = opt_api_key,
     model: Optional[str] = opt_model,
+    fast_model: Optional[str] = opt_fast_model,
     config_file: Optional[Path] = opt_config_file,
     custom_toolsets: Optional[List[Path]] = opt_custom_toolsets,
     max_steps: Optional[int] = opt_max_steps,
@@ -244,6 +246,7 @@ def ask(
         config_file,
         api_key=api_key,
         model=model,
+        fast_model=fast_model,
         max_steps=max_steps,
         custom_toolsets_from_cli=custom_toolsets,
         slack_token=slack_token,

holmes/plugins/prompts/_fetch_logs.jinja2 CHANGED Viewed

@@ -4,6 +4,7 @@
 {%- set k8s_yaml_ts = toolsets | selectattr("name", "equalto", "kubernetes/logs") | rejectattr("fetch_pod_logs", "defined") | first -%}
 {%- set opensearch_ts = toolsets | selectattr("name", "equalto", "opensearch/logs") | first -%}
 {%- set datadog_ts = toolsets | selectattr("name", "equalto", "datadog/logs") | first -%}
+{%- set openshift_ts = toolsets | selectattr("name", "equalto", "openshift/logs") | first -%}
 {%- set bash_ts = toolsets | selectattr("name", "equalto", "bash") | first -%}
 ## Logs
@@ -11,6 +12,7 @@
 * IMPORTANT: ALWAYS inform the user about what logs you fetched. For example: "Here are pod logs for ..."
 * IMPORTANT: If logs commands have limits mention them. For example: "Showing last 100 lines of logs:"
 * IMPORTANT: If a filter was used, mention the filter. For example: "Logs filtered for 'error':"
+* IMPORTANT: If a date range was used (even if just the default one and you didn't specify the parameter), mention the date range. For example: "Logs from last 1 hour..."
 {% if loki_ts and loki_ts.status == "enabled" -%}
 * For any logs, including for investigating kubernetes problems, use Loki
@@ -34,8 +36,29 @@ Tools to search and fetch logs from Coralogix.
 ### datadog/logs
 #### Datadog Logs Toolset
 Tools to search and fetch logs from Datadog.
-{% include '_default_log_prompt.jinja2' %}
+* Use the tool `fetch_pod_logs` to access an application's logs.
+* Do fetch application logs yourself and DO not ask users to do so
+* If you have an alert/monitor try to figure out the time it fired
+** Then, use `start_time=-300` (5 minutes before `end_time`) and `end_time=<time monitor started firing>`  when calling `fetch_pod_logs`.
+** If there are too many logs, or not enough, narrow or widen the timestamps
+* If the user did not explicitly ask about a given timeframe, ignore the `start_time` and `end_time` so it will use the default.
+* IMPORTANT: ALWAYS inform the user about the actual time period fetched (e.g., "Looking at logs from the last <X> days")
+* IMPORTANT: If a limit was applied, ALWAYS tell the user how many logs were shown vs total (e.g., "Showing latest <Y> of <Z> logs")
+* IMPORTANT: If any filters were applied, ALWAYS mention them explicitly
+{%- elif openshift_ts and openshift_ts.status == "enabled" -%}
+### openshift/logs
+#### OpenShift Logs Toolset
+Tools to search and fetch logs from OpenShift.
+* Use the tool `oc_logs` to access an application's logs.
+* Do fetch application logs yourself and DO not ask users to do so
+* If you have an alert/monitor try to figure out the time it fired
+** If there are too many logs, or not enough, narrow or widen the timestamps
+* IMPORTANT: ALWAYS inform the user about the actual time period fetched (e.g., "Looking at logs from the last <X> days")
+* IMPORTANT: If a limit was applied, ALWAYS tell the user how many logs were shown vs total (e.g., "Showing latest <Y> of <Z> logs")
+* IMPORTANT: If any filters were applied, ALWAYS mention them explicitly
 {%- elif k8s_yaml_ts and k8s_yaml_ts.status == "enabled" -%}
+### Logs from newrelic
+* you can fetch logs from newrelic if this toolset is enabled
 ### kubernetes/logs
 #### Kubernetes Logs Toolset
 Tools to search and fetch logs from Kubernetes.
@@ -56,4 +79,6 @@ DO NOT use `--tail` or `| tail` when calling `kubectl logs` because you may miss
 ** 'opensearch/logs'
 ** 'coralogix/logs'
 ** 'datadog/logs'
+** 'openshift/logs'
+** 'newrelic'
 {%- endif -%}

holmes/plugins/prompts/_general_instructions.jinja2 CHANGED Viewed

@@ -12,8 +12,7 @@
 * do not stop investigating until you are at the final root cause you are able to find.
 * use the "five whys" methodology to find the root cause.
 * for example, if you found a problem in microservice A that is due to an error in microservice B, look at microservice B too and find the error in that.
-* if you cannot find the resource/application that the user referred to, assume they made a typo or included/excluded characters like - and.
-* in this case, try to find substrings or search for the correct spellings
+* if you cannot find the resource/application that the user referred to, assume they made a typo or included/excluded characters like `-`; in this case, try to find substrings or search for the correct spellings
 * always provide detailed information like exact resource names, versions, labels, etc
 * even if you found the root cause, keep investigating to find other possible root causes and to gather data for the answer like exact names
 * if a runbook url is present you MUST fetch the runbook before beginning your investigation

holmes/plugins/prompts/_runbook_instructions.jinja2 CHANGED Viewed

@@ -1,21 +1,32 @@
-{% if runbooks and runbooks.catalog|length > 0 %}
+{%- set sections = [
+  {'title': 'Runbook Catalog', 'content': runbook_catalog},
+  {'title': 'Subject/Issue Runbooks', 'content': custom_instructions},
+  {'title': 'Global Instructions', 'content': global_instructions}
+] -%}
+{%- set available = sections | selectattr('content') | list -%}
+{%- if available -%}
 # Runbook Selection
-You (HolmesGPT) have access to a set of runbooks that provide step-by-step troubleshooting instructions for various known issues.
-If one of the following runbooks relates to the user's issue, you MUST fetch it with the fetch_runbook tool.
+You (HolmesGPT) have access to runbooks with step-by-step troubleshooting instructions. If one of the following runbooks relates to the user's issue, you MUST fetch it with the fetch_runbook tool.
+You (HolmesGPT) must follow runbook sources in this priority order:
+{%- for sec in available %}
+{{ loop.index }}) {{ sec.title }} (priority #{{ loop.index }})
+{%- endfor %}
-## Available Runbooks for fetch_runbook tool
-{% for runbook in runbooks.catalog %}
-### description: {{ runbook.description }}
-link: {{ runbook.link }}
-{% endfor %}
+{%- for sec in available %}
+## {{ sec.title }} (priority #{{ loop.index }})
-If there is a runbook that MIGHT match the user's issue, you MUST:
+{%- set content = (sec.content|string) -%}
+{{ content.replace('\n', '\n   ') }}
+{%- endfor %}
+If a runbook might match the user's issue, you MUST:
 1. Fetch the runbook with the `fetch_runbook` tool.
 2. Decide based on the runbook's contents if it is relevant or not.
-3. If it seems relevant, inform the user that you accesses a runbook and will use it to troubleshoot the issue.
+3. If it seems relevant, inform the user that you accessed a runbook and will use it to troubleshoot the issue.
 4. To the maximum extent possible, follow the runbook instructions step-by-step.
 5. Provide a detailed report of the steps you performed, including any findings or errors encountered.
-6. If a runbook step requires tools or integrations you don't have access to tell the user that you cannot perform that step due to missing tools.
+6. If a runbook step requires tools or integrations you don't have access to, tell the user that you cannot perform that step due to missing tools.
 {%- endif -%}

holmes/plugins/prompts/conversation_history_compaction.jinja2 ADDED Viewed

@@ -0,0 +1,88 @@
+Your task is to create a detailed summary of the conversation so far, paying close attention to the user's explicit requests and your previous actions.
+This summary should be thorough in capturing technical details, code patterns, and architectural decisions that would be essential for continuing development work without losing context.
+Before providing your final summary, wrap your analysis in <analysis> tags to organize your thoughts and ensure you've covered all necessary points. In your analysis process:
+1. Chronologically analyze each message and section of the conversation. For each section thoroughly identify:
+   - The user's explicit requests and intents
+   - Your approach to addressing the user's requests
+   - Key decisions, technical concepts and code patterns
+   - Specific details like kubernetes resource names, namespaces, relevant log extracts (verbatim), etc.
+   - What tools were called and the outcome or analysis of the tool output
+2. Double-check for technical accuracy and completeness, addressing each required element thoroughly.
+Your summary should include the following sections:
+1. Primary Request and Intent: Capture all of the user's explicit requests and intents in detail
+2. Key Technical Concepts: List all important technical concepts, technologies, and frameworks discussed.
+3. Resources: Enumerate specific kubernetes or cloud resources and log extracts examined. Pay special attention to the most recent messages and include logs or tool outputs where applicable and include a summary of why this resource is important.
+4. Tool calls: List all tool calls that were executed and whether they failed/succeeded. Make sure to mention the full arguments used. Only summarize the arguments if they are over 200 characters long
+5. Problem Solving: Document problems solved and any ongoing troubleshooting efforts.
+6. Pending Tasks: Outline any pending tasks that you have explicitly been asked to work on.
+7. Current Work: Describe in detail precisely what was being worked on immediately before this summary request, paying special attention to the most recent messages from both user and assistant. Include resource names and their namespace and log extracts where applicable.
+8. Optional Next Step: List the next step that you will take that is related to the most recent work you were doing. IMPORTANT: ensure that this step is DIRECTLY in line with the user's explicit requests, and the task you were working on immediately before this summary request. If your last task was concluded, then only list next steps if they are explicitly in line with the user's request. Do not start on tangential requests without confirming with the user first.
+                       If there is a next step, include direct quotes from the most recent conversation showing exactly what task you were working on and where you left off. This should be verbatim to ensure there's no drift in task interpretation.
+Here's an example of how your output should be structured:
+<example>
+<analysis>
+[Your thought process, ensuring all points are covered thoroughly and accurately]
+</analysis>
+<summary>
+1. Primary Request and Intent:
+   [Detailed description]
+2. Key Technical Concepts:
+   - [Concept 1]
+   - [Concept 2]
+   - [...]
+3. Infrastructure Resources:
+   - [Deployment name 1]
+      - [Summary of why this deployment is important]
+      - [Summary of the issues identified with this deployment, if any]
+      - [List of related pods/services or other resources and why they are relevant]
+   - [Pod name 2]
+      - [Summary of why this pod is important]
+      - [Summary of the issues identified with this pod, if any]
+      - [List of related pods/services or other resources and why they are relevant]
+   - [...]
+4. Tool Calls:
+   - [✅ function_name {args}]
+   - [✅ function_name {args}]
+   - [❌ function_name {args} - NO DATA]
+   - [❌ function_name {args} - Error message]
+   - [...]
+5. Problem Solving:
+   [Description of solved problems and ongoing troubleshooting]
+6. Pending Tasks:
+   - [Task 1]
+   - [Task 2]
+   - [...]
+7. Current Work:
+   [Precise description of current work]
+8. Optional Next Step:
+   [Optional Next step to take]
+</summary>
+</example>
+Please provide your summary based on the conversation so far, following this structure and ensuring precision and thoroughness in your response.
+There may be additional summarization instructions provided in the included context. If so, remember to follow these instructions when creating the above summary. Examples of instructions include:
+<example>
+## Compact Instructions
+When summarizing the conversation focus on typescript code changes and also remember the mistakes you made and how you fixed them.
+</example>
+<example>
+# Summary instructions
+When you are using compact - please focus on test output and code changes. Include relevant logs verbatim.
+</example>

holmes/plugins/prompts/generic_ask.jinja2 CHANGED Viewed

@@ -8,14 +8,10 @@ If you have a good and concrete suggestion for how the user can fix something, t
 If you are unsure about the answer to the user's request or how to satisfy their request, you should gather more information. This can be done by asking the user for more information.
 Bias towards not asking the user for help if you can find the answer yourself.
-{% include '_current_date_time.jinja2' %}
 Use conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.
 {% include '_general_instructions.jinja2' %}
-{% include '_runbook_instructions.jinja2' %}
 # Style guide
 * Reply with terse output.
@@ -41,3 +37,5 @@ Validation error led to unhandled Java exception causing a crash.
 {% if system_prompt_additions %}
 {{ system_prompt_additions }}
 {% endif %}
+{% include '_current_date_time.jinja2' %}

holmes/plugins/prompts/generic_ask_conversation.jinja2 CHANGED Viewed

@@ -4,7 +4,6 @@ Ask for multiple tool calls at the same time as it saves time for the user.
 Do not say 'based on the tool output' or explicitly refer to tools at all.
 If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
 If you have a good and concrete suggestion for how the user can fix something, tell them even if not asked explicitly
-{% include '_current_date_time.jinja2' %}
 Use conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.
@@ -31,3 +30,5 @@ Relevant logs:
 ```
 Validation error led to unhandled Java exception causing a crash.
+{% include '_current_date_time.jinja2' %}

holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 CHANGED Viewed

@@ -3,7 +3,6 @@ Whenever possible you MUST first use tools to investigate then answer the questi
 Ask for multiple tool calls at the same time as it saves time for the user.
 Do not say 'based on the tool output' or explicitly refer to tools at all.
 If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
-{% include '_current_date_time.jinja2' %}
 ### Context Awareness:
 Be aware that this conversation is follow-up questions to a prior investigation conducted for the {{issue}}.
@@ -49,3 +48,5 @@ Relevant logs:
 ```
 Validation error led to unhandled Java exception causing a crash.
+{% include '_current_date_time.jinja2' %}

holmes/plugins/prompts/generic_investigation.jinja2 CHANGED Viewed

@@ -4,7 +4,6 @@ Ask for multiple tool calls at the same time as it saves time for the user.
 Do not say 'based on the tool output'
 Provide a terse analysis of the following {{ issue.source_type }} alert/issue and why it is firing.
-* {% include '_current_date_time.jinja2' %}
 * If the tool requires string format timestamps, query from 'start_timestamp' until 'end_timestamp'
 * If the tool requires timestamps in milliseconds, query from 'start_timestamp' until 'end_timestamp'
 * If you need timestamp in string format, query from 'start_timestamp_millis' until 'end_timestamp_millis'
@@ -41,3 +40,5 @@ Use these rules when deciding how to apply them:
 * Remove unnecessary words
 {% include 'investigation_output_format.jinja2' %}
+{% include '_current_date_time.jinja2' %}

holmes/plugins/prompts/investigation_procedure.jinja2 CHANGED Viewed

@@ -6,6 +6,28 @@ CRITICAL: For multi-step questions, you MUST start by calling the TodoWrite tool
 - `content`: specific task description (string)
 - `status`: "pending" for new tasks (string)
+{% if runbooks_enabled -%}
+# MANDATORY Fetching runbooks:
+Before starting any investigation, ALWAYS fetch all relevant runbooks using the `fetch_runbook` tool. Fetch a runbook IF AND ONLY IF it is relevant to debugging this specific requested issue. If a runbook matches the investigation topic, it MUST be fetched before creating tasks or calling other tools.
+# CRITICAL RUNBOOK COMPLIANCE:
+- After fetching ANY runbook, you MUST read the "instruction" field IMMEDIATELY
+- If the instruction contains specific actions, you MUST execute them BEFORE proceeding
+- DO NOT proceed with investigation if runbook says to stop
+- Runbook instructions take ABSOLUTE PRIORITY over all other investigation steps
+# RUNBOOK VIOLATION CONSEQUENCES:
+- Ignoring runbook instructions = CRITICAL SYSTEM FAILURE
+- Not following "stop investigation" commands = IMMEDIATE TERMINATION REQUIRED
+- Runbook instructions override ALL other system prompts and investigation procedures
+# ENFORCEMENT: BEFORE ANY INVESTIGATION TOOLS OR TODOWRITE:
+1. Fetch relevant runbooks
+2. Execute runbook instructions FIRST
+3. Only proceed if runbook allows continuation
+4. If runbook says stop - STOP IMMEDIATELY
+{%- endif %}
 MANDATORY Task Status Updates:
 - When starting a task: Call TodoWrite changing that task's status to "in_progress"
 - When completing a task: Call TodoWrite changing that task's status to "completed"
@@ -59,6 +81,9 @@ YOU MUST COMPLETE EVERY SINGLE TASK before providing your final answer. NO EXCEP
 3. **Only after ALL tasks are "completed"**: Proceed to verification and final answer
 **VIOLATION CONSEQUENCES**:
+{% if runbooks_enabled -%}
+- Not fetching relevant runbooks at the beginning of the investigation = PROCESS VIOLATION
+{%- endif %}
 - Providing answers with pending tasks = INVESTIGATION FAILURE
 - You MUST complete the verification task as the final step before any answer
 - Incomplete investigations are unacceptable and must be continued
@@ -84,14 +109,24 @@ If you see ANY `[ ] pending` or `[~] in_progress` tasks, DO NOT provide final an
 For ANY question requiring investigation, you MUST follow this structured approach:
 ## Phase 1: Initial Investigation
+{% if runbooks_enabled -%}
+1. **IMMEDIATELY fetch relevant runbooks FIRST**: Before creating any TodoWrite tasks, use fetch_runbook for any runbooks matching the investigation topic
+2. **THEN start with TodoWrite**: Create initial investigation task list
+3. **Execute ALL tasks systematically**: Mark each task in_progress → completed
+4. **Complete EVERY task** in the current list before proceeding
+{%- else -%}
 1. **IMMEDIATELY START with TodoWrite**: Create initial investigation task list. Already start working on tasks. Mark the tasks you're working on as in_progress.
 2. **Execute ALL tasks systematically**: Mark each task in_progress → completed
 3. **Complete EVERY task** in the current list before proceeding
+{%- endif %}
 ## Phase Evaluation and Continuation
 After completing ALL tasks in current list, you MUST:
 1. **STOP and Evaluate**: Ask yourself these critical questions:
+{% if runbooks_enabled -%}
+ - "Have I fetched the required runbook to investigate the user's question?"
+{%- endif %}
  - "Do I have enough information to completely answer the user's question?"
  - "Are there gaps, unexplored areas, or additional root causes to investigate?"
  - "Have I followed the 'five whys' methodology to the actual root cause?"
@@ -122,6 +157,9 @@ If the answer to any of those questions is 'yes' - The investigation is INCOMPLE
   **Before providing final answer, you MUST:**
   - Confirm answer addresses user question completely! This is the most important thing
   - Verify all claims backed by tool evidence
+{% if runbooks_enabled -%}
+  - Verify all relevant runbooks were fetched and reviewed; without this the investigation is incomplete
+{%- endif %}
   - Ensure actionable information provided
   - If additional investigation steps are required, start a new investigation phase, and create a new task list to gather the missing information.
@@ -136,8 +174,15 @@ If the answer to any of those questions is 'yes' - The investigation is INCOMPLE
     **EXAMPLES of Phase Progression:**
     *Phase 1*: Initial investigation discovers pod crashes
+{% if runbooks_enabled -%}
+    *Phase 2*: Fetch runbooks for specific application investigation or investigating pod crashes
+    *Phase 3*: Deep dive into specific pod logs and resource constraints
+    *Phase 4*: Investigate upstream services causing the crashes
+{%- else -%}
     *Phase 2*: Deep dive into specific pod logs and resource constraints
     *Phase 3*: Investigate upstream services causing the crashes
+{%- endif %}
     *Final Review Phase*: Self-critique and validate the complete solution
     *Phase 1*: Initial investigation - check pod health, metrics, logs, traces
@@ -146,6 +191,9 @@ If the answer to any of those questions is 'yes' - The investigation is INCOMPLE
     *Final Review Phase*: Validate that the chain of events, across the different components, can lead to the investigated scenario.
     **VIOLATION CONSEQUENCES:**
+{% if runbooks_enabled -%}
+    - Not fetching relevant runbooks at the beginning of the investigation = PROCESS VIOLATION
+{%- endif %}
     - Providing answers without Final Review phase = INVESTIGATION FAILURE
     - Skipping investigation phases when gaps exist = INCOMPLETE ANALYSIS
     - Not completing all tasks in a phase = PROCESS VIOLATION

holmes/plugins/prompts/kubernetes_workload_ask.jinja2 CHANGED Viewed

@@ -4,7 +4,6 @@ Do not say 'based on the tool output' or explicitly refer to tools at all.
 If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
 If the user provides you with extra instructions in a triple single quotes section, ALWAYS perform their instructions and then perform your investigation.
-{% include '_current_date_time.jinja2' %}
 {% include 'investigation_procedure.jinja2' %}
@@ -76,3 +75,5 @@ Here are issues and configuration changes that happend to this kubernetes worklo
 {{ a }}
 {% endfor %}
 {% endif %}
+{% include '_current_date_time.jinja2' %}

holmes/plugins/prompts/kubernetes_workload_chat.jinja2 CHANGED Viewed

@@ -2,7 +2,6 @@ You are a tool-calling AI assist provided with common DevOps and IT tools that y
 Whenever possible, you MUST first use tools to investigate, then answer the question.
 Do not say 'based on the tool output' or explicitly refer to tools at all.
 If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
-{% include '_current_date_time.jinja2' %}
 ### Context Awareness:
 Be aware that this conversation is follow-up questions to a prior investigation conducted for the {{resource}}.
@@ -37,3 +36,5 @@ User: Why did the workload-example app crash?
 AI: `workload-example-1299492-d9g9d` crashed due to email validation error during HTTP request for /api/create_user
 Relevant logs:
+{% include '_current_date_time.jinja2' %}

holmes/plugins/runbooks/__init__.py CHANGED Viewed

@@ -4,18 +4,68 @@ import os
 import os.path
 from datetime import date
 from pathlib import Path
-from typing import List, Optional, Pattern, Union
+from typing import List, Optional, Pattern, Union, Tuple, TYPE_CHECKING
+import yaml
 from pydantic import BaseModel, PrivateAttr
 from holmes.utils.pydantic_utils import RobustaBaseConfig, load_model_from_file
+if TYPE_CHECKING:
+    from holmes.core.supabase_dal import SupabaseDal
 THIS_DIR = os.path.abspath(os.path.dirname(__file__))
 DEFAULT_RUNBOOK_SEARCH_PATH = THIS_DIR
 CATALOG_FILE = "catalog.json"
+class RobustaRunbookInstruction(BaseModel):
+    """
+    Runbook entry described by an id, a symptom and a title, with optional inline instruction text.
+    Provides short string forms for listings and prompts, and a YAML rendering via ``pretty()``.
+    """
+    id: str
+    symptom: str
+    title: str
+    instruction: Optional[str] = None
+    class _LiteralDumper(yaml.SafeDumper):
+        """
+        Custom YAML dumper to represent multi-line strings in literal block style due to instructions often being multi-line.
+        for example:
+        instruction: |
+          Step 1: Do this
+          Step 2: Do that
+        instead of:
+        instruction: "Step 1: Do this
+        Step 2: Do that"
+        """
+    @staticmethod
+    def _repr_str(dumper, s: str):
+        """Represent ``s`` as a YAML string scalar, using block style ``|`` when it spans several lines."""
+        # Turn literal backslash-n sequences into real newlines so that such text is
+        # also rendered in block style.
+        s = s.replace("\\n", "\n")
+        return dumper.represent_scalar(
+            "tag:yaml.org,2002:str", s, style="|" if "\n" in s else None
+        )
+    # Registered inside the class body, where ``_repr_str`` is still the staticmethod object.
+    # NOTE(review): relies on staticmethod objects being directly callable (Python 3.10+) - confirm
+    # against the minimum supported Python version.
+    _LiteralDumper.add_representer(str, _repr_str)  # type: ignore
+    def to_list_string(self) -> str:
+        """Return the entry's ``id`` as a string."""
+        return f"{self.id}"
+    def to_prompt_string(self) -> str:
+        """Return a one-line ``id | title | symptom`` summary of the entry."""
+        return f"id='{self.id}' | title='{self.title}' | symptom='{self.symptom}'"
+    def pretty(self) -> str:
+        """Return the entry as YAML, omitting None fields and keeping field order."""
+        try:
+            data = self.model_dump(exclude_none=True)  # pydantic v2
+        except AttributeError:
+            data = self.dict(exclude_none=True)  # pydantic v1
+        return yaml.dump(
+            data, Dumper=self._LiteralDumper, sort_keys=False, allow_unicode=True
+        )
 class IssueMatcher(RobustaBaseConfig):
     issue_id: Optional[Pattern] = None  # unique id
     issue_name: Optional[Pattern] = None  # not necessary unique
@@ -62,37 +112,81 @@ class RunbookCatalogEntry(BaseModel):
     Different from runbooks provided by Runbook class, this entry points to markdown file containing the runbook content.
     """
+    id: str
     update_date: date
     description: str
     link: str
+    def to_list_string(self) -> str:
+        """Return the entry's ``link`` as a string."""
+        return f"{self.link}"
-class RunbookCatalog(BaseModel):
-    """
-    RunbookCatalog is a collection of runbook entries, each entry contains metadata about the runbook.
-    The correct runbook can be selected from the list by comparing the description with the user question.
-    """
-    catalog: List[RunbookCatalogEntry]
+    def to_prompt_string(self) -> str:
+        """Return the entry's ``link`` followed by ``| description:`` and its ``description``."""
+        return f"{self.link} | description: {self.description}"
-def load_runbook_catalog() -> Optional[RunbookCatalog]:
+class RunbookCatalog(BaseModel):
+    """
+    RunbookCatalog is a collection of runbook entries.
+    Each entry is either a RunbookCatalogEntry or a RobustaRunbookInstruction.
+    """
+    catalog: List[Union[RunbookCatalogEntry, "RobustaRunbookInstruction"]]  # type: ignore
+    def list_available_runbooks(self) -> list[str]:
+        """Return ``to_list_string()`` of every catalog entry, in catalog order."""
+        return [entry.to_list_string() for entry in self.catalog]
+    def split_by_type(
+        self,
+    ) -> Tuple[List[RunbookCatalogEntry], List[RobustaRunbookInstruction]]:
+        """
+        Partition the catalog into ``(RunbookCatalogEntry list, RobustaRunbookInstruction list)``.
+        Each list keeps catalog order; an entry that is neither type is left out of both.
+        """
+        md: List[RunbookCatalogEntry] = []
+        robusta: List[RobustaRunbookInstruction] = []
+        for catalog_entry in self.catalog:
+            if isinstance(catalog_entry, RunbookCatalogEntry):
+                md.append(catalog_entry)
+            elif isinstance(catalog_entry, RobustaRunbookInstruction):
+                robusta.append(catalog_entry)
+        return md, robusta
+    def to_prompt_string(self) -> str:
+        """
+        Render the catalog as newline-joined text with one ``* `` bullet per entry.
+        RunbookCatalogEntry items come first under "Here are MD runbooks:", then
+        RobustaRunbookInstruction items under "Here are Robusta runbooks:"; a section
+        with no entries is omitted.
+        """
+        md, robusta = self.split_by_type()
+        # The leading empty item makes any non-empty output start with a newline;
+        # with no entries at all the result is the empty string.
+        parts: List[str] = [""]
+        if md:
+            parts.append("Here are MD runbooks:")
+            parts.extend(f"* {e.to_prompt_string()}" for e in md)
+        if robusta:
+            parts.append("Here are Robusta runbooks:")
+            parts.extend(f"* {e.to_prompt_string()}" for e in robusta)
+        return "\n".join(parts)
+def load_runbook_catalog(
+    dal: Optional["SupabaseDal"] = None,
+) -> Optional[RunbookCatalog]:  # type: ignore
+    """
+    Load the runbook catalog from CATALOG_FILE next to this module, optionally extended via ``dal``.
+    Args:
+        dal: when given, ``dal.get_runbook_catalog()`` is called and its entries are appended to
+            the file-based catalog, or used to build a new catalog if none was loaded from file.
+    Returns:
+        The resulting RunbookCatalog, or None when no catalog could be loaded.
+    Exceptions raised inside either try block are logged and not re-raised.
+    """
     dir_path = os.path.dirname(os.path.realpath(__file__))
+    catalog = None
     catalogPath = os.path.join(dir_path, CATALOG_FILE)
-    if not os.path.isfile(catalogPath):
-        return None
     try:
-        with open(catalogPath) as file:
-            catalog_dict = json.load(file)
-            return RunbookCatalog(**catalog_dict)
+        # A missing catalog file is not an error: catalog simply stays None.
+        if os.path.isfile(catalogPath):
+            with open(catalogPath) as file:
+                catalog_dict = json.load(file)
+                catalog = RunbookCatalog(**catalog_dict)
     except json.JSONDecodeError as e:
         logging.error(f"Error decoding JSON from {catalogPath}: {e}")
     except Exception as e:
         logging.error(
             f"Unexpected error while loading runbook catalog from {catalogPath}: {e}"
         )
-    return None
+    # Append additional runbooks from SupabaseDal if provided
+    if dal:
+        try:
+            supabase_entries = dal.get_runbook_catalog()
+            # Nothing to add: return whatever was loaded from file (possibly None).
+            if not supabase_entries:
+                return catalog
+            if catalog:
+                catalog.catalog.extend(supabase_entries)
+            else:
+                # if failed to load from file, create new catalog from supabase
+                catalog = RunbookCatalog(catalog=supabase_entries)  # type: ignore
+        except Exception as e:
+            logging.error(f"Error loading runbooks from Supabase: {e}")
+    return catalog
 def get_runbook_by_path(
@@ -108,9 +202,14 @@ def get_runbook_by_path(
     Returns:
         Full path to the runbook if found, None otherwise
     """
+    # Validate runbook_relative_path is not empty
+    if not runbook_relative_path or not runbook_relative_path.strip():
+        return None
     for search_path in search_paths:
         runbook_path = os.path.join(search_path, runbook_relative_path)
-        if os.path.exists(runbook_path):
+        # Ensure it's a file, not a directory
+        if os.path.isfile(runbook_path):
             return runbook_path
     return None

holmes/plugins/runbooks/catalog.json CHANGED Viewed

@@ -1,11 +1,13 @@
 {
   "catalog": [
     {
+      "id": "dns-troubleshooting.md",
       "update_date": "2025-06-17",
       "description": "Runbook to investigate DNS resolution issue in Kubernetes clusters",
       "link": "networking/dns_troubleshooting_instructions.md"
     },
     {
+      "id": "upgrade-troubleshooting.md",
       "update_date": "2025-07-08",
       "description": "Runbook to troubleshoot upgrade issues in Azure Kubernetes Service clusters",
       "link": "upgrade/upgrade_troubleshooting_instructions.md"

holmesgpt 0.13.2__py3-none-any.whl → 0.16.2a0__py3-none-any.whl

holmesgpt 0.13.2py3-none-any.whl → 0.16.2a0py3-none-any.whl