holmesgpt 0.11.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of holmesgpt might be problematic.
- holmes/.git_archival.json +7 -0
- holmes/__init__.py +76 -0
- holmes/__init__.py.bak +76 -0
- holmes/clients/robusta_client.py +24 -0
- holmes/common/env_vars.py +47 -0
- holmes/config.py +526 -0
- holmes/core/__init__.py +0 -0
- holmes/core/conversations.py +578 -0
- holmes/core/investigation.py +152 -0
- holmes/core/investigation_structured_output.py +264 -0
- holmes/core/issue.py +54 -0
- holmes/core/llm.py +250 -0
- holmes/core/models.py +157 -0
- holmes/core/openai_formatting.py +51 -0
- holmes/core/performance_timing.py +72 -0
- holmes/core/prompt.py +42 -0
- holmes/core/resource_instruction.py +17 -0
- holmes/core/runbooks.py +26 -0
- holmes/core/safeguards.py +120 -0
- holmes/core/supabase_dal.py +540 -0
- holmes/core/tool_calling_llm.py +798 -0
- holmes/core/tools.py +566 -0
- holmes/core/tools_utils/__init__.py +0 -0
- holmes/core/tools_utils/tool_executor.py +65 -0
- holmes/core/tools_utils/toolset_utils.py +52 -0
- holmes/core/toolset_manager.py +418 -0
- holmes/interactive.py +229 -0
- holmes/main.py +1041 -0
- holmes/plugins/__init__.py +0 -0
- holmes/plugins/destinations/__init__.py +6 -0
- holmes/plugins/destinations/slack/__init__.py +2 -0
- holmes/plugins/destinations/slack/plugin.py +163 -0
- holmes/plugins/interfaces.py +32 -0
- holmes/plugins/prompts/__init__.py +48 -0
- holmes/plugins/prompts/_current_date_time.jinja2 +1 -0
- holmes/plugins/prompts/_default_log_prompt.jinja2 +11 -0
- holmes/plugins/prompts/_fetch_logs.jinja2 +36 -0
- holmes/plugins/prompts/_general_instructions.jinja2 +86 -0
- holmes/plugins/prompts/_global_instructions.jinja2 +12 -0
- holmes/plugins/prompts/_runbook_instructions.jinja2 +13 -0
- holmes/plugins/prompts/_toolsets_instructions.jinja2 +56 -0
- holmes/plugins/prompts/generic_ask.jinja2 +36 -0
- holmes/plugins/prompts/generic_ask_conversation.jinja2 +32 -0
- holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +50 -0
- holmes/plugins/prompts/generic_investigation.jinja2 +42 -0
- holmes/plugins/prompts/generic_post_processing.jinja2 +13 -0
- holmes/plugins/prompts/generic_ticket.jinja2 +12 -0
- holmes/plugins/prompts/investigation_output_format.jinja2 +32 -0
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +84 -0
- holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +39 -0
- holmes/plugins/runbooks/README.md +22 -0
- holmes/plugins/runbooks/__init__.py +100 -0
- holmes/plugins/runbooks/catalog.json +14 -0
- holmes/plugins/runbooks/jira.yaml +12 -0
- holmes/plugins/runbooks/kube-prometheus-stack.yaml +10 -0
- holmes/plugins/runbooks/networking/dns_troubleshooting_instructions.md +66 -0
- holmes/plugins/runbooks/upgrade/upgrade_troubleshooting_instructions.md +44 -0
- holmes/plugins/sources/github/__init__.py +77 -0
- holmes/plugins/sources/jira/__init__.py +123 -0
- holmes/plugins/sources/opsgenie/__init__.py +93 -0
- holmes/plugins/sources/pagerduty/__init__.py +147 -0
- holmes/plugins/sources/prometheus/__init__.py +0 -0
- holmes/plugins/sources/prometheus/models.py +104 -0
- holmes/plugins/sources/prometheus/plugin.py +154 -0
- holmes/plugins/toolsets/__init__.py +171 -0
- holmes/plugins/toolsets/aks-node-health.yaml +65 -0
- holmes/plugins/toolsets/aks.yaml +86 -0
- holmes/plugins/toolsets/argocd.yaml +70 -0
- holmes/plugins/toolsets/atlas_mongodb/instructions.jinja2 +8 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +307 -0
- holmes/plugins/toolsets/aws.yaml +76 -0
- holmes/plugins/toolsets/azure_sql/__init__.py +0 -0
- holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +600 -0
- holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +309 -0
- holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +445 -0
- holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +251 -0
- holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +317 -0
- holmes/plugins/toolsets/azure_sql/azure_base_toolset.py +55 -0
- holmes/plugins/toolsets/azure_sql/azure_sql_instructions.jinja2 +137 -0
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +183 -0
- holmes/plugins/toolsets/azure_sql/install.md +66 -0
- holmes/plugins/toolsets/azure_sql/tools/__init__.py +1 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +324 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +243 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +205 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +249 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +373 -0
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +237 -0
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +172 -0
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +170 -0
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +188 -0
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +180 -0
- holmes/plugins/toolsets/azure_sql/utils.py +83 -0
- holmes/plugins/toolsets/bash/__init__.py +0 -0
- holmes/plugins/toolsets/bash/bash_instructions.jinja2 +14 -0
- holmes/plugins/toolsets/bash/bash_toolset.py +208 -0
- holmes/plugins/toolsets/bash/common/bash.py +52 -0
- holmes/plugins/toolsets/bash/common/config.py +14 -0
- holmes/plugins/toolsets/bash/common/stringify.py +25 -0
- holmes/plugins/toolsets/bash/common/validators.py +24 -0
- holmes/plugins/toolsets/bash/grep/__init__.py +52 -0
- holmes/plugins/toolsets/bash/kubectl/__init__.py +100 -0
- holmes/plugins/toolsets/bash/kubectl/constants.py +96 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_describe.py +66 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_events.py +88 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +108 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_logs.py +20 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_run.py +46 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_top.py +81 -0
- holmes/plugins/toolsets/bash/parse_command.py +103 -0
- holmes/plugins/toolsets/confluence.yaml +19 -0
- holmes/plugins/toolsets/consts.py +5 -0
- holmes/plugins/toolsets/coralogix/api.py +158 -0
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +103 -0
- holmes/plugins/toolsets/coralogix/utils.py +181 -0
- holmes/plugins/toolsets/datadog.py +153 -0
- holmes/plugins/toolsets/docker.yaml +46 -0
- holmes/plugins/toolsets/git.py +756 -0
- holmes/plugins/toolsets/grafana/__init__.py +0 -0
- holmes/plugins/toolsets/grafana/base_grafana_toolset.py +54 -0
- holmes/plugins/toolsets/grafana/common.py +68 -0
- holmes/plugins/toolsets/grafana/grafana_api.py +31 -0
- holmes/plugins/toolsets/grafana/loki_api.py +89 -0
- holmes/plugins/toolsets/grafana/tempo_api.py +124 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +102 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +102 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +10 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +299 -0
- holmes/plugins/toolsets/grafana/trace_parser.py +195 -0
- holmes/plugins/toolsets/helm.yaml +42 -0
- holmes/plugins/toolsets/internet/internet.py +275 -0
- holmes/plugins/toolsets/internet/notion.py +137 -0
- holmes/plugins/toolsets/kafka.py +638 -0
- holmes/plugins/toolsets/kubernetes.yaml +255 -0
- holmes/plugins/toolsets/kubernetes_logs.py +426 -0
- holmes/plugins/toolsets/kubernetes_logs.yaml +42 -0
- holmes/plugins/toolsets/logging_utils/__init__.py +0 -0
- holmes/plugins/toolsets/logging_utils/logging_api.py +217 -0
- holmes/plugins/toolsets/logging_utils/types.py +0 -0
- holmes/plugins/toolsets/mcp/toolset_mcp.py +135 -0
- holmes/plugins/toolsets/newrelic.py +222 -0
- holmes/plugins/toolsets/opensearch/__init__.py +0 -0
- holmes/plugins/toolsets/opensearch/opensearch.py +245 -0
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +151 -0
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +211 -0
- holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +12 -0
- holmes/plugins/toolsets/opensearch/opensearch_utils.py +166 -0
- holmes/plugins/toolsets/prometheus/prometheus.py +818 -0
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +38 -0
- holmes/plugins/toolsets/rabbitmq/api.py +398 -0
- holmes/plugins/toolsets/rabbitmq/rabbitmq_instructions.jinja2 +37 -0
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +222 -0
- holmes/plugins/toolsets/robusta/__init__.py +0 -0
- holmes/plugins/toolsets/robusta/robusta.py +235 -0
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +24 -0
- holmes/plugins/toolsets/runbook/__init__.py +0 -0
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +78 -0
- holmes/plugins/toolsets/service_discovery.py +92 -0
- holmes/plugins/toolsets/servicenow/install.md +37 -0
- holmes/plugins/toolsets/servicenow/instructions.jinja2 +3 -0
- holmes/plugins/toolsets/servicenow/servicenow.py +198 -0
- holmes/plugins/toolsets/slab.yaml +20 -0
- holmes/plugins/toolsets/utils.py +137 -0
- holmes/plugins/utils.py +14 -0
- holmes/utils/__init__.py +0 -0
- holmes/utils/cache.py +84 -0
- holmes/utils/cert_utils.py +40 -0
- holmes/utils/default_toolset_installation_guide.jinja2 +44 -0
- holmes/utils/definitions.py +13 -0
- holmes/utils/env.py +53 -0
- holmes/utils/file_utils.py +56 -0
- holmes/utils/global_instructions.py +20 -0
- holmes/utils/holmes_status.py +22 -0
- holmes/utils/holmes_sync_toolsets.py +80 -0
- holmes/utils/markdown_utils.py +55 -0
- holmes/utils/pydantic_utils.py +54 -0
- holmes/utils/robusta.py +10 -0
- holmes/utils/tags.py +97 -0
- holmesgpt-0.11.5.dist-info/LICENSE.txt +21 -0
- holmesgpt-0.11.5.dist-info/METADATA +400 -0
- holmesgpt-0.11.5.dist-info/RECORD +183 -0
- holmesgpt-0.11.5.dist-info/WHEEL +4 -0
- holmesgpt-0.11.5.dist-info/entry_points.txt +3 -0

holmes/plugins/prompts/generic_post_processing.jinja2
@@ -0,0 +1,13 @@
+# Summarize the investigation below using the following points
+
+1. Summarize the provided output so it is short and concise, without removing any specific details about any resource or the root cause.
+2. Remove any sections on 'to gather more information' or 'next steps' unless explicitly requested in the prompt.
+3. Do not compliment the user or mention their investigation.
+4. Avoid introductory or concluding remarks; provide only the direct summary of issues.
+5. Preserve the answer format.
+
+This is the original prompt:
+{{ prompt }}
+
+This is the investigation to summarize:
+{{ investigation }}
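
Note: these prompt files are plain Jinja2 templates. As a minimal, standalone sketch (the loader path and render values below are assumptions for illustration, not part of the package), a template like the one above can be rendered as:

```python
# Sketch: render generic_post_processing.jinja2 outside HolmesGPT.
# The directory path and variable values are illustrative assumptions.
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader("holmes/plugins/prompts"))
template = env.get_template("generic_post_processing.jinja2")
print(
    template.render(
        prompt="Why is my pod crashing?",
        investigation="Pod `checkout-5d9f` was OOMKilled; its memory limit is 128Mi ...",
    )
)
```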

holmes/plugins/prompts/generic_ticket.jinja2
@@ -0,0 +1,12 @@
+{% include 'generic_ask.jinja2' %}
+
+{% if output_instructions %}
+You are updating tickets in {{ source }}, so make sure you follow all of the following instructions:
+
+Always validate pod names before generating links.
+When creating Grafana dashboard links, ensure that **EVERY** link is relevant to the issue investigated, and express that relevance in the link text.
+{% for instruction in output_instructions %}
+{{ instruction }}
+{% endfor %}
+
+{% endif %}

holmes/plugins/prompts/investigation_output_format.jinja2
@@ -0,0 +1,32 @@
+
+{% if structured_output %}
+Give your answer in a pure JSON format with the following sections.
+The content of each section should be formatted with markdown:
+
+{
+{% for title, description in sections.items() %}
+"{{ title }}": "{{ description | replace("\n", "\\n") }}",{% endfor %}
+}
+
+You MUST set a section as `null` if:
+- It's not relevant to the investigation
+- It does not contain relevant information
+- Its content is not related to a possible root cause
+
+<DO NOT list tools used and DO NOT add a `Tools` section>
+{% else %}
+Give your answer as a markdown document structured with the following sections. Use the top-level markdown title format `#`.
+Ignore any section that is not relevant to the investigation.
+
+{% for title, description in sections.items() %}
+# {{ title }}
+{{ description }}
+{% endfor %}
+
+# <DO NOT list tools used and DO NOT add a `# Tools` section>
+
+You MUST ignore a section and skip it if:
+- It's not relevant to the investigation
+- It does not contain relevant information
+- Its content is not related to a possible root cause
+{% endif %}
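
For illustration only, with hypothetical section titles (the actual section names are supplied by the caller via `sections`), the structured branch asks the model to reply with JSON shaped like:

```json
{
  "Root Cause": "Deployment `checkout` is crash-looping because image tag `v2.3` does not exist in the registry.",
  "Next Steps": null
}
```

`Next Steps` is `null` here because, per the rules above, a section that adds nothing toward the root cause must be nulled out.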

holmes/plugins/prompts/kubernetes_workload_ask.jinja2
@@ -0,0 +1,84 @@
+You are a tool-calling AI assistant provided with common DevOps and IT tools that you can use to troubleshoot problems or answer questions.
+Whenever possible, you MUST first use tools to investigate, then answer the question.
+Do not say 'based on the tool output' or explicitly refer to tools at all.
+If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
+
+If the user provides you with extra instructions in a triple single quotes section, ALWAYS perform their instructions and then perform your investigation.
+{% include '_current_date_time.jinja2' %}
+
+Global Instructions
+You may receive a set of “Global Instructions” that describe how to perform certain tasks, handle certain situations, or apply certain best practices. They are not mandatory for every request, but serve as a reference resource and must be used if the current scenario or user request aligns with one of the described methods or conditions.
+Use these rules when deciding how to apply them:
+
+* If the user prompt includes Global Instructions, treat them as a reference resource.
+* Some Global Instructions may describe how to handle specific tasks or scenarios. If the user's current request or the instructions in a triple quotes section reference one of these tasks, ALWAYS follow the Global Instruction for that task.
+* Some Global Instructions may define general conditions that always apply if a certain scenario occurs (e.g., "whenever investigating a memory issue, always check resource limits"). If such a condition matches the current situation, apply the Global Instruction accordingly.
+* If the user's prompt or the instructions in a triple quotes section direct you to perform a task (e.g., “Find owner”) and there is a Global Instruction on how to do that task, ALWAYS follow the Global Instructions on how to perform it.
+* If multiple Global Instructions are relevant, apply all that fit.
+* If no Global Instruction is relevant, or no condition applies, ignore them and proceed as normal.
+* Before finalizing your answer, double-check whether any Global Instructions apply. If so, ensure you have correctly followed those instructions.
+
+In general:
+* when it can provide extra information, first run as many tools as you need to gather more information, then respond.
+* if possible, do so repeatedly with different tool calls each time to gather more information.
+* do not stop investigating until you are at the final root cause you are able to find.
+* use the "five whys" methodology to find the root cause.
+* for example, if you found a problem in microservice A that is due to an error in microservice B, look at microservice B too and find the error there.
+* if you cannot find the resource/application that the user referred to, assume they made a typo or included/excluded characters such as '-'.
+* in this case, try to find substrings or search for the correct spellings.
+* if you are unable to investigate something properly because you do not have access to the right data, explicitly tell the user that you are missing an integration to access XYZ which you would need to investigate. You should specifically use the templated phrase "I don't have access to <details>. Please add a Holmes integration for <XYZ> so that I can investigate this."
+* always provide detailed information like exact resource names, versions, labels, etc.
+* even if you found the root cause, keep investigating to find other possible root causes and to gather data for the answer, like exact names.
+* if a runbook url is present, as well as a tool that can fetch it, you MUST fetch the runbook before beginning your investigation.
+* if you don't know, say that the analysis was inconclusive.
+* if there are multiple possible causes, list them in a numbered list.
+* there will often be errors in the data that are not relevant or that do not have an impact - ignore them in your conclusion if you were not able to tie them to an actual error.
+* run as many kubectl commands as you need to gather more information, then respond.
+* if possible, do so repeatedly on different Kubernetes objects.
+* for example, for deployments first run kubectl on the deployment, then a ReplicaSet inside it, then a pod inside that.
+* when investigating a pod that crashed or application errors, always run kubectl_describe and fetch the pod's logs so that you see current logs and any logs from before a crash.
+* do not give an answer like "The pod is pending" as that doesn't state why the pod is pending and how to fix it.
+* do not give an answer like "Pod's node affinity/selector doesn't match any available nodes" because that doesn't include data on WHICH label doesn't match.
+* if investigating an issue on many pods, there is no need to check more than 3 individual pods in the same deployment. Pick up to 3 representative pods from each deployment if relevant.
+* if you find errors and warnings in a pod's logs and you believe they indicate a real issue, consider the pod not healthy.
+* if the user says something isn't working, ALWAYS:
+** use kubectl_describe on the owner workload + individual pods and look for any transient issues they might have been referring to
+** check the application aspects by accessing the application logs and other relevant tools
+** look for misconfigured ingresses/services etc.
+
+{% include '_toolsets_instructions.jinja2' %}
+
+{% include '_fetch_logs.jinja2' %}
+
+Style guide:
+* Be painfully concise.
+* Leave out "the" and filler words when possible.
+* Be terse but not at the expense of leaving out important data like the root cause and how to fix it.
+* if asked by Global Instructions or instructions in a triple single quotes section to explicitly include something in the answer, don't leave it out.
+* return a JSON object with the following schema as a result:
+{
+  "type": "object",
+  "properties": {
+    "root_cause_summary": {
+      "type": "string",
+      "description": "concise short explanation leading to the workload_healthy result; pinpoint the reason and root cause for the workload issues, if any"
+    },
+    "workload_healthy": {
+      "type": "boolean",
+      "description": "is the workload in a healthy state or in an error state"
+    }
+  },
+  "required": [
+    "root_cause_summary",
+    "workload_healthy"
+  ]
+}
+
+
+
+{% if alerts %}
+Here are issues and configuration changes that happened to this Kubernetes workload recently. Check if these can help you understand the issue.
+{% for a in alerts %}
+{{ a }}
+{% endfor %}
+{% endif %}
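
A reply conforming to the schema above would look roughly like this (the values are invented for illustration):

```json
{
  "root_cause_summary": "Deployment `payments` pods are OOMKilled: actual memory usage exceeds the 128Mi container limit.",
  "workload_healthy": false
}
```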

holmes/plugins/prompts/kubernetes_workload_chat.jinja2
@@ -0,0 +1,39 @@
+You are a tool-calling AI assistant provided with common DevOps and IT tools that you can use to troubleshoot problems or answer questions.
+Whenever possible, you MUST first use tools to investigate, then answer the question.
+Do not say 'based on the tool output' or explicitly refer to tools at all.
+If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
+{% include '_current_date_time.jinja2' %}
+
+### Context Awareness:
+Be aware that this conversation consists of follow-up questions to a prior investigation conducted for the {{resource}}.
+However, not all questions may be directly related to that investigation.
+Use the results of the investigation and the conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.
+
+#### Results of Workload Health Check Analysis:
+{{workload_analysis}}
+
+{% if tools_called_for_workload %}
+Tools used for the workload analysis:
+{% for tool in tools_called_for_workload %}
+{{ tool }}
+{% endfor %}
+{% endif %}
+
+
+{% include '_global_instructions.jinja2' %}
+{% include '_general_instructions.jinja2' %}
+
+Style guide:
+* Reply with terse output.
+* Be painfully concise.
+* Leave out "the" and filler words when possible.
+* Be terse but not at the expense of leaving out important data like the root cause and how to fix it.
+
+Examples:
+
+User: Why did the workload-example app crash?
+(Call tool kubectl_find_resource kind=pod keyword=workload)
+(Call tool kubectl_previous_logs namespace=demos pod=workload-example-1299492-d9g9d # this pod name was found from the previous tool call)
+
+AI: `workload-example-1299492-d9g9d` crashed due to an email validation error during an HTTP request to /api/create_user
+Relevant logs:

holmes/plugins/runbooks/README.md
@@ -0,0 +1,22 @@
+# Runbooks
+
+This folder contains operational runbooks for the HolmesGPT project. Runbooks provide step-by-step instructions for common tasks, troubleshooting, and maintenance procedures related to the plugins in this directory.
+
+## Purpose
+
+- Standardize operational processes
+- Enable quick onboarding for new team members
+- Reduce downtime by providing clear troubleshooting steps
+
+## Structure
+
+### Structured Runbook
+
+Structured runbooks are designed for specific issues: when conditions like issue name, ID, or source match, the corresponding instructions are returned for investigation.
+For example, the investigation in [kube-prometheus-stack.yaml](kube-prometheus-stack.yaml) will be returned when the issue being investigated matches either KubeSchedulerDown or KubeControllerManagerDown.
+This kind of runbook is mainly used by `holmes investigate`.
+
+### Catalog
+
+The catalog specified in [catalog.json](catalog.json) contains a collection of runbooks written in markdown.
+At runtime, the LLM compares each runbook description with the user question and returns the best-matching runbook for investigation. If nothing matches, no runbook is returned.

holmes/plugins/runbooks/__init__.py
@@ -0,0 +1,100 @@
+import json
+import logging
+import os
+import os.path
+from datetime import date
+from pathlib import Path
+from typing import List, Optional, Pattern, Union
+
+from pydantic import BaseModel, PrivateAttr
+
+from holmes.utils.pydantic_utils import RobustaBaseConfig, load_model_from_file
+
+THIS_DIR = os.path.abspath(os.path.dirname(__file__))
+
+CATALOG_FILE = "catalog.json"
+
+
+class IssueMatcher(RobustaBaseConfig):
+    issue_id: Optional[Pattern] = None  # unique id
+    issue_name: Optional[Pattern] = None  # not necessarily unique
+    source: Optional[Pattern] = None
+
+
+class Runbook(RobustaBaseConfig):
+    match: IssueMatcher
+    instructions: str
+
+    _path = PrivateAttr()
+
+    def set_path(self, path: str):
+        self._path = path
+
+    def get_path(self) -> str:
+        return self._path
+
+
+class ListOfRunbooks(BaseModel):
+    runbooks: List[Runbook]
+
+
+def load_runbooks_from_file(path: Union[str, Path]) -> List[Runbook]:
+    data: ListOfRunbooks = load_model_from_file(ListOfRunbooks, file_path=path)  # type: ignore
+    for runbook in data.runbooks:
+        runbook.set_path(path)  # type: ignore
+    return data.runbooks
+
+
+def load_builtin_runbooks() -> List[Runbook]:
+    all_runbooks = []
+    for filename in os.listdir(THIS_DIR):
+        if not filename.endswith(".yaml"):
+            continue
+        path = os.path.join(THIS_DIR, filename)
+        all_runbooks.extend(load_runbooks_from_file(path))
+    return all_runbooks
+
+
+class RunbookCatalogEntry(BaseModel):
+    """
+    RunbookCatalogEntry contains metadata about a runbook.
+    Unlike runbooks provided by the Runbook class, this entry points to a markdown file containing the runbook content.
+    """
+
+    update_date: date
+    description: str
+    link: str
+
+
+class RunbookCatalog(BaseModel):
+    """
+    RunbookCatalog is a collection of runbook entries; each entry contains metadata about the runbook.
+    The correct runbook can be selected from the list by comparing the description with the user question.
+    """
+
+    catalog: List[RunbookCatalogEntry]
+
+
+def load_runbook_catalog() -> Optional[RunbookCatalog]:
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+
+    catalog_path = os.path.join(dir_path, CATALOG_FILE)
+    if not os.path.isfile(catalog_path):
+        return None
+    try:
+        with open(catalog_path) as file:
+            catalog_dict = json.load(file)
+            return RunbookCatalog(**catalog_dict)
+    except json.JSONDecodeError as e:
+        logging.error(f"Error decoding JSON from {catalog_path}: {e}")
+    except Exception as e:
+        logging.error(
+            f"Unexpected error while loading runbook catalog from {catalog_path}: {e}"
+        )
+    return None
+
+
+def get_runbook_by_path(runbook_relative_path: str) -> str:
+    runbook_folder = os.path.dirname(os.path.realpath(__file__))
+    runbook_path = os.path.join(runbook_folder, runbook_relative_path)
+    return runbook_path
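
To make the matching flow concrete, here is a small hypothetical usage sketch (not part of the package; the alert name is invented) that picks the first built-in runbook whose `issue_name` pattern matches an alert:

```python
# Sketch: select a built-in runbook by matching its issue_name regex.
from holmes.plugins.runbooks import load_builtin_runbooks

alert_name = "KubeSchedulerDown"  # example alert name

for runbook in load_builtin_runbooks():
    # IssueMatcher fields are typed as Pattern, so pydantic compiles them
    # to re.Pattern objects and .search() is available.
    name_pattern = runbook.match.issue_name
    if name_pattern and name_pattern.search(alert_name):
        print(f"Matched runbook from {runbook.get_path()}:")
        print(runbook.instructions)
        break
```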

holmes/plugins/runbooks/catalog.json
@@ -0,0 +1,14 @@
+{
+    "catalog": [
+        {
+            "update_date": "2025-06-17",
+            "description": "Runbook to investigate DNS resolution issue in Kubernetes clusters",
+            "link": "networking/dns_troubleshooting_instructions.md"
+        },
+        {
+            "update_date": "2025-07-08",
+            "description": "Runbook to troubleshoot upgrade issues in Azure Kubernetes Service clusters",
+            "link": "upgrade/upgrade_troubleshooting_instructions.md"
+        }
+    ]
+}
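
Tying this file back to the module above, a minimal sketch (assuming the package is installed) that resolves each catalog entry to its markdown file:

```python
# Sketch: list catalog entries and resolve their markdown paths.
from holmes.plugins.runbooks import get_runbook_by_path, load_runbook_catalog

catalog = load_runbook_catalog()
if catalog:
    for entry in catalog.catalog:
        print(f"{entry.update_date}  {entry.description}")
        print("  ->", get_runbook_by_path(entry.link))
```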

holmes/plugins/runbooks/jira.yaml
@@ -0,0 +1,12 @@
+# runbooks for jira alerts
+# the AI will follow the instructions inside these runbooks to investigate alerts!
+# please feel free to open PRs adding your own runbooks
+runbooks:
+  - match:
+      source: "jira"
+    instructions: >
+      Investigate and try to solve whatever is written in the title and description of the ticket.
+      Ignore issues related to Jira itself, like plugin or licensing problems.
+      Never give an answer like "XYZ is experiencing an issue, as indicated by the Jira issue. Further investigation is needed to determine the exact cause."
+      You are the agent that is supposed to investigate, so do so!
+      If you have references to a service or a component, start by searching for related infrastructure or resources using tools that take keywords.

holmes/plugins/runbooks/kube-prometheus-stack.yaml
@@ -0,0 +1,10 @@
+# runbooks for alerts in kube-prometheus-stack
+# the AI will follow the instructions inside these runbooks to investigate alerts!
+# please feel free to open PRs adding your own runbooks
+runbooks:
+  - match:
+      issue_name: "(KubeSchedulerDown)|(KubeControllerManagerDown)"
+    instructions: >
+      Check if the cluster is a managed cluster like EKS by fetching nodes and looking at their labels.
+      If so, tell the user this is likely a known false positive in the kube-prometheus-stack alert because Prometheus can't scrape the scheduler, which is managed by the cloud provider.
+      On the other hand, if this is a self-managed Kubernetes cluster, either the scheduler is really down (unlikely) or it is running but Prometheus can't scrape it.

holmes/plugins/runbooks/networking/dns_troubleshooting_instructions.md
@@ -0,0 +1,66 @@
+# DNS Troubleshooting Guidelines (Kubernetes)
+
+## Goal
+Your primary goal when using these tools is to diagnose DNS resolution issues within a Kubernetes cluster, focusing on identifying common problems like incorrect CoreDNS/kube-dns setup, network policies, or service discovery failures by strictly following the workflow for DNS diagnosis.
+
+* Use the tools to gather information about the DNS pods, services, and configuration.
+* Clearly present the key findings from the tool outputs in your analysis.
+* Instead of providing next steps to the user, follow the troubleshooting guide and execute the steps yourself.
+* When getting pod logs, always try to filter out unnecessary logs using the log_filter toolset's kubectl_logs_grep_no_match tool.
+
+## Workflow for DNS Diagnosis
+
+1. **Check CoreDNS/kube-dns Pods:**
+    * Verify that the DNS pods (e.g., CoreDNS or kube-dns) are running in the `kube-system` namespace.
+    * Look for restarts or crashes in the DNS pods.
+
+2. **Examine DNS Service:**
+    * Ensure the DNS service is correctly defined: `kubectl get svc kube-dns -n kube-system` (or the equivalent for your DNS provider).
+    * Verify the ClusterIP of the DNS service and the ports (usually 53/UDP and 53/TCP).
+
+3. **Test DNS Resolution from a Pod** (see the command sketch after this workflow)**:**
+    * Launch a debugging pod (e.g., using `busybox` or `nslookup` tools).
+    * **Inside the debug pod:**
+        * Check `/etc/resolv.conf`:
+            * The `nameserver` should point to the DNS service's ClusterIP.
+            * The `search` path should be appropriate for your namespaces (e.g., `your-namespace.svc.cluster.local svc.cluster.local cluster.local`).
+            * The `options` (like `ndots:5`) can affect resolution behavior.
+        * Attempt to resolve internal cluster names:
+            * A service in the same namespace (e.g., `myservice`).
+            * A service in a different namespace (e.g., `myservice.othernamespace`).
+            * A fully qualified domain name (FQDN) (e.g., `myservice.othernamespace.svc.cluster.local`).
+        * Attempt to resolve external names (e.g., `www.google.com`).
+        * Use tools like `nslookup <hostname>` or `dig <hostname>` for detailed query information.
+
+4. **Check NetworkPolicies:**
+    * If NetworkPolicies are in place, ensure they allow DNS traffic (to port 53 UDP/TCP) from your application pods to the DNS pods/service.
+    * List NetworkPolicies and examine any that might affect the source or destination pods.
+
+5. **Review CoreDNS Configuration (if applicable):**
+    * Inspect the CoreDNS ConfigMap: `kubectl get configmap coredns -n kube-system -o yaml`.
+    * Look for errors or misconfigurations in the Corefile (e.g., incorrect upstream resolvers, plugin issues).
+    * Inspect the customized CoreDNS ConfigMap: `kubectl get configmap coredns-custom -n kube-system -o yaml`.
+    * Look for errors or misconfigurations in the customized CoreDNS config (e.g., incorrect upstream resolvers, plugin issues).
+
+6. **Check the DNS Trace:**
+    * Use findings from the DNS trace to pinpoint where DNS resolution is failing (e.g., query not reaching the DNS server, invalid FQDN, or error response from the DNS server).
+    * The DNS server should always respond to requests from the client: a valid FQDN should return NOERROR, and an invalid FQDN should return NXDOMAIN.
+
+## Synthesize Findings
+Based on the outputs from the above steps, describe the DNS issue clearly. For example:
+* "DNS resolution for internal service 'myservice' is failing from pods in namespace 'app-ns'. The CoreDNS pods in `kube-system` are running but show 'connection refused' errors in their logs when trying to reach upstream resolvers."
+* "Pods in namespace 'secure-ns' cannot resolve any hostnames. `/etc/resolv.conf` in these pods is missing the correct `nameserver` entry. This is likely due to a misconfiguration in the pod's `dnsPolicy` or the underlying node's DNS setup."
+* "External DNS resolution is failing cluster-wide. The CoreDNS ConfigMap shows that the `forward` plugin is pointing to an incorrect upstream DNS server IP address."
+* "DNS lookups for 'service-a.namespace-b' are timing out. A NetworkPolicy in 'namespace-b' is blocking egress traffic on port 53 to the kube-dns service."
+
+## Recommend Remediation Steps (Based on Docs)
+* **CRITICAL:** ALWAYS refer to the official Kubernetes DNS debugging guide for detailed troubleshooting and solutions:
+    * Main guide: https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/
+    * CoreDNS specific: https://kubernetes.io/docs/tasks/administer-cluster/dns-custom-nameservers/ (for CoreDNS customization, which might be relevant)
+* **DO NOT invent recovery procedures.** Your role is to diagnose and *point* to the correct documentation or standard procedures.
+* Based on the findings, suggest which sections of the documentation are most relevant.
+* If DNS pods are not running, guide towards checking pod deployment and node health.
+* If `/etc/resolv.conf` is incorrect, point to sections on Pod `dnsPolicy` and `dnsConfig`.
+* If NetworkPolicies are suspected, suggest reviewing policy definitions to allow DNS.
+* If CoreDNS configuration seems problematic, refer to CoreDNS documentation and the Kubernetes guide on customizing it.
+* If upstream DNS resolution is failing, suggest checking the upstream DNS servers and CoreDNS forward configuration.
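
As a concrete illustration of step 3 in the workflow above, the debug-pod checks might look like this (the image tag and service names are examples, not prescribed by the runbook):

```sh
# Launch a throwaway debug pod (example image).
kubectl run dns-debug --rm -it --restart=Never --image=busybox:1.36 -- sh

# Inside the pod:
cat /etc/resolv.conf                                  # nameserver should be the kube-dns ClusterIP
nslookup myservice                                    # service in the same namespace
nslookup myservice.othernamespace                     # service in another namespace
nslookup myservice.othernamespace.svc.cluster.local   # full FQDN
nslookup www.google.com                               # external name
```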

holmes/plugins/runbooks/upgrade/upgrade_troubleshooting_instructions.md
@@ -0,0 +1,44 @@
+# AKS Upgrade Troubleshooting Guidelines
+
+## Goal
+Your primary goal when using these tools is to diagnose cluster and nodepool upgrade failures within an AKS cluster by strictly following the workflow for AKS upgrade diagnosis.
+
+* Use the tools to gather information about the cluster configuration and the upgrade failure reason.
+* Clearly present the key findings from the tool outputs in your analysis.
+* Instead of providing next steps to the user, follow the troubleshooting guide and execute the steps yourself.
+
+## Workflow for AKS Upgrade Diagnosis
+
+1. **Check Cluster and Nodepool Status** (see the command sketch after these guidelines)**:**
+    * Get the current cluster context - cluster name, resource group, and subscription ID. Get the configuration details for this current cluster only. If there is no cluster context set up, ask the user to set the right AKS cluster context using the Azure CLI.
+    * Check the provisioning status of your AKS cluster to see if it's in a Failed state. If the cluster is in a Succeeded or Upgrading state, no recent or ongoing upgrades have failed.
+    * Check the provisioning status of your AKS nodepools. If any nodepool has a Failed status, a recent nodepool upgrade has failed.
+    * If the cluster or nodepool is in a Failed state, check the error message and read any documentation links that may be included in the error message details (starting with aka.ms).
+
+If the cluster is in a Succeeded state, follow the steps below to identify any potential issues which could result in a failed upgrade - for example, check for restrictive PDBs or blocked outbound connectivity endpoints. If the cluster is in a Failed state, check the error message and follow the steps below for that specific error code.
+
+2. **Error code: 'VMExtensionError_OutBoundConnFail'**
+    * If the upgrade failure error message contains 'VMExtensionError_OutBoundConnFail', the cluster upgrade failed because one or more essential outbound connectivity endpoints were blocked. Most commonly, this is due to a misconfiguration in a network policy, NSG, or Azure firewall which is denying traffic to the endpoint. Check whether any of the required rules/endpoints in https://learn.microsoft.com/en-us/azure/aks/outbound-rules-control-egress#azure-global-required-network-rules are blocked by a network policy, NSG, or Azure firewall and identify the problematic rule.
+    * Check if the outbound connectivity issue is due to an NSG rule by first getting all of the NSGs in the current subscription. Find the NSGs whose resource group contains both the current resource group and the cluster name. For each NSG, list all of the rules and determine whether any rule is blocking an essential endpoint from the doc linked above. Do this analysis on behalf of the user and only return the problematic NSG rules.
+
+3. **PDB blocking upgrade: Error code "UnsatisfiablePDB"**
+    * If the upgrade fails due to one or more restrictive PDBs, one or more PDBs were set in the cluster which do not allow for any pod disruptions. Get all the PDBs configured on the cluster by running 'kubectl get pdb' and identify all PDBs where MaxUnavailable is set to 0 - if any are found, call them out specifically and ask the customer to follow the guidance in this troubleshooting guide for restrictive PDBs: https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/error-codes/unsatisfiablepdb-error.
+
+4. **Quota exhaustion issues: Error code "QuotaExceeded"**
+    * If the upgrade fails due to insufficient quota with error code "QuotaExceeded", your subscription doesn't have the available resources required to upgrade the cluster. You will need to raise the limit or quota for your subscription by filing a "Service and subscription limits (quotas)" support ticket to increase the quota for compute cores. Provide detailed instructions in the response on how to open a specific "Service and subscription limits" support ticket through the Azure Portal for an AKS cluster.
+
+
+## Synthesize Findings
+Based on the outputs from the above steps, describe the upgrade issue clearly and recommend specific and actionable remediation steps. For example:
+* "Upgrade is failing because outbound connectivity to endpoint x is blocked by your NSG rules. Please remove blocking NSG rule x and retry the upgrade."
+* If the upgrade is failing due to a restrictive PDB, check how many pods for that deployment are currently deployed. If there is only 1 pod, advise the customer to scale up their replicas to allow for upgrades while maintaining availability. "Upgrade is failing due to a restrictive PDB on your x pods which does not tolerate any disruptions. Please set minAvailable to allow 1 pod to be disrupted at a time, enabling upgrades while maintaining availability."
+
+## Recommend Remediation Steps (Based on Docs)
+* **CRITICAL:** ALWAYS refer to the official AKS upgrade troubleshooting guide for detailed troubleshooting and solutions: https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/create-upgrade-delete/upgrading-or-scaling-does-not-succeed
+* **DO NOT invent recovery procedures.** Your role is to diagnose and *point* to the correct documentation or standard procedures.
+* Based on the findings, suggest which sections of the documentation are most relevant.
+* If the upgrade is failing due to PDB blocking, provide specific guidance from the documentation towards adjusting PDB settings.
+* If the upgrade is failing due to quota exhaustion, provide specific guidance from the documentation towards reviewing resource quotas.
+* If the upgrade is failing due to node issues, provide specific guidance from the documentation towards reviewing node status and health.
+* If the upgrade is failing due to network issues, provide specific guidance from the documentation towards reviewing network configuration.
+* If the upgrade is failing due to other issues, provide specific guidance from the documentation towards reviewing the upgrade logs and configuration.
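
For step 1 of the workflow above, the status checks could be run roughly as follows (resource names are placeholders; the exact commands are an assumption, since the runbook leaves the tooling to Holmes):

```sh
# Cluster provisioning state (Succeeded / Failed / Upgrading).
az aks show -g <resource-group> -n <cluster-name> --query provisioningState -o tsv

# Per-nodepool provisioning state.
az aks nodepool list -g <resource-group> --cluster-name <cluster-name> \
  --query "[].{name:name, state:provisioningState}" -o table

# Restrictive PDBs (step 3): look for maxUnavailable set to 0.
kubectl get pdb --all-namespaces
```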

holmes/plugins/sources/github/__init__.py
@@ -0,0 +1,77 @@
+import logging
+from typing import List
+from holmes.core.tool_calling_llm import LLMResult
+from holmes.plugins.interfaces import SourcePlugin
+from holmes.core.issue import Issue
+import requests  # type: ignore
+
+
+class GitHubSource(SourcePlugin):
+    def __init__(self, url: str, owner: str, repository: str, pat: str, query: str):
+        self.url = url
+        self.owner = owner
+        self.repository = repository
+        self.pat = pat
+        self.query = query
+
+    def fetch_issues(self) -> List[Issue]:
+        logging.info(
+            f"Fetching All issues from {self.url} for repository {self.owner}/{self.repository}"
+        )
+        try:
+            data = []
+            url = f"{self.url}/search/issues"
+            headers = {
+                "Authorization": f"token {self.pat}",
+                "Accept": "application/vnd.github.v3+json",
+                "X-GitHub-Api-Version": "2022-11-28",
+            }
+            params = {"per_page": "100"}
+            default_q = f"repo:{self.owner}/{self.repository}"
+            params["q"] = f"{default_q} {self.query}"
+            while url:
+                response = requests.get(url=url, headers=headers, params=params)
+                if response.status_code != 200:
+                    raise Exception(
+                        f"Failed to get issues:{response.status_code} {response.text}"
+                    )
+                logging.info(f"Got {response}")
+                response.raise_for_status()
+                data.extend(response.json().get("items", []))
+                links = response.headers.get("Link", "")
+                url = None  # type: ignore
+                for link in links.split(","):
+                    if 'rel="next"' in link:
+                        url = link.split(";")[0].strip()[1:-1]
+            return [self.convert_to_issue(issue) for issue in data]
+        except requests.RequestException as e:
+            raise ConnectionError("Failed to fetch data from GitHub.") from e
+
+    def convert_to_issue(self, github_issue):
+        return Issue(
+            id=str(github_issue["number"]),
+            name=github_issue["title"],
+            source_type="github",
+            source_instance_id=f"{self.owner}/{self.repository}",
+            url=github_issue["html_url"],
+            raw=github_issue,
+        )
+
+    def write_back_result(self, issue_id: str, result_data: LLMResult) -> None:
+        url = f"{self.url}/repos/{self.owner}/{self.repository}/issues/{issue_id}/comments"
+        headers = {
+            "Authorization": f"token {self.pat}",
+            "Accept": "application/vnd.github.v3+json",
+            "X-GitHub-Api-Version": "2022-11-28",
+        }
+        response = requests.post(
+            url=url,
+            json={
+                "body": f"Automatic AI Investigation by Robusta:\n\n{result_data.result}\n"
+            },
+            headers=headers,
+        )
+
+        response.raise_for_status()
+        data = response.json()
+        logging.debug(f"Posted comment to issue #{issue_id} at {data['html_url']}")