holmesgpt 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. holmes/__init__.py +1 -1
  2. holmes/common/env_vars.py +11 -0
  3. holmes/config.py +3 -1
  4. holmes/core/conversations.py +0 -11
  5. holmes/core/investigation.py +0 -6
  6. holmes/core/llm.py +63 -2
  7. holmes/core/prompt.py +0 -2
  8. holmes/core/supabase_dal.py +2 -2
  9. holmes/core/todo_tasks_formatter.py +51 -0
  10. holmes/core/tool_calling_llm.py +277 -101
  11. holmes/core/tools.py +20 -4
  12. holmes/core/toolset_manager.py +1 -5
  13. holmes/core/tracing.py +1 -1
  14. holmes/interactive.py +63 -2
  15. holmes/main.py +7 -2
  16. holmes/plugins/prompts/_fetch_logs.jinja2 +4 -0
  17. holmes/plugins/prompts/_general_instructions.jinja2 +3 -1
  18. holmes/plugins/prompts/investigation_procedure.jinja2 +3 -13
  19. holmes/plugins/runbooks/CLAUDE.md +85 -0
  20. holmes/plugins/runbooks/README.md +24 -0
  21. holmes/plugins/toolsets/__init__.py +5 -1
  22. holmes/plugins/toolsets/argocd.yaml +1 -1
  23. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +18 -6
  24. holmes/plugins/toolsets/aws.yaml +9 -5
  25. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +3 -1
  26. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +3 -1
  27. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -1
  28. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -1
  29. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +3 -1
  30. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +3 -1
  31. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +3 -1
  32. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +3 -1
  33. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +3 -1
  34. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +3 -1
  35. holmes/plugins/toolsets/bash/argocd/__init__.py +65 -0
  36. holmes/plugins/toolsets/bash/argocd/constants.py +120 -0
  37. holmes/plugins/toolsets/bash/aws/__init__.py +66 -0
  38. holmes/plugins/toolsets/bash/aws/constants.py +529 -0
  39. holmes/plugins/toolsets/bash/azure/__init__.py +56 -0
  40. holmes/plugins/toolsets/bash/azure/constants.py +339 -0
  41. holmes/plugins/toolsets/bash/bash_instructions.jinja2 +6 -7
  42. holmes/plugins/toolsets/bash/bash_toolset.py +62 -17
  43. holmes/plugins/toolsets/bash/common/bash_command.py +131 -0
  44. holmes/plugins/toolsets/bash/common/stringify.py +14 -1
  45. holmes/plugins/toolsets/bash/common/validators.py +91 -0
  46. holmes/plugins/toolsets/bash/docker/__init__.py +59 -0
  47. holmes/plugins/toolsets/bash/docker/constants.py +255 -0
  48. holmes/plugins/toolsets/bash/helm/__init__.py +61 -0
  49. holmes/plugins/toolsets/bash/helm/constants.py +92 -0
  50. holmes/plugins/toolsets/bash/kubectl/__init__.py +80 -79
  51. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -14
  52. holmes/plugins/toolsets/bash/kubectl/kubectl_describe.py +38 -56
  53. holmes/plugins/toolsets/bash/kubectl/kubectl_events.py +28 -76
  54. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +39 -99
  55. holmes/plugins/toolsets/bash/kubectl/kubectl_logs.py +34 -15
  56. holmes/plugins/toolsets/bash/kubectl/kubectl_run.py +1 -1
  57. holmes/plugins/toolsets/bash/kubectl/kubectl_top.py +38 -77
  58. holmes/plugins/toolsets/bash/parse_command.py +106 -32
  59. holmes/plugins/toolsets/bash/utilities/__init__.py +0 -0
  60. holmes/plugins/toolsets/bash/utilities/base64_util.py +12 -0
  61. holmes/plugins/toolsets/bash/utilities/cut.py +12 -0
  62. holmes/plugins/toolsets/bash/utilities/grep/__init__.py +10 -0
  63. holmes/plugins/toolsets/bash/utilities/head.py +12 -0
  64. holmes/plugins/toolsets/bash/utilities/jq.py +79 -0
  65. holmes/plugins/toolsets/bash/utilities/sed.py +164 -0
  66. holmes/plugins/toolsets/bash/utilities/sort.py +15 -0
  67. holmes/plugins/toolsets/bash/utilities/tail.py +12 -0
  68. holmes/plugins/toolsets/bash/utilities/tr.py +57 -0
  69. holmes/plugins/toolsets/bash/utilities/uniq.py +12 -0
  70. holmes/plugins/toolsets/bash/utilities/wc.py +12 -0
  71. holmes/plugins/toolsets/confluence.yaml +1 -1
  72. holmes/plugins/toolsets/coralogix/api.py +3 -1
  73. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +4 -4
  74. holmes/plugins/toolsets/coralogix/utils.py +41 -14
  75. holmes/plugins/toolsets/datadog/datadog_api.py +45 -2
  76. holmes/plugins/toolsets/datadog/datadog_general_instructions.jinja2 +208 -0
  77. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +43 -0
  78. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +12 -9
  79. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +722 -0
  80. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +17 -6
  81. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +15 -7
  82. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +6 -2
  83. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +9 -3
  84. holmes/plugins/toolsets/docker.yaml +1 -1
  85. holmes/plugins/toolsets/git.py +15 -5
  86. holmes/plugins/toolsets/grafana/toolset_grafana.py +25 -4
  87. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +4 -4
  88. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +5 -3
  89. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +299 -32
  90. holmes/plugins/toolsets/helm.yaml +1 -1
  91. holmes/plugins/toolsets/internet/internet.py +4 -2
  92. holmes/plugins/toolsets/internet/notion.py +4 -2
  93. holmes/plugins/toolsets/investigator/core_investigation.py +5 -17
  94. holmes/plugins/toolsets/investigator/investigator_instructions.jinja2 +1 -5
  95. holmes/plugins/toolsets/kafka.py +19 -7
  96. holmes/plugins/toolsets/kubernetes.yaml +5 -5
  97. holmes/plugins/toolsets/kubernetes_logs.py +4 -4
  98. holmes/plugins/toolsets/kubernetes_logs.yaml +1 -1
  99. holmes/plugins/toolsets/logging_utils/logging_api.py +15 -2
  100. holmes/plugins/toolsets/mcp/toolset_mcp.py +3 -1
  101. holmes/plugins/toolsets/newrelic.py +8 -4
  102. holmes/plugins/toolsets/opensearch/opensearch.py +13 -5
  103. holmes/plugins/toolsets/opensearch/opensearch_logs.py +4 -4
  104. holmes/plugins/toolsets/opensearch/opensearch_traces.py +9 -6
  105. holmes/plugins/toolsets/prometheus/prometheus.py +198 -57
  106. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +7 -3
  107. holmes/plugins/toolsets/robusta/robusta.py +10 -4
  108. holmes/plugins/toolsets/runbook/runbook_fetcher.py +4 -2
  109. holmes/plugins/toolsets/servicenow/servicenow.py +9 -3
  110. holmes/plugins/toolsets/slab.yaml +1 -1
  111. holmes/utils/console/logging.py +6 -1
  112. {holmesgpt-0.13.0.dist-info → holmesgpt-0.13.2.dist-info}/METADATA +3 -2
  113. {holmesgpt-0.13.0.dist-info → holmesgpt-0.13.2.dist-info}/RECORD +116 -90
  114. holmes/core/todo_manager.py +0 -88
  115. holmes/plugins/toolsets/bash/grep/__init__.py +0 -52
  116. {holmesgpt-0.13.0.dist-info → holmesgpt-0.13.2.dist-info}/LICENSE.txt +0 -0
  117. {holmesgpt-0.13.0.dist-info → holmesgpt-0.13.2.dist-info}/WHEEL +0 -0
  118. {holmesgpt-0.13.0.dist-info → holmesgpt-0.13.2.dist-info}/entry_points.txt +0 -0
holmes/interactive.py CHANGED
@@ -27,10 +27,11 @@ from pygments.lexers import guess_lexer
27
27
  from rich.console import Console
28
28
  from rich.markdown import Markdown, Panel
29
29
 
30
+ from holmes.common.env_vars import ENABLE_CLI_TOOL_APPROVAL
30
31
  from holmes.core.config import config_path_dir
31
32
  from holmes.core.prompt import build_initial_ask_messages
32
33
  from holmes.core.tool_calling_llm import ToolCallingLLM, ToolCallResult
33
- from holmes.core.tools import pretty_print_toolset_status
34
+ from holmes.core.tools import StructuredToolResult, pretty_print_toolset_status
34
35
  from holmes.core.tracing import DummyTracer
35
36
  from holmes.utils.colors import (
36
37
  AI_COLOR,
@@ -584,6 +585,53 @@ def prompt_for_llm_sharing(
584
585
  return None
585
586
 
586
587
 
588
+ def handle_tool_approval(
589
+ command: Optional[str],
590
+ error_message: Optional[str],
591
+ style: Style,
592
+ console: Console,
593
+ ) -> tuple[bool, Optional[str]]:
594
+ """
595
+ Handle user approval for potentially sensitive commands.
596
+
597
+ Args:
598
+ command: The command that needs approval
599
+ error_message: The error message explaining why approval is needed
600
+ session: PromptSession for user input
601
+ style: Style for prompts
602
+ console: Rich console for output
603
+
604
+ Returns:
605
+ Tuple of (approved: bool, feedback: Optional[str])
606
+ - approved: True if user approves, False if denied
607
+ - feedback: User's optional feedback message when denying
608
+ """
609
+ console.print("\n[bold yellow]⚠️ Command Approval Required[/bold yellow]")
610
+ console.print(f"[yellow]Command:[/yellow] {command or 'unknown'}")
611
+ console.print(f"[yellow]Reason:[/yellow] {error_message or 'unknown'}")
612
+ console.print()
613
+
614
+ # Create a temporary session without history for approval prompts
615
+ temp_session = PromptSession(history=InMemoryHistory()) # type: ignore
616
+
617
+ approval_prompt = temp_session.prompt(
618
+ [("class:prompt", "Do you want to approve and execute this command? (y/N): ")],
619
+ style=style,
620
+ )
621
+
622
+ if approval_prompt.lower().startswith("y"):
623
+ return True, None
624
+ else:
625
+ # Ask for optional feedback when denying
626
+ feedback_prompt = temp_session.prompt(
627
+ [("class:prompt", "Optional feedback for the AI (press Enter to skip): ")],
628
+ style=style,
629
+ )
630
+
631
+ feedback = feedback_prompt.strip() if feedback_prompt.strip() else None
632
+ return False, feedback
633
+
634
+
587
635
  def handle_run_command(
588
636
  bash_command: str, session: PromptSession, style: Style, console: Console
589
637
  ) -> Optional[str]:
@@ -811,6 +859,20 @@ def run_interactive_loop(
811
859
  }
812
860
  )
813
861
 
862
+ # Set up approval callback for potentially sensitive commands
863
+ def approval_handler(
864
+ tool_call_result: StructuredToolResult,
865
+ ) -> tuple[bool, Optional[str]]:
866
+ return handle_tool_approval(
867
+ command=tool_call_result.invocation,
868
+ error_message=tool_call_result.error,
869
+ style=style,
870
+ console=console,
871
+ )
872
+
873
+ if ENABLE_CLI_TOOL_APPROVAL:
874
+ ai.approval_callback = approval_handler
875
+
814
876
  # Create merged completer with slash commands, conditional executables, show command, and smart paths
815
877
  slash_completer = SlashCommandCompleter()
816
878
  executable_completer = ConditionalExecutableCompleter()
@@ -1002,7 +1064,6 @@ def run_interactive_loop(
1002
1064
  user_input,
1003
1065
  include_files,
1004
1066
  ai.tool_executor,
1005
- ai.investigation_id,
1006
1067
  runbooks,
1007
1068
  system_prompt_additions,
1008
1069
  )
holmes/main.py CHANGED
@@ -104,6 +104,11 @@ opt_verbose: Optional[List[bool]] = typer.Option(
104
104
  "-v",
105
105
  help="Verbose output. You can pass multiple times to increase the verbosity. e.g. -v or -vv or -vvv",
106
106
  )
107
+ opt_log_costs: bool = typer.Option(
108
+ False,
109
+ "--log-costs",
110
+ help="Show LLM cost information in the output",
111
+ )
107
112
  opt_echo_request: bool = typer.Option(
108
113
  True,
109
114
  "--echo/--no-echo",
@@ -176,6 +181,7 @@ def ask(
176
181
  custom_toolsets: Optional[List[Path]] = opt_custom_toolsets,
177
182
  max_steps: Optional[int] = opt_max_steps,
178
183
  verbose: Optional[List[bool]] = opt_verbose,
184
+ log_costs: bool = opt_log_costs,
179
185
  # semi-common options
180
186
  destination: Optional[DestinationType] = opt_destination,
181
187
  slack_token: Optional[str] = opt_slack_token,
@@ -219,7 +225,7 @@ def ask(
219
225
  """
220
226
  Ask any question and answer using available tools
221
227
  """
222
- console = init_logging(verbose) # type: ignore
228
+ console = init_logging(verbose, log_costs) # type: ignore
223
229
  # Detect and read piped input
224
230
  piped_data = None
225
231
 
@@ -302,7 +308,6 @@ def ask(
302
308
  prompt, # type: ignore
303
309
  include_file,
304
310
  ai.tool_executor,
305
- ai.investigation_id,
306
311
  config.get_runbook_catalog(),
307
312
  system_prompt_additions,
308
313
  )
@@ -4,6 +4,7 @@
4
4
  {%- set k8s_yaml_ts = toolsets | selectattr("name", "equalto", "kubernetes/logs") | rejectattr("fetch_pod_logs", "defined") | first -%}
5
5
  {%- set opensearch_ts = toolsets | selectattr("name", "equalto", "opensearch/logs") | first -%}
6
6
  {%- set datadog_ts = toolsets | selectattr("name", "equalto", "datadog/logs") | first -%}
7
+ {%- set bash_ts = toolsets | selectattr("name", "equalto", "bash") | first -%}
7
8
 
8
9
  ## Logs
9
10
 
@@ -44,6 +45,9 @@ Tools to search and fetch logs from Kubernetes.
44
45
  * Check both kubectl_logs and kubectl_previous_logs because a pod restart means kubectl_logs may not have the relevant logs
45
46
  {%- elif opensearch_ts and opensearch_ts.status == "enabled" -%}
46
47
  {% include '_default_log_prompt.jinja2' %}
48
+ {%- elif bash_ts and bash_ts.status == "enabled" -%}
49
+ Use the tool `run_bash_command` to run `kubectl logs` commands and fetch any relevant pod logs.
50
+ DO NOT use `--tail` or `| tail` when calling `kubectl logs` because you may miss critical information.
47
51
  {%- else -%}
48
52
  * You have not been given access to tools to fetch kubernetes logs for nodes, pods, services or apps. This is likely a misconfiguration.
49
53
  * If you need logs to answer questions or investigate issues, tell the user to configure the documentation and enable one of these toolsets:
@@ -62,10 +62,12 @@
62
62
  * Follow ALL tasks in your plan - don't skip any tasks
63
63
  * Use task management to ensure you don't miss important investigation steps
64
64
  * If you discover additional steps during investigation, add them to your task list using TodoWrite
65
+ * When calling TodoWrite, you may ALSO call other tools in parallel to speed things up for your users and make them happy!
66
+ * On the first TodoWrite call, mark at least one task as in_progress, and start working on it in parallel.
67
+ * When calling TodoWrite for the first time, mark the tasks you started working on with ‘in_progress’ status.
65
68
 
66
69
  # Tool/function calls
67
70
 
68
71
  You are able to make tool calls / function calls. Recognise when a tool has already been called and reuse its result.
69
72
  If a tool call returns nothing, modify the parameters as required instead of repeating the tool call.
70
73
  When searching for resources in specific namespaces, test a cluster level tool to find the resource(s) and identify what namespace they are part of.
71
- You are limited in use to a maximum of 5 tool calls for each specific tool. Therefore make sure are smart about what tools you call and how you call them.
@@ -1,8 +1,3 @@
1
- {% if investigation_id %}
2
- # Investigation ID for this session
3
- Investigation id: {{ investigation_id }}
4
- {% endif %}
5
-
6
1
  CLARIFICATION REQUIREMENT: Before starting ANY investigation, if the user's question is ambiguous or lacks critical details, you MUST ask for clarification first. Do NOT create TodoWrite tasks for unclear questions.
7
2
  Only proceed with TodoWrite and investigation AFTER you have clear, specific requirements.
8
3
 
@@ -16,8 +11,8 @@ MANDATORY Task Status Updates:
16
11
  - When completing a task: Call TodoWrite changing that task's status to "completed"
17
12
 
18
13
  PARALLEL EXECUTION RULES:
19
- - When possible, work on multiple tasks at a time. If tasks depend on one another, do them one after the other.
20
- - You MAY execute multiple INDEPENDENT tasks simultaneously
14
+ - When possible, work on multiple tasks at a time. Only when tasks depend on one another, do them one after the other.
15
+ - You SHOULD execute multiple INDEPENDENT tasks simultaneously
21
16
  - Mark multiple tasks as "in_progress" if they don't depend on each other
22
17
  - Wait for dependent tasks to complete before starting tasks that need their results
23
18
  - Always use a single TodoWrite call to update multiple task statuses
@@ -84,17 +79,12 @@ If you see ANY `[ ] pending` or `[~] in_progress` tasks, DO NOT provide final an
84
79
  {"id": "3", "content": "Check resources", "status": "pending"}
85
80
  ])
86
81
 
87
-
88
- {% if todo_list %}
89
- {{ todo_list }}
90
- {% endif %}
91
-
92
82
  # MANDATORY Multi-Phase Investigation Process
93
83
 
94
84
  For ANY question requiring investigation, you MUST follow this structured approach:
95
85
 
96
86
  ## Phase 1: Initial Investigation
97
- 1. **IMMEDIATELY START with TodoWrite**: Create initial investigation task list
87
+ 1. **IMMEDIATELY START with TodoWrite**: Create the initial investigation task list and begin working on tasks right away. Mark the tasks you're working on as in_progress.
98
88
  2. **Execute ALL tasks systematically**: Mark each task in_progress → completed
99
89
  3. **Complete EVERY task** in the current list before proceeding
100
90
 
@@ -0,0 +1,85 @@
1
+ You are an expert in automated diagnostics and runbook creation for AI-driven troubleshooting agents. I will provide you with one or more issue descriptions or test scenarios.
2
+
3
+ Your task is to generate a strictly executable runbook for an AI agent to follow. The runbook should be machine-readable but human-understandable, and must include the following sections:
4
+
5
+ # Runbook Content Structure
6
+
7
+ ## 1. Goal
8
+ - **Primary Objective:** Clearly define the specific category of issues this runbook addresses (e.g., "diagnose network connectivity problems", "troubleshoot pod startup failures", "investigate performance degradation").
9
+ - **Scope:** Specify the environment, technology stack, or system components covered by this runbook.
10
+ - **Agent Mandate:** Explicitly state that the AI agent must follow the workflow steps sequentially and systematically without deviation to ensure consistent, thorough troubleshooting.
11
+ - **Expected Outcome:** Define what successful completion of this runbook should achieve (root cause identification, issue resolution, or escalation criteria).
12
+
13
+ ## 2. Workflow for [Issue Category] Diagnosis
14
+ - Provide numbered, sequential steps the AI agent must execute in order.
15
+ - Each step should specify:
16
+ - **Action:** Describe the diagnostic function conceptually (e.g., "retrieve container logs from specified pod", "check service connectivity between components", "examine resource utilization metrics")
17
+ - **Function Description:** Explain what the function should accomplish rather than naming specific tools (e.g., "query the cluster to list all pods in a namespace and their current status" instead of "kubectl_get_pods()")
18
+ - **Parameters:** What data/arguments to pass to the function (namespace, pod name, time range, etc.)
19
+ - **Expected Output:** What information to gather from the result (status codes, error messages, metrics, configurations)
20
+ - **Success/Failure Criteria:** How to interpret the output and what indicates normal vs. problematic conditions
21
+ - Use conditional logic (IF/ELSE) when branching is required based on findings.
22
+ - Describe functions generically so they can be mapped to available tools (e.g., "execute a command to test network connectivity" rather than "ping_host()")
23
+ - Include verification steps to confirm each diagnostic action was successful.
24
+
25
+ ## 3. Synthesize Findings
26
+ - **Data Correlation:** Describe how the AI agent should combine outputs from multiple workflow steps.
27
+ - **Pattern Recognition:** Specify what patterns, error messages, or metrics indicate specific root causes.
28
+ - **Prioritization Logic:** Provide criteria for ranking potential causes by likelihood or severity.
29
+ - **Evidence Requirements:** Define what evidence is needed to confidently identify each potential root cause.
30
+ - **Example Scenarios:** Include sample synthesis statements showing how findings should be summarized.
31
+
32
+ ## 4. Recommended Remediation Steps
33
+ - **Immediate Actions:** List temporary workarounds or urgent fixes for critical issues.
34
+ - **Permanent Solutions:** Provide step-by-step permanent remediation procedures.
35
+ - **Verification Steps:** Define how to confirm each remediation action was successful.
36
+ - **Documentation References:** Include links to official documentation, best practices, or vendor guidance.
37
+ - **Escalation Criteria:** Specify when and how to escalate if remediation steps fail.
38
+ - **Post-Remediation Monitoring:** Describe what to monitor to prevent recurrence.
39
+
40
+ # File Organization Guidelines
41
+
42
+ ## Folder Structure
43
+ *Category folders are used to distinguish and categorize different runbooks based on their focus area or technology domain. Each runbook must be placed into a specific category folder under `holmes/plugins/runbooks/` for better organization and discoverability. Create a new category folder if your runbook doesn't fit into existing categories.*
44
+
45
+ ## File Naming
46
+ *Use consistent naming conventions for runbook files:*
47
+
48
+ - Use descriptive, lowercase names with hyphens: `dns-resolution-troubleshooting.md`
49
+ - Include the issue type or technology: `redis-connection-issues.md`
50
+ - Avoid generic names like `troubleshooting.md` or `debug.md`
51
+
52
+ ### Catalog Registration
53
+ After creating your runbook, you must add an entry to `catalog.json` in the runbooks directory to make it discoverable by AI agents.
54
+
55
+ **Steps to add a new catalog entry:**
56
+
57
+ 1. **Open** `holmes/plugins/runbooks/catalog.json`
58
+ 2. **Add your entry** to the JSON array following this structure:
59
+ ```json
60
+ {
61
+ "name": "Brief, descriptive name of the runbook",
62
+ "path": "category-folder/your-runbook-filename.md",
63
+ "description": "Clear description of what issues this runbook addresses",
64
+ "tags": ["relevant", "tags", "for", "search"]
65
+ }
66
+ ```
67
+
68
+ 3. **Ensure proper JSON formatting** - add a comma after the previous entry if needed
69
+ 4. **Validate the JSON** is properly formatted before committing
70
+
71
+ **Field Guidelines:**
72
+ - `name`: Keep concise but descriptive (e.g., "Redis Connection Issues")
73
+ - `path`: Always include the category folder (e.g., "database/redis-connection-issues.md")
74
+ - `description`: Explain what specific problems this runbook solves
75
+ - `tags`: Include technology names, issue types, and relevant keywords
76
+
77
+ Example catalog entry:
78
+ ```json
79
+ {
80
+ "name": "DNS Resolution Troubleshooting",
81
+ "path": "networking/dns-resolution-troubleshooting.md",
82
+ "description": "Comprehensive guide for diagnosing and resolving DNS resolution issues in Kubernetes clusters",
83
+ "tags": ["dns", "networking", "kubernetes", "troubleshooting"]
84
+ }
85
+ ```
@@ -20,3 +20,27 @@ This runbook is mainly used for `holmes investigate`
20
20
 
21
21
  Catalog specified in [catalog.json](catalog.json) contains a collection of runbooks written in markdown.
22
22
  At runtime, the LLM compares each runbook description with the user's question and returns the best-matching runbook for investigation. If no runbook matches, none is returned.
23
+
24
+ ## Generating Runbooks
25
+
26
+ To ensure all runbooks follow a consistent format and improve troubleshooting accuracy, contributors should use the standardized [runbook format prompt](runbook-format.prompt.md) when creating new runbooks.
27
+
28
+ ### Using the Runbook Format Prompt
29
+
30
+ 1. **Start with the Template**: Use `runbook-format.prompt.md` as your guide when creating new runbooks
31
+ 2. **Follow the Structure**: Ensure your runbook includes all required sections:
32
+ - **Goal**: Clear definition of issues addressed and agent mandate
33
+ - **Workflow**: Sequential diagnostic steps with detailed function descriptions
34
+ - **Synthesize Findings**: Logic for combining outputs and identifying root causes
35
+ - **Recommended Remediation Steps**: Both immediate and permanent solutions
36
+
37
+ ### Benefits of Using the Standard Format
38
+
39
+ - **Consistency**: All runbooks follow the same structure and terminology
40
+ - **AI Agent Compatibility**: Ensures runbooks are machine-readable and executable by AI agents
41
+ - **Improved Accuracy**: Standardized format reduces ambiguity and improves diagnostic success rates
42
+ - **Maintainability**: Easier to update and maintain runbooks across the project
43
+
44
+ ### Example Usage
45
+
46
+ When creating a runbook for a new issue category (e.g., storage problems, authentication failures), provide the issue description to an LLM along with the prompt template to generate a properly formatted runbook that follows the established patterns.
@@ -26,6 +26,9 @@ from holmes.plugins.toolsets.datadog.toolset_datadog_traces import (
26
26
  from holmes.plugins.toolsets.datadog.toolset_datadog_rds import (
27
27
  DatadogRDSToolset,
28
28
  )
29
+ from holmes.plugins.toolsets.datadog.toolset_datadog_general import (
30
+ DatadogGeneralToolset,
31
+ )
29
32
  from holmes.plugins.toolsets.git import GitToolset
30
33
  from holmes.plugins.toolsets.grafana.toolset_grafana import GrafanaToolset
31
34
  from holmes.plugins.toolsets.grafana.toolset_grafana_loki import GrafanaLokiToolset
@@ -82,6 +85,7 @@ def load_python_toolsets(dal: Optional[SupabaseDal]) -> List[Toolset]:
82
85
  NotionToolset(),
83
86
  KafkaToolset(),
84
87
  DatadogLogsToolset(),
88
+ DatadogGeneralToolset(),
85
89
  DatadogMetricsToolset(),
86
90
  DatadogTracesToolset(),
87
91
  DatadogRDSToolset(),
@@ -155,7 +159,7 @@ def load_toolsets_from_config(
155
159
 
156
160
  loaded_toolsets: list[Toolset] = []
157
161
  if is_old_toolset_config(toolsets):
158
- message = "Old toolset config format detected, please update to the new format: https://docs.robusta.dev/master/configuration/holmesgpt/custom_toolsets.html"
162
+ message = "Old toolset config format detected, please update to the new format: https://holmesgpt.dev/data-sources/custom-toolsets/"
159
163
  logging.warning(message)
160
164
  raise ValueError(message)
161
165
 
@@ -1,7 +1,7 @@
1
1
  toolsets:
2
2
  argocd/core:
3
3
  description: "Set of tools to get argocd metadata like list of apps, repositories, projects, etc."
4
- docs_url: "https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/argocd.html"
4
+ docs_url: "https://holmesgpt.dev/data-sources/builtin-toolsets/argocd/"
5
5
  icon_url: "https://argo-cd.readthedocs.io/en/stable/assets/logo.png"
6
6
  llm_instructions: |
7
7
  You have access to a set of ArgoCD tools for debugging Kubernetes application deployments.
@@ -118,7 +118,9 @@ class ReturnProjectAlerts(MongoDBAtlasBaseTool):
118
118
  project_id = self.toolset.config.get("project_id", "")
119
119
  return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Project Alerts ({project_id})"
120
120
 
121
- def _invoke(self, params: Any) -> StructuredToolResult:
121
+ def _invoke(
122
+ self, params: dict, user_approved: bool = False
123
+ ) -> StructuredToolResult:
122
124
  try:
123
125
  url = "https://cloud.mongodb.com/api/atlas/v2/groups/{project_id}/alerts".format(
124
126
  project_id=self.toolset.config.get("project_id")
@@ -143,7 +145,9 @@ class ReturnProjectProcesses(MongoDBAtlasBaseTool):
143
145
  project_id = self.toolset.config.get("project_id", "")
144
146
  return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Project Processes ({project_id})"
145
147
 
146
- def _invoke(self, params: Any) -> StructuredToolResult:
148
+ def _invoke(
149
+ self, params: dict, user_approved: bool = False
150
+ ) -> StructuredToolResult:
147
151
  try:
148
152
  url = "https://cloud.mongodb.com/api/atlas/v2/groups/{project_id}/processes".format(
149
153
  project_id=self.toolset.config.get("project_id")
@@ -176,7 +180,9 @@ class ReturnProjectSlowQueries(MongoDBAtlasBaseTool):
176
180
  process_id = params.get("process_id", "")
177
181
  return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Slow Queries ({process_id})"
178
182
 
179
- def _invoke(self, params: Any) -> StructuredToolResult:
183
+ def _invoke(
184
+ self, params: dict, user_approved: bool = False
185
+ ) -> StructuredToolResult:
180
186
  try:
181
187
  url = self.url.format(
182
188
  project_id=self.toolset.config.get("project_id"),
@@ -203,7 +209,9 @@ class ReturnEventsFromProject(MongoDBAtlasBaseTool):
203
209
  project_id = self.toolset.config.get("project_id", "")
204
210
  return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Project Events ({project_id})"
205
211
 
206
- def _invoke(self, params: Any) -> StructuredToolResult:
212
+ def _invoke(
213
+ self, params: dict, user_approved: bool = False
214
+ ) -> StructuredToolResult:
207
215
  params.update({"itemsPerPage": 500})
208
216
  try:
209
217
  now_utc = datetime.now(timezone.utc)
@@ -260,7 +268,9 @@ class ReturnLogsForProcessInProject(MongoDBAtlasBaseTool):
260
268
  hostname = params.get("hostName", "")
261
269
  return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Host Logs ({hostname})"
262
270
 
263
- def _invoke(self, params: Any) -> StructuredToolResult:
271
+ def _invoke(
272
+ self, params: dict, user_approved: bool = False
273
+ ) -> StructuredToolResult:
264
274
  one_hour_ago = datetime.now(timezone.utc) - timedelta(hours=1)
265
275
  try:
266
276
  url = self.url.format(
@@ -312,7 +322,9 @@ class ReturnEventTypeFromProject(MongoDBAtlasBaseTool):
312
322
  event_type = params.get("eventType", "")
313
323
  return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Event Details ({event_type})"
314
324
 
315
- def _invoke(self, params: Any) -> StructuredToolResult:
325
+ def _invoke(
326
+ self, params: dict, user_approved: bool = False
327
+ ) -> StructuredToolResult:
316
328
  try:
317
329
  url = self.url.format(projectId=self.toolset.config.get("project_id"))
318
330
 
@@ -1,7 +1,7 @@
1
1
  toolsets:
2
2
  aws/security:
3
3
  description: "Set of tools to audit AWS security"
4
- docs_url: "https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/aws.html#security"
4
+ docs_url: "https://holmesgpt.dev/data-sources/builtin-toolsets/aws/"
5
5
  icon_url: "https://upload.wikimedia.org/wikipedia/commons/9/93/Amazon_Web_Services_Logo.svg"
6
6
  tags:
7
7
  - cli
@@ -42,8 +42,12 @@ toolsets:
42
42
 
43
43
  aws/rds:
44
44
  description: "Read access to Amazon RDS resources"
45
- docs_url: "https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/aws.html#rds"
45
+ docs_url: "https://holmesgpt.dev/data-sources/builtin-toolsets/aws/"
46
46
  icon_url: "https://upload.wikimedia.org/wikipedia/commons/9/93/Amazon_Web_Services_Logo.svg"
47
+ llm_instructions: |
48
+ You have access to information on RDS resources.
49
+ Use this toolset to get extra information on RDS resources on AWS.
50
+ When investigating RDS resources ALWAYS fetch additional information if possible.
47
51
  tags:
48
52
  - cli
49
53
  prerequisites:
@@ -56,8 +60,8 @@ toolsets:
56
60
  command: "aws rds describe-events"
57
61
 
58
62
  - name: "aws_rds_describe_instance"
59
- description: "Get the configuration of a RDS instance"
60
- user_description: "Get the configuration of a RDS instance"
63
+ description: "Get the configuration of an RDS instance, its status and availability stats. Runs 'aws rds describe-db-instances' for a specific instance."
64
+ user_description: "Get the configuration of an RDS instance"
61
65
  command: "aws rds describe-db-instances --db-instance-identifier '{{ db_instance_identifier }}'"
62
66
 
63
67
  - name: "aws_rds_describe_instances"
@@ -66,7 +70,7 @@ toolsets:
66
70
  command: "aws rds describe-db-instances"
67
71
 
68
72
  - name: "aws_rds_describe_logs"
69
- description: "Describe all available logs for an AWS RDS instance."
73
+ description: "Describe all available logs for an AWS RDS instance. Runs 'aws rds describe-db-log-files' for a specific instance."
70
74
  user_description: "list available RDS logs (e.g. slow query logs)"
71
75
  command: "aws rds describe-db-log-files --db-instance-identifier '{{ db_instance_identifier }}'"
72
76
 
@@ -213,7 +213,9 @@ class AnalyzeConnectionFailures(BaseAzureSQLTool):
213
213
 
214
214
  return "\n".join(report_sections)
215
215
 
216
- def _invoke(self, params: Dict) -> StructuredToolResult:
216
+ def _invoke(
217
+ self, params: dict, user_approved: bool = False
218
+ ) -> StructuredToolResult:
217
219
  try:
218
220
  # Get configuration
219
221
  db_config = self.toolset.database_config()
@@ -151,7 +151,9 @@ class AnalyzeDatabaseConnections(BaseAzureSQLTool):
151
151
 
152
152
  return "\n".join(report_sections)
153
153
 
154
- def _invoke(self, params: Dict) -> StructuredToolResult:
154
+ def _invoke(
155
+ self, params: dict, user_approved: bool = False
156
+ ) -> StructuredToolResult:
155
157
  try:
156
158
  hours_back = params.get("hours_back", 2)
157
159
 
@@ -131,7 +131,9 @@ class AnalyzeDatabaseHealthStatus(BaseAzureSQLTool):
131
131
 
132
132
  return "\n".join(report_sections)
133
133
 
134
- def _invoke(self, params: Dict) -> StructuredToolResult:
134
+ def _invoke(
135
+ self, params: dict, user_approved: bool = False
136
+ ) -> StructuredToolResult:
135
137
  try:
136
138
  db_config = self.toolset.database_config()
137
139
  client = self.toolset.api_client()
@@ -192,7 +192,9 @@ class AnalyzeDatabasePerformance(BaseAzureSQLTool):
192
192
 
193
193
  return "\n".join(report_sections)
194
194
 
195
- def _invoke(self, params: Dict) -> StructuredToolResult:
195
+ def _invoke(
196
+ self, params: dict, user_approved: bool = False
197
+ ) -> StructuredToolResult:
196
198
  try:
197
199
  db_config = self.toolset.database_config()
198
200
  client = self.toolset.api_client()
@@ -249,7 +249,9 @@ class AnalyzeDatabaseStorage(BaseAzureSQLTool):
249
249
 
250
250
  return "\n".join(report_sections)
251
251
 
252
- def _invoke(self, params: Dict) -> StructuredToolResult:
252
+ def _invoke(
253
+ self, params: dict, user_approved: bool = False
254
+ ) -> StructuredToolResult:
253
255
  try:
254
256
  hours_back = params.get("hours_back", 24)
255
257
  top_tables = params.get("top_tables", 20)
@@ -147,7 +147,9 @@ class GetActiveAlerts(BaseAzureSQLTool):
147
147
 
148
148
  return "\n".join(report_sections)
149
149
 
150
- def _invoke(self, params: Dict) -> StructuredToolResult:
150
+ def _invoke(
151
+ self, params: dict, user_approved: bool = False
152
+ ) -> StructuredToolResult:
151
153
  try:
152
154
  db_config = self.toolset.database_config()
153
155
  api_client = self.toolset.api_client()
@@ -99,7 +99,9 @@ class GetSlowQueries(BaseAzureSQLTool):
99
99
 
100
100
  return "\n".join(report_sections)
101
101
 
102
- def _invoke(self, params: Dict) -> StructuredToolResult:
102
+ def _invoke(
103
+ self, params: dict, user_approved: bool = False
104
+ ) -> StructuredToolResult:
103
105
  try:
104
106
  top_count = params.get("top_count", 15)
105
107
  hours_back = params.get("hours_back", 2)
@@ -97,7 +97,9 @@ class GetTopCPUQueries(BaseAzureSQLTool):
97
97
 
98
98
  return "\n".join(report_sections)
99
99
 
100
- def _invoke(self, params: Dict) -> StructuredToolResult:
100
+ def _invoke(
101
+ self, params: dict, user_approved: bool = False
102
+ ) -> StructuredToolResult:
101
103
  try:
102
104
  top_count = params.get("top_count", 15)
103
105
  hours_back = params.get("hours_back", 2)
@@ -115,7 +115,9 @@ class GetTopDataIOQueries(BaseAzureSQLTool):
115
115
 
116
116
  return "\n".join(report_sections)
117
117
 
118
- def _invoke(self, params: Dict) -> StructuredToolResult:
118
+ def _invoke(
119
+ self, params: dict, user_approved: bool = False
120
+ ) -> StructuredToolResult:
119
121
  try:
120
122
  top_count = params.get("top_count", 15)
121
123
  hours_back = params.get("hours_back", 2)
@@ -107,7 +107,9 @@ class GetTopLogIOQueries(BaseAzureSQLTool):
107
107
 
108
108
  return "\n".join(report_sections)
109
109
 
110
- def _invoke(self, params: Dict) -> StructuredToolResult:
110
+ def _invoke(
111
+ self, params: dict, user_approved: bool = False
112
+ ) -> StructuredToolResult:
111
113
  try:
112
114
  top_count = params.get("top_count", 15)
113
115
  hours_back = params.get("hours_back", 2)
@@ -0,0 +1,65 @@
1
+ import argparse
2
+ from typing import Any, Optional
3
+
4
+ from holmes.plugins.toolsets.bash.common.bash_command import BashCommand
5
+ from holmes.plugins.toolsets.bash.common.config import BashExecutorConfig
6
+ from holmes.plugins.toolsets.bash.common.stringify import escape_shell_args
7
+ from holmes.plugins.toolsets.bash.argocd.constants import (
8
+ ALLOWED_ARGOCD_COMMANDS,
9
+ DENIED_ARGOCD_COMMANDS,
10
+ )
11
+ from holmes.plugins.toolsets.bash.common.validators import (
12
+ validate_command_and_operations,
13
+ )
14
+
15
+
16
class ArgocdCommand(BashCommand):
    """Bash-toolset handler for the ``argocd`` command line tool.

    The class registers an ``argocd`` sub-parser, checks parsed invocations
    against the allowed/denied Argo CD command tables, and rebuilds a
    shell-escaped command string from the parsed arguments.
    """

    def __init__(self):
        super().__init__("argocd")

    def add_parser(self, parent_parser: Any):
        """Register the ``argocd`` sub-parser on *parent_parser* and return it.

        The sub-parser takes one positional ``command`` followed by an
        ``options`` list that swallows every remaining argument.
        """
        # NOTE(review): parent_parser is presumably an argparse sub-parsers
        # action (it must expose ``add_parser``) -- confirm against the caller.
        parser = parent_parser.add_parser(
            "argocd",
            help="Argo CD Command Line Interface",
            exit_on_error=False,
        )

        # Top-level Argo CD command.
        parser.add_argument(
            "command",
            help="Argo CD command (e.g., app, cluster, proj, repo)",
        )

        # Everything after the command is collected into ``options``.
        parser.add_argument(
            "options",
            nargs=argparse.REMAINDER,
            default=[],
            help="Argo CD CLI subcommands, operations, and options",
        )
        return parser

    def validate_command(
        self, command: Any, original_command: str, config: Optional[BashExecutorConfig]
    ) -> None:
        """Check a parsed ``argocd`` invocation against the command tables.

        Delegates to ``validate_command_and_operations`` with the allowed and
        denied Argo CD command constants. Nothing is checked when *command*
        has no ``options`` attribute. *original_command* and *config* are not
        used by this implementation.
        """
        if not hasattr(command, "options"):
            return

        validate_command_and_operations(
            command.command,
            command.options,
            ALLOWED_ARGOCD_COMMANDS,
            DENIED_ARGOCD_COMMANDS,
        )

    def stringify_command(
        self, command: Any, original_command: str, config: Optional[BashExecutorConfig]
    ) -> str:
        """Rebuild the command line from the parsed *command*.

        The result is ``argocd``, the command and any options, passed through
        ``escape_shell_args`` and joined with single spaces.
        *original_command* and *config* are not used by this implementation.
        """
        tokens = ["argocd", command.command]

        if hasattr(command, "options"):
            trailing = command.options
            if trailing:
                tokens.extend(trailing)

        escaped = escape_shell_args(tokens)
        return " ".join(escaped)
61
+
62
+
63
def create_argocd_parser(parent_parser: Any):
    """Attach the Argo CD sub-parser to *parent_parser* and return it.

    Convenience wrapper that builds a fresh ``ArgocdCommand`` and forwards to
    its ``add_parser`` method.
    """
    return ArgocdCommand().add_parser(parent_parser)