holmesgpt 0.13.2__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188) hide show
  1. holmes/__init__.py +3 -5
  2. holmes/clients/robusta_client.py +20 -6
  3. holmes/common/env_vars.py +58 -3
  4. holmes/common/openshift.py +1 -1
  5. holmes/config.py +123 -148
  6. holmes/core/conversations.py +71 -15
  7. holmes/core/feedback.py +191 -0
  8. holmes/core/investigation.py +31 -39
  9. holmes/core/investigation_structured_output.py +3 -3
  10. holmes/core/issue.py +1 -1
  11. holmes/core/llm.py +508 -88
  12. holmes/core/models.py +108 -4
  13. holmes/core/openai_formatting.py +14 -1
  14. holmes/core/prompt.py +48 -3
  15. holmes/core/runbooks.py +1 -0
  16. holmes/core/safeguards.py +8 -6
  17. holmes/core/supabase_dal.py +295 -100
  18. holmes/core/tool_calling_llm.py +489 -428
  19. holmes/core/tools.py +325 -56
  20. holmes/core/tools_utils/token_counting.py +21 -0
  21. holmes/core/tools_utils/tool_context_window_limiter.py +40 -0
  22. holmes/core/tools_utils/tool_executor.py +0 -13
  23. holmes/core/tools_utils/toolset_utils.py +1 -0
  24. holmes/core/toolset_manager.py +191 -5
  25. holmes/core/tracing.py +19 -3
  26. holmes/core/transformers/__init__.py +23 -0
  27. holmes/core/transformers/base.py +63 -0
  28. holmes/core/transformers/llm_summarize.py +175 -0
  29. holmes/core/transformers/registry.py +123 -0
  30. holmes/core/transformers/transformer.py +32 -0
  31. holmes/core/truncation/compaction.py +94 -0
  32. holmes/core/truncation/dal_truncation_utils.py +23 -0
  33. holmes/core/truncation/input_context_window_limiter.py +219 -0
  34. holmes/interactive.py +228 -31
  35. holmes/main.py +23 -40
  36. holmes/plugins/interfaces.py +2 -1
  37. holmes/plugins/prompts/__init__.py +2 -1
  38. holmes/plugins/prompts/_fetch_logs.jinja2 +31 -6
  39. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +24 -12
  41. holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
  42. holmes/plugins/prompts/conversation_history_compaction.jinja2 +89 -0
  43. holmes/plugins/prompts/generic_ask.jinja2 +0 -4
  44. holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -1
  45. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -1
  46. holmes/plugins/prompts/generic_investigation.jinja2 +0 -1
  47. holmes/plugins/prompts/investigation_procedure.jinja2 +50 -1
  48. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -1
  49. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -1
  50. holmes/plugins/runbooks/__init__.py +145 -17
  51. holmes/plugins/runbooks/catalog.json +2 -0
  52. holmes/plugins/sources/github/__init__.py +4 -2
  53. holmes/plugins/sources/prometheus/models.py +1 -0
  54. holmes/plugins/toolsets/__init__.py +44 -27
  55. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  56. holmes/plugins/toolsets/aks.yaml +64 -0
  57. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +38 -47
  58. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
  59. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  60. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
  61. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
  62. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
  63. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -13
  64. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +15 -12
  65. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +15 -12
  66. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +11 -11
  67. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +11 -9
  68. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +15 -12
  69. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +15 -15
  70. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +11 -8
  71. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +11 -8
  72. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +11 -8
  73. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +11 -8
  74. holmes/plugins/toolsets/azure_sql/utils.py +0 -32
  75. holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
  76. holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
  77. holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
  78. holmes/plugins/toolsets/bash/bash_toolset.py +11 -15
  79. holmes/plugins/toolsets/bash/common/bash.py +23 -13
  80. holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
  81. holmes/plugins/toolsets/bash/common/stringify.py +1 -1
  82. holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
  83. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
  84. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
  85. holmes/plugins/toolsets/bash/parse_command.py +12 -13
  86. holmes/plugins/toolsets/cilium.yaml +284 -0
  87. holmes/plugins/toolsets/connectivity_check.py +124 -0
  88. holmes/plugins/toolsets/coralogix/api.py +132 -119
  89. holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
  90. holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
  91. holmes/plugins/toolsets/coralogix/utils.py +15 -79
  92. holmes/plugins/toolsets/datadog/datadog_api.py +525 -26
  93. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +55 -11
  94. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
  95. holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
  96. holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
  97. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
  98. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +417 -241
  99. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +234 -214
  100. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +167 -79
  101. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +374 -363
  102. holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
  103. holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
  104. holmes/plugins/toolsets/elasticsearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  105. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist.py +78 -0
  106. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist_instructions.jinja2 +223 -0
  107. holmes/plugins/toolsets/git.py +54 -50
  108. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
  109. holmes/plugins/toolsets/grafana/common.py +13 -29
  110. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +455 -0
  111. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +25 -0
  112. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +191 -0
  113. holmes/plugins/toolsets/grafana/loki_api.py +4 -0
  114. holmes/plugins/toolsets/grafana/toolset_grafana.py +293 -89
  115. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +49 -0
  116. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  117. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +820 -292
  118. holmes/plugins/toolsets/grafana/trace_parser.py +4 -3
  119. holmes/plugins/toolsets/internet/internet.py +15 -16
  120. holmes/plugins/toolsets/internet/notion.py +9 -11
  121. holmes/plugins/toolsets/investigator/core_investigation.py +44 -36
  122. holmes/plugins/toolsets/investigator/model.py +3 -1
  123. holmes/plugins/toolsets/json_filter_mixin.py +134 -0
  124. holmes/plugins/toolsets/kafka.py +36 -42
  125. holmes/plugins/toolsets/kubernetes.yaml +317 -113
  126. holmes/plugins/toolsets/kubernetes_logs.py +9 -9
  127. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  128. holmes/plugins/toolsets/logging_utils/logging_api.py +94 -8
  129. holmes/plugins/toolsets/mcp/toolset_mcp.py +218 -64
  130. holmes/plugins/toolsets/newrelic/new_relic_api.py +165 -0
  131. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +65 -0
  132. holmes/plugins/toolsets/newrelic/newrelic.py +320 -0
  133. holmes/plugins/toolsets/openshift.yaml +283 -0
  134. holmes/plugins/toolsets/prometheus/prometheus.py +1202 -421
  135. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +54 -5
  136. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  137. holmes/plugins/toolsets/rabbitmq/api.py +23 -4
  138. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +13 -14
  139. holmes/plugins/toolsets/robusta/robusta.py +239 -68
  140. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  141. holmes/plugins/toolsets/runbook/runbook_fetcher.py +157 -27
  142. holmes/plugins/toolsets/service_discovery.py +1 -1
  143. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  144. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  145. holmes/plugins/toolsets/utils.py +88 -0
  146. holmes/utils/config_utils.py +91 -0
  147. holmes/utils/connection_utils.py +31 -0
  148. holmes/utils/console/result.py +10 -0
  149. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  150. holmes/utils/env.py +7 -0
  151. holmes/utils/file_utils.py +2 -1
  152. holmes/utils/global_instructions.py +60 -11
  153. holmes/utils/holmes_status.py +6 -4
  154. holmes/utils/holmes_sync_toolsets.py +0 -2
  155. holmes/utils/krr_utils.py +188 -0
  156. holmes/utils/log.py +15 -0
  157. holmes/utils/markdown_utils.py +2 -3
  158. holmes/utils/memory_limit.py +58 -0
  159. holmes/utils/sentry_helper.py +64 -0
  160. holmes/utils/stream.py +69 -8
  161. holmes/utils/tags.py +4 -3
  162. holmes/version.py +37 -15
  163. holmesgpt-0.18.4.dist-info/LICENSE +178 -0
  164. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +35 -31
  165. holmesgpt-0.18.4.dist-info/RECORD +258 -0
  166. holmes/core/performance_timing.py +0 -72
  167. holmes/plugins/toolsets/aws.yaml +0 -80
  168. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -112
  169. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
  170. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -739
  171. holmes/plugins/toolsets/grafana/grafana_api.py +0 -42
  172. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  173. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  174. holmes/plugins/toolsets/newrelic.py +0 -231
  175. holmes/plugins/toolsets/opensearch/opensearch.py +0 -257
  176. holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
  177. holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -218
  178. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
  179. holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
  180. holmes/plugins/toolsets/servicenow/install.md +0 -37
  181. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  182. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  183. holmes/utils/keygen_utils.py +0 -6
  184. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  185. holmesgpt-0.13.2.dist-info/RECORD +0 -234
  186. /holmes/plugins/toolsets/{opensearch → newrelic}/__init__.py +0 -0
  187. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
  188. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
@@ -8,6 +8,10 @@ toolsets:
8
8
  prerequisites:
9
9
  - command: "kubectl version --client"
10
10
 
11
+ # Note: Many tools in this toolset use transformers with llm_summarize
12
+ # to automatically summarize large kubectl outputs when a fast model is configured.
13
+ # This reduces context window usage while preserving key information for debugging.
14
+
11
15
  tools:
12
16
  - name: "kubectl_describe"
13
17
  description: >
@@ -17,6 +21,20 @@ toolsets:
17
21
  - 'describe pod xyz-123'
18
22
  - 'show service xyz-123 in namespace my-ns'
19
23
  command: "kubectl describe {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %}"
24
+ transformers:
25
+ - name: llm_summarize
26
+ config:
27
+ input_threshold: 1000
28
+ prompt: |
29
+ Summarize this kubectl describe output focusing on:
30
+ - What needs attention or immediate action
31
+ - Resource status and health indicators
32
+ - Any errors, warnings, or non-standard states
33
+ - Key configuration details that could affect functionality
34
+ - When possible, mention exact field names so the user can grep for specific details
35
+ - Be concise: aim for ≤ 50% of the original length; avoid repeating defaults/healthy/unchanged details
36
+ - Prefer aggregates and counts; list only outliers and actionable items
37
+ - Keep grep-friendly: include exact field names/values that matter
20
38
 
21
39
  - name: "kubectl_get_by_name"
22
40
  description: "Run `kubectl get <kind> <name> --show-labels`"
@@ -25,10 +43,36 @@ toolsets:
25
43
  - name: "kubectl_get_by_kind_in_namespace"
26
44
  description: "Run `kubectl get <kind> -n <namespace> --show-labels` to get all resources of a given type in namespace"
27
45
  command: "kubectl get --show-labels -o wide {{ kind }} -n {{namespace}}"
46
+ transformers:
47
+ - name: llm_summarize
48
+ config:
49
+ input_threshold: 1000
50
+ prompt: |
51
+ Summarize this kubectl output focusing on:
52
+ - What needs attention or immediate action
53
+ - Group similar resources into aggregate descriptions
54
+ - Make sure to mention outliers, errors, and non-standard states
55
+ - List healthy resources as aggregate descriptions
56
+ - When listing unhealthy resources, also try to use aggregate descriptions when possible
57
+ - When possible, mention exact keywords so the user can rerun the command with | grep <keyword> and drill down
58
+ - Be concise and avoid expansion: target ≤ 50% of input size; prefer counts + outliers over full listings
28
59
 
29
60
  - name: "kubectl_get_by_kind_in_cluster"
30
61
  description: "Run `kubectl get -A <kind> --show-labels` to get all resources of a given type in the cluster"
31
62
  command: "kubectl get -A --show-labels -o wide {{ kind }}"
63
+ transformers:
64
+ - name: llm_summarize
65
+ config:
66
+ input_threshold: 1000
67
+ prompt: |
68
+ Summarize this kubectl output focusing on:
69
+ - What needs attention or immediate action
70
+ - Group similar resources into a single line and description
71
+ - Make sure to mention outliers, errors, and non-standard states
72
+ - List healthy resources as aggregate descriptions
73
+ - When listing unhealthy resources, also try to use aggregate descriptions when possible
74
+ - When possible, mention exact keywords so the user can rerun the command with | grep <keyword> and drill down on the parts they care about
75
+ - Strive for ≤ 50% of the original size; keep results compact and grep-friendly (one line per aggregate)
32
76
 
33
77
  - name: "kubectl_find_resource"
34
78
  description: "Run `kubectl get {{ kind }} -A --show-labels | grep {{ keyword }}` to find a resource where you know a substring of the name, IP, namespace, or labels"
@@ -42,142 +86,302 @@ toolsets:
42
86
  description: "Retrieve the events for a specific Kubernetes resource. `resource_type` can be any kubernetes resource type: 'pod', 'service', 'deployment', 'job', 'node', etc."
43
87
  command: "kubectl events --for {{resource_type}}/{{ resource_name }}{% if namespace %} -n {{ namespace }}{% endif %}"
44
88
 
45
- - name: "kubectl_memory_requests_all_namespaces"
46
- description: "Fetch and display memory requests for all pods across all namespaces in MiB, summing requests across multiple containers where applicable and handling binary, decimal, and millibyte units correctly."
47
- command: |
48
- kubectl get pods --all-namespaces -o custom-columns="NAMESPACE:.metadata.namespace,NAME:.metadata.name,MEMORY_REQUEST:.spec.containers[*].resources.requests.memory" --no-headers | \
49
- awk '
50
- function convert_to_mib(value) {
51
- if (value ~ /^[0-9]+e[0-9]+$/) return (value + 0) / (1024 * 1024); # Scientific notation
52
- if (value ~ /m$/) return (value + 0) / (1024^2 * 1000); # Millibytes (m)
53
- if (value ~ /Ei$/) return (value + 0) * 1024^6 / (1024^2); # Binary units
54
- if (value ~ /Pi$/) return (value + 0) * 1024^5 / (1024^2);
55
- if (value ~ /Ti$/) return (value + 0) * 1024^4 / (1024^2);
56
- if (value ~ /Gi$/) return (value + 0) * 1024^3 / (1024^2);
57
- if (value ~ /Mi$/) return (value + 0);
58
- if (value ~ /Ki$/) return (value + 0) / 1024;
59
- if (value ~ /E$/) return (value + 0) * 1000^6 / (1024^2); # Decimal units
60
- if (value ~ /P$/) return (value + 0) * 1000^5 / (1024^2);
61
- if (value ~ /T$/) return (value + 0) * 1000^4 / (1024^2);
62
- if (value ~ /G$/) return (value + 0) * 1000^3 / (1024^2);
63
- if (value ~ /M$/) return (value + 0) * 1000^2 / (1024^2);
64
- if (value ~ /k$/) return (value + 0) * 1000 / (1024^2);
65
- return (value + 0) / (1024 * 1024); # Default: bytes
66
- }
67
- function sum_memory(requests) {
68
- gsub(/^[ \t]+|[ \t]+$/, "", requests);
69
- if (requests == "" || requests == "<none>") return 0;
70
- split(requests, arr, ",");
71
- total = 0;
72
- for (i in arr) {
73
- if (arr[i] != "<none>") total += convert_to_mib(arr[i]);
74
- }
75
- return total;
76
- }
77
- {
78
- namespace = $1;
79
- name = $2;
80
- requests = $3;
81
- for (i=4; i<=NF; i++) {
82
- requests = requests " " $i;
83
- }
84
- print namespace, name, sum_memory(requests) " Mi";
85
- }' | sort -k3 -nr
86
-
87
- - name: "kubectl_memory_requests_namespace"
88
- description: "Fetch and display memory requests for all pods in a specified namespace in MiB, summing requests across multiple containers where applicable and handling binary, decimal, and millibyte units correctly."
89
- command: |
90
- kubectl get pods -n {{ namespace }} -o custom-columns="NAMESPACE:.metadata.namespace,NAME:.metadata.name,MEMORY_REQUEST:.spec.containers[*].resources.requests.memory" --no-headers | \
91
- awk '
92
- function convert_to_mib(value) {
93
- if (value ~ /^[0-9]+e[0-9]+$/) return (value + 0) / (1024 * 1024); # Scientific notation
94
- if (value ~ /m$/) return (value + 0) / (1024^2 * 1000); # Millibytes (m)
95
- if (value ~ /Ei$/) return (value + 0) * 1024^6 / (1024^2); # Binary units
96
- if (value ~ /Pi$/) return (value + 0) * 1024^5 / (1024^2);
97
- if (value ~ /Ti$/) return (value + 0) * 1024^4 / (1024^2);
98
- if (value ~ /Gi$/) return (value + 0) * 1024^3 / (1024^2);
99
- if (value ~ /Mi$/) return (value + 0);
100
- if (value ~ /Ki$/) return (value + 0) / 1024;
101
- if (value ~ /E$/) return (value + 0) * 1000^6 / (1024^2); # Decimal units
102
- if (value ~ /P$/) return (value + 0) * 1000^5 / (1024^2);
103
- if (value ~ /T$/) return (value + 0) * 1000^4 / (1024^2);
104
- if (value ~ /G$/) return (value + 0) * 1000^3 / (1024^2);
105
- if (value ~ /M$/) return (value + 0) * 1000^2 / (1024^2);
106
- if (value ~ /k$/) return (value + 0) * 1000 / (1024^2);
107
- return (value + 0) / (1024 * 1024); # Default: bytes
108
- }
109
- function sum_memory(requests) {
110
- gsub(/^[ \t]+|[ \t]+$/, "", requests);
111
- if (requests == "" || requests == "<none>") return 0;
112
- split(requests, arr, ",");
113
- total = 0;
114
- for (i in arr) {
115
- if (arr[i] != "<none>") total += convert_to_mib(arr[i]);
116
- }
117
- return total;
118
- }
119
- {
120
- namespace = $1;
121
- name = $2;
122
- requests = $3;
123
- for (i=4; i<=NF; i++) {
124
- requests = requests " " $i;
125
- }
126
- print namespace, name, sum_memory(requests) " Mi";
127
- }' | sort -k3 -nr
128
-
129
89
  - name: "kubernetes_jq_query"
130
90
  user_description: "Query Kubernetes Resources: kubectl get {{kind}} --all-namespaces -o json | jq -r {{jq_expr}}"
131
91
  description: >
132
- Use kubectl to get json for all resources of a specific kind pipe the results to jq to filter them. Do not worry about escaping the jq_expr it will be done by the system on an unescaped expression that you give. e.g. give an expression like .items[] | .spec.containers[].image | select(test("^gcr.io/") | not)
133
- command: kubectl get {{ kind }} --all-namespaces -o json | jq -r {{ jq_expr }}
92
+ Use kubectl to get json for all resources of a specific kind and filter with jq.
93
+ IMPORTANT: The 'kind' parameter must be the plural form of the resource type
94
+ (e.g., use "pods" not "pod", "services" not "service", "jobs" not "job").
95
+ Do not worry about escaping the jq_expr - it will be done by the system.
96
+ Example: .items[] | .spec.containers[].image | select(test("^gcr.io/") | not)
97
+ script: |
98
+ #!/bin/bash
99
+
100
+ echo "Executing paginated query for {{ kind }} resources..."
101
+ echo "Expression: {{ jq_expr }}"
102
+ echo "---"
103
+
104
+ # Get the API path for the resource kind using kubectl
105
+ API_INFO=$(kubectl api-resources --no-headers | grep "^{{ kind }} " | head -1)
106
+
107
+ if [ -z "$API_INFO" ]; then
108
+ echo "Error: Unable to find resource kind '{{ kind }}'" >&2
109
+ exit 1
110
+ fi
111
+
112
+ # Extract NAMESPACED value
113
+ if [[ "$API_INFO" == *" true "* ]]; then
114
+ NAMESPACED="true"
115
+ PREFIX=$(echo "$API_INFO" | sed 's/ true .*//')
116
+ elif [[ "$API_INFO" == *" false "* ]]; then
117
+ NAMESPACED="false"
118
+ PREFIX=$(echo "$API_INFO" | sed 's/ false .*//')
119
+ else
120
+ echo "Error: Could not find NAMESPACED field (true/false) in API info" >&2
121
+ exit 1
122
+ fi
123
+
124
+ # Trim trailing spaces from prefix and collapse internal spaces
125
+ PREFIX=$(echo "$PREFIX" | sed 's/ *$//' | sed 's/ */ /g')
126
+
127
+ IFS=' ' read -ra PREFIX_FIELDS <<< "$PREFIX"
128
+ FIELD_COUNT=0
129
+ for field in "${PREFIX_FIELDS[@]}"; do
130
+ ((FIELD_COUNT++))
131
+ done
132
+
133
+ RESOURCE_NAME="${PREFIX_FIELDS[0]}"
134
+ if [ $FIELD_COUNT -ge 2 ]; then
135
+ API_VERSION="${PREFIX_FIELDS[$((FIELD_COUNT - 1))]}"
136
+ else
137
+ API_VERSION=""
138
+ fi
139
+
140
+ if [ -z "$API_VERSION" ] || [ -z "$RESOURCE_NAME" ]; then
141
+ echo "Error: Unable to parse API info for resource kind '{{ kind }}'" >&2
142
+ exit 1
143
+ fi
144
+
145
+ # Build API path
146
+ if [[ "$API_VERSION" == "v1" ]]; then
147
+ API_PATH="/api/v1/${RESOURCE_NAME}"
148
+ else
149
+ API_PATH="/apis/${API_VERSION}/${RESOURCE_NAME}"
150
+ fi
151
+
152
+ # Process resources in chunks using API pagination
153
+ LIMIT=500 # Process 500 items at a time
154
+ CONTINUE=""
155
+ PROCESSED=0
156
+ TOTAL_MATCHES=0
157
+
158
+ while true; do
159
+ # Build API query with limit and continue token
160
+ if [ -z "$CONTINUE" ]; then
161
+ # First request - get from all namespaces
162
+ QUERY="${API_PATH}?limit=${LIMIT}"
163
+ else
164
+ # Subsequent requests with continue token
165
+ QUERY="${API_PATH}?limit=${LIMIT}&continue=${CONTINUE}"
166
+ fi
167
+
168
+ OUTPUT=$(kubectl get --raw "$QUERY" 2>&1)
169
+ exit_code=$?
170
+
171
+ if [ $exit_code -ne 0 ]; then
172
+ echo "Error: $OUTPUT" >&2
173
+ exit $exit_code
174
+ fi
175
+
176
+ ITEMS_COUNT=$(echo "$OUTPUT" | jq '.items | length')
177
+
178
+ MATCHES=$(echo "$OUTPUT" | jq -r {{ jq_expr }} 2>&1)
179
+ jq_exit=$?
180
+ if [ $jq_exit -ne 0 ]; then
181
+ echo "Error: jq expression failed: $MATCHES" >&2
182
+ exit $jq_exit
183
+ fi
184
+
185
+ if [ "$ITEMS_COUNT" -gt 0 ]; then
186
+ if [ -n "$MATCHES" ]; then
187
+ echo "$MATCHES"
188
+ MATCH_COUNT=$(echo "$MATCHES" | grep -c . || true)
189
+ TOTAL_MATCHES=$((TOTAL_MATCHES + MATCH_COUNT))
190
+ fi
191
+
192
+ PROCESSED=$((PROCESSED + ITEMS_COUNT))
193
+
194
+ echo "Processed $PROCESSED items, found $TOTAL_MATCHES matches so far..." >&2
195
+ fi
196
+
197
+ CONTINUE=$(echo "$OUTPUT" | jq -r '.metadata.continue // empty')
198
+
199
+ if [ -z "$CONTINUE" ]; then
200
+ break
201
+ fi
202
+ done
203
+
204
+ echo "---" >&2
205
+ echo "Total items processed: $PROCESSED, matches found: $TOTAL_MATCHES" >&2
206
+ transformers:
207
+ - name: llm_summarize
208
+ config:
209
+ input_threshold: 10000
210
+ prompt: |
211
+ Summarize this jq query output focusing on:
212
+ - Key patterns and commonalities in the data
213
+ - Notable outliers, anomalies, or items that need attention
214
+ - Group similar results into aggregate descriptions when possible
215
+ - Highlight any empty results, null values, or missing data
216
+ - When applicable, mention specific resource names, namespaces, or values that stand out
217
+ - Organize findings in a structured way that helps with troubleshooting
218
+ - Be concise: aim for ≤ 50% of the original text; prioritize aggregates and actionable outliers
219
+ - Include grep-ready keys/values; avoid repeating entire objects or unchanged defaults
220
+
221
+ - name: "kubernetes_tabular_query"
222
+ user_description: "Tabular output of specific fields: kubectl get {{kind}} --all-namespaces -o custom-columns={{columns}}"
223
+ description: >
224
+ Extract specific fields from Kubernetes resources in tabular format with optional filtering.
225
+ Memory-efficient way to query large clusters - only requested fields are transmitted.
226
+ Column specification format: HEADER:FIELD_PATH,HEADER2:FIELD_PATH2,...
227
+
228
+ Optional filtering parameter:
229
+ - filter_pattern: Pattern to match in any column (supports grep regex)
230
+
231
+ Examples:
232
+ - Basic fields: NAME:.metadata.name,STATUS:.status.phase,NODE:.spec.nodeName
233
+ - Filter by status: filter_pattern="Running"
234
+ - Filter out lines with <none>: filter_pattern="-v '<none>'"
235
+ - Nested fields: CREATED:.metadata.creationTimestamp,IMAGE:.spec.containers[0].image
236
+ - Array fields: LABELS:.metadata.labels,PORTS:.spec.ports[*].port
237
+
238
+ Note: Output is tabular text with column headers. Filtering works on the entire line.
239
+ Note: not allowed characters are: ' / ; and newline
240
+ command: kubectl get {{ kind }} --all-namespaces -o custom-columns='{{ columns }}'{% if filter_pattern %} | (head -n 1; tail -n +2 | grep {{ filter_pattern }}){% endif %}
241
+ transformers:
242
+ - name: llm_summarize
243
+ config:
244
+ input_threshold: 10000
245
+ prompt: |
246
+ Summarize this tabular output focusing on:
247
+ - Key patterns and trends in the data
248
+ - Resources that need attention (errors, pending, failures)
249
+ - Group similar items into aggregate descriptions
250
+ - Highlight outliers or unusual values
251
+ - Mention specific resource names only for problematic items
252
+ - Provide counts and distributions where relevant
253
+ - Be concise: aim for ≤ 50% of the original size
254
+ - Keep output actionable and focused on anomalies
134
255
 
135
256
  - name: "kubernetes_count"
136
257
  user_description: "Count Kubernetes Resources: kubectl get {{kind}} --all-namespaces -o json | jq -c -r {{ jq_expr }}"
137
258
  description: >
138
259
  Use kubectl to get apply a jq filter and then count the results.
139
260
  Use this whenever asked to count kubernetes resources.
261
+ IMPORTANT: The 'kind' parameter must be the plural form of the resource type
262
+ (e.g., use "pods" not "pod", "services" not "service", "jobs" not "job").
140
263
  Use select() to filter objects before extracting properties, e.g. .items[] | select(.metadata.namespace == "test-1") | .metadata.name
141
264
  Do not worry about escaping the jq_expr it will be done by the system on an unescaped expression that you give.
142
265
  e.g. give an expression like .items[] | select(.spec.containers[].image | test("^gcr.io/") | not) | .metadata.name
143
266
  script: |
267
+ #!/bin/bash
268
+
144
269
  echo "Command executed: kubectl get {{ kind }} --all-namespaces -o json | jq -c -r {{ jq_expr }}"
145
270
  echo "---"
146
271
 
147
- # Execute the command and capture both stdout and stderr separately
148
- temp_error=$(mktemp)
149
- matches=$(kubectl get {{ kind }} --all-namespaces -o json 2>"$temp_error" | jq -c -r {{ jq_expr }} 2>>"$temp_error")
150
- exit_code=$?
151
- error_output=$(cat "$temp_error")
152
- rm -f "$temp_error"
153
-
154
- if [ $exit_code -ne 0 ]; then
155
- echo "Error executing command (exit code: $exit_code):"
156
- echo "$error_output"
157
- exit $exit_code
272
+ # Get the API path for the resource kind
273
+ API_INFO=$(kubectl api-resources --no-headers | grep "^{{ kind }} " | head -1)
274
+
275
+ if [ -z "$API_INFO" ]; then
276
+ echo "Error: Unable to find resource kind '{{ kind }}'" >&2
277
+ exit 1
278
+ fi
279
+
280
+ if [[ "$API_INFO" == *" true "* ]]; then
281
+ NAMESPACED="true"
282
+ PREFIX=$(echo "$API_INFO" | sed 's/ true .*//')
283
+ elif [[ "$API_INFO" == *" false "* ]]; then
284
+ NAMESPACED="false"
285
+ PREFIX=$(echo "$API_INFO" | sed 's/ false .*//')
158
286
  else
159
- # Show any stderr warnings even if command succeeded
160
- if [ -n "$error_output" ]; then
161
- echo "Warnings/stderr output:"
162
- echo "$error_output"
163
- echo "---"
164
- fi
287
+ echo "Error: Could not find NAMESPACED field (true/false) in API info" >&2
288
+ exit 1
289
+ fi
290
+
291
+ PREFIX=$(echo "$PREFIX" | sed 's/ *$//' | sed 's/ */ /g')
292
+
293
+ IFS=' ' read -ra PREFIX_FIELDS <<< "$PREFIX"
294
+ FIELD_COUNT=0
295
+ for field in "${PREFIX_FIELDS[@]}"; do
296
+ ((FIELD_COUNT++))
297
+ done
298
+ RESOURCE_NAME="${PREFIX_FIELDS[0]}"
299
+
300
+ if [ $FIELD_COUNT -ge 2 ]; then
301
+ API_VERSION="${PREFIX_FIELDS[$((FIELD_COUNT - 1))]}"
302
+ else
303
+ API_VERSION=""
304
+ fi
305
+
306
+ if [ -z "$API_VERSION" ] || [ -z "$RESOURCE_NAME" ]; then
307
+ echo "Error: Unable to parse API info for resource kind '{{ kind }}'" >&2
308
+ exit 1
309
+ fi
310
+
311
+ # Build API path
312
+ if [[ "$API_VERSION" == "v1" ]]; then
313
+ API_PATH="/api/v1/${RESOURCE_NAME}"
314
+ else
315
+ API_PATH="/apis/${API_VERSION}/${RESOURCE_NAME}"
316
+ fi
165
317
 
166
- # Filter out empty lines for accurate count
167
- filtered_matches=$(echo "$matches" | grep -v '^$' | grep -v '^null$')
168
- if [ -z "$filtered_matches" ]; then
169
- count=0
318
+ # Process resources in chunks using API pagination
319
+ LIMIT=500
320
+ CONTINUE=""
321
+ ALL_MATCHES=""
322
+ BATCH_NUM=0
323
+ TOTAL_PROCESSED=0
324
+
325
+ while true; do
326
+ BATCH_NUM=$((BATCH_NUM + 1))
327
+
328
+ if [ -z "$CONTINUE" ]; then
329
+ QUERY="${API_PATH}?limit=${LIMIT}"
170
330
  else
171
- count=$(echo "$filtered_matches" | wc -l)
331
+ QUERY="${API_PATH}?limit=${LIMIT}&continue=${CONTINUE}"
172
332
  fi
173
- preview=$(echo "$filtered_matches" | head -n 10 | cut -c 1-200 | nl)
174
333
 
175
- echo "$count results"
176
- echo "---"
177
- echo "A *preview* of results is shown below (up to 10 results, up to 200 chars):"
178
- echo "$preview"
334
+ OUTPUT=$(kubectl get --raw "$QUERY" 2>&1)
335
+ exit_code=$?
336
+
337
+ if [ $exit_code -ne 0 ]; then
338
+ echo "Error for query $QUERY: $OUTPUT" >&2
339
+ exit $exit_code
340
+ fi
341
+
342
+ ITEMS_COUNT=$(echo "$OUTPUT" | jq '.items | length')
343
+ TOTAL_PROCESSED=$((TOTAL_PROCESSED + ITEMS_COUNT))
344
+
345
+ BATCH_MATCHES=$(echo "$OUTPUT" | jq -c -r {{ jq_expr }} 2>&1)
346
+ jq_exit=$?
347
+ if [ $jq_exit -ne 0 ]; then
348
+ echo "Error: jq expression failed: $BATCH_MATCHES" >&2
349
+ exit $jq_exit
350
+ fi
351
+
352
+ if [ -n "$BATCH_MATCHES" ]; then
353
+ if [ -z "$ALL_MATCHES" ]; then
354
+ ALL_MATCHES="$BATCH_MATCHES"
355
+ else
356
+ ALL_MATCHES="$ALL_MATCHES"$'\n'"$BATCH_MATCHES"
357
+ fi
358
+ fi
359
+
360
+ CONTINUE=$(echo "$OUTPUT" | jq -r '.metadata.continue // empty')
361
+ if [ -z "$CONTINUE" ]; then
362
+ break
363
+ fi
364
+
365
+ echo "Processed batch $BATCH_NUM ($TOTAL_PROCESSED items so far)..." >&2
366
+ done
367
+
368
+ # Now process the collected matches
369
+ filtered_matches=$(echo "$ALL_MATCHES" | grep -v '^$' | grep -v '^null$')
370
+ if [ -z "$filtered_matches" ]; then
371
+ count=0
372
+ preview=""
373
+ else
374
+ count=$(echo "$filtered_matches" | wc -l)
375
+ preview=$(echo "$filtered_matches" | head -n 10 | cut -c 1-200 | nl)
179
376
  fi
180
377
 
378
+ echo "$count results"
379
+ echo "---"
380
+ echo "A *preview* of results is shown below (up to 10 results, up to 200 chars):"
381
+ echo "$preview"
382
+ echo "---"
383
+ echo "Total items processed: $TOTAL_PROCESSED" >&2
384
+
181
385
  # NOTE: this is only possible for probes with a healthz endpoint - we do this to avoid giving the LLM generic
182
386
  # http GET capabilities which are more powerful than we want to expose
183
387
  #- name: "check_liveness_probe"
@@ -3,27 +3,27 @@ import re
3
3
  import subprocess
4
4
  from concurrent.futures import ThreadPoolExecutor, as_completed
5
5
  from datetime import datetime, timezone
6
- from typing import Optional, List, Tuple, Set
6
+ from typing import List, Optional, Set, Tuple
7
+
7
8
  from pydantic import BaseModel
8
9
 
9
10
  from holmes.common.env_vars import KUBERNETES_LOGS_TIMEOUT_SECONDS
10
11
  from holmes.core.tools import (
11
12
  StaticPrerequisite,
12
13
  StructuredToolResult,
13
- ToolResultStatus,
14
+ StructuredToolResultStatus,
14
15
  ToolsetTag,
15
16
  )
16
17
  from holmes.plugins.toolsets.logging_utils.logging_api import (
18
+ DEFAULT_TIME_SPAN_SECONDS,
17
19
  BasePodLoggingToolset,
18
20
  FetchPodLogsParams,
19
21
  LoggingCapability,
20
22
  LoggingConfig,
21
23
  PodLoggingTool,
22
- DEFAULT_TIME_SPAN_SECONDS,
23
24
  )
24
25
  from holmes.plugins.toolsets.utils import process_timestamps_to_int, to_unix_ms
25
26
 
26
-
27
27
  # match ISO 8601 format (YYYY-MM-DDTHH:MM:SS[.fffffffff]Z) or (YYYY-MM-DDTHH:MM:SS[.fffffffff]+/-XX:XX)
28
28
  timestamp_pattern = re.compile(
29
29
  r"^(?P<ts>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2}))"
@@ -140,7 +140,7 @@ class KubernetesLogsToolset(BasePodLoggingToolset):
140
140
  # Ensure both results are not None (they should always be set by the loop)
141
141
  if current_logs_result is None or previous_logs_result is None:
142
142
  return StructuredToolResult(
143
- status=ToolResultStatus.ERROR,
143
+ status=StructuredToolResultStatus.ERROR,
144
144
  error="Internal error: Failed to fetch logs",
145
145
  params=params.model_dump(),
146
146
  )
@@ -162,7 +162,7 @@ class KubernetesLogsToolset(BasePodLoggingToolset):
162
162
  ):
163
163
  # Both commands failed - return error from current logs
164
164
  return StructuredToolResult(
165
- status=ToolResultStatus.ERROR,
165
+ status=StructuredToolResultStatus.ERROR,
166
166
  error=current_logs_result.error,
167
167
  params=params.model_dump(),
168
168
  return_code=return_code,
@@ -206,7 +206,7 @@ class KubernetesLogsToolset(BasePodLoggingToolset):
206
206
  if len(filtered_logs) == 0:
207
207
  # Return NO_DATA status when there are no logs
208
208
  return StructuredToolResult(
209
- status=ToolResultStatus.NO_DATA,
209
+ status=StructuredToolResultStatus.NO_DATA,
210
210
  data="\n".join(
211
211
  metadata_lines
212
212
  ), # Still include metadata for context
@@ -218,7 +218,7 @@ class KubernetesLogsToolset(BasePodLoggingToolset):
218
218
  response_data = formatted_logs + "\n" + "\n".join(metadata_lines)
219
219
 
220
220
  return StructuredToolResult(
221
- status=ToolResultStatus.SUCCESS,
221
+ status=StructuredToolResultStatus.SUCCESS,
222
222
  data=response_data,
223
223
  params=params.model_dump(),
224
224
  return_code=return_code,
@@ -226,7 +226,7 @@ class KubernetesLogsToolset(BasePodLoggingToolset):
226
226
  except Exception as e:
227
227
  logging.exception(f"Error fetching logs for pod {params.pod_name}")
228
228
  return StructuredToolResult(
229
- status=ToolResultStatus.ERROR,
229
+ status=StructuredToolResultStatus.ERROR,
230
230
  error=f"Error fetching logs: {str(e)}",
231
231
  params=params.model_dump(),
232
232
  )
@@ -8,6 +8,10 @@ toolsets:
8
8
  prerequisites:
9
9
  - command: "kubectl version --client"
10
10
 
11
+ # Note: Log tools use transformers with llm_summarize to automatically
12
+ # summarize large log outputs when a fast model is configured. This helps
13
+ # focus on errors, patterns, and key information while reducing context usage.
14
+
11
15
  tools:
12
16
  - name: "kubectl_previous_logs"
13
17
  description: "Run `kubectl logs --previous` on a single Kubernetes pod. Used to fetch logs for a pod that crashed and see logs from before the crash. Never give a deployment name or a resource that is not a pod."
@@ -24,10 +28,38 @@ toolsets:
24
28
  - name: "kubectl_logs"
25
29
  description: "Run `kubectl logs` on a single Kubernetes pod. Never give a deployment name or a resource that is not a pod."
26
30
  command: "kubectl logs {{pod_name}} -n {{ namespace }}"
31
+ transformers:
32
+ - name: llm_summarize
33
+ config:
34
+ input_threshold: 1000
35
+ prompt: |
36
+ Summarize these pod logs focusing on:
37
+ - Errors, exceptions, and warning messages
38
+ - Recent activity patterns and trends
39
+ - Any authentication, connection, or startup issues
40
+ - Performance indicators (response times, throughput)
41
+ - Group similar log entries together
42
+ - When possible, mention exact error codes or keywords for easier searching
43
+ - Be concise: aim for ≤ 50% of the original text; prioritize aggregates and actionable outliers
44
+ - Include grep-ready keys/values; avoid repeating entire logs or unchanged defaults
27
45
 
28
46
  - name: "kubectl_logs_all_containers"
29
47
  description: "Run `kubectl logs` on all containers within a single Kubernetes pod."
30
48
  command: "kubectl logs {{pod_name}} -n {{ namespace }} --all-containers"
49
+ transformers:
50
+ - name: llm_summarize
51
+ config:
52
+ input_threshold: 1000
53
+ prompt: |
54
+ Summarize these multi-container pod logs focusing on:
55
+ - Errors, exceptions, and warning messages by container
56
+ - Inter-container communication patterns
57
+ - Any authentication, connection, or startup issues
58
+ - Performance indicators and resource usage patterns
59
+ - Group similar log entries together by container
60
+ - When possible, mention exact error codes or keywords for easier searching
61
+ - Strive for ≤ 50% of the original size; keep results compact and grep-friendly (one line per aggregate)
62
+ - Prioritize aggregates and actionable outliers over comprehensive details
31
63
 
32
64
  - name: "kubectl_container_logs"
33
65
  description: "Run `kubectl logs` on a single container within a Kubernetes pod. This is to get the logs of a specific container in a multi-container pod."