holmesgpt 0.13.3a0__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of holmesgpt might be problematic. Click here for more details.

Files changed (86) hide show
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +15 -4
  3. holmes/common/env_vars.py +8 -1
  4. holmes/config.py +66 -139
  5. holmes/core/investigation.py +1 -2
  6. holmes/core/llm.py +295 -52
  7. holmes/core/models.py +2 -0
  8. holmes/core/safeguards.py +4 -4
  9. holmes/core/supabase_dal.py +14 -8
  10. holmes/core/tool_calling_llm.py +202 -177
  11. holmes/core/tools.py +260 -25
  12. holmes/core/tools_utils/data_types.py +81 -0
  13. holmes/core/tools_utils/tool_context_window_limiter.py +33 -0
  14. holmes/core/tools_utils/tool_executor.py +2 -2
  15. holmes/core/toolset_manager.py +150 -3
  16. holmes/core/tracing.py +6 -1
  17. holmes/core/transformers/__init__.py +23 -0
  18. holmes/core/transformers/base.py +62 -0
  19. holmes/core/transformers/llm_summarize.py +174 -0
  20. holmes/core/transformers/registry.py +122 -0
  21. holmes/core/transformers/transformer.py +31 -0
  22. holmes/main.py +5 -0
  23. holmes/plugins/prompts/_fetch_logs.jinja2 +10 -1
  24. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  25. holmes/plugins/toolsets/aks.yaml +64 -0
  26. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +17 -15
  27. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +8 -4
  28. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -3
  29. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -3
  30. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
  31. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -3
  32. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +4 -4
  33. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +7 -3
  34. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +7 -3
  35. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +7 -3
  36. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +7 -3
  37. holmes/plugins/toolsets/bash/bash_toolset.py +6 -6
  38. holmes/plugins/toolsets/bash/common/bash.py +7 -7
  39. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
  40. holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
  41. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
  42. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +345 -207
  43. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
  44. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +96 -32
  45. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +10 -10
  46. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +21 -22
  47. holmes/plugins/toolsets/git.py +22 -22
  48. holmes/plugins/toolsets/grafana/common.py +14 -2
  49. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +473 -0
  50. holmes/plugins/toolsets/grafana/toolset_grafana.py +4 -4
  51. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +5 -4
  52. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  53. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +662 -290
  54. holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
  55. holmes/plugins/toolsets/internet/internet.py +3 -3
  56. holmes/plugins/toolsets/internet/notion.py +3 -3
  57. holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
  58. holmes/plugins/toolsets/kafka.py +18 -18
  59. holmes/plugins/toolsets/kubernetes.yaml +58 -0
  60. holmes/plugins/toolsets/kubernetes_logs.py +6 -6
  61. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  62. holmes/plugins/toolsets/logging_utils/logging_api.py +1 -1
  63. holmes/plugins/toolsets/mcp/toolset_mcp.py +4 -4
  64. holmes/plugins/toolsets/newrelic.py +8 -8
  65. holmes/plugins/toolsets/opensearch/opensearch.py +5 -5
  66. holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
  67. holmes/plugins/toolsets/opensearch/opensearch_traces.py +10 -10
  68. holmes/plugins/toolsets/prometheus/prometheus.py +841 -351
  69. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +39 -2
  70. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  71. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +6 -4
  72. holmes/plugins/toolsets/robusta/robusta.py +10 -10
  73. holmes/plugins/toolsets/runbook/runbook_fetcher.py +4 -4
  74. holmes/plugins/toolsets/servicenow/servicenow.py +6 -6
  75. holmes/plugins/toolsets/utils.py +88 -0
  76. holmes/utils/config_utils.py +91 -0
  77. holmes/utils/env.py +7 -0
  78. holmes/utils/holmes_status.py +2 -1
  79. holmes/utils/sentry_helper.py +41 -0
  80. holmes/utils/stream.py +9 -0
  81. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/METADATA +11 -15
  82. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/RECORD +85 -75
  83. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  84. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/LICENSE.txt +0 -0
  85. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/WHEEL +0 -0
  86. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/entry_points.txt +0 -0
@@ -1,12 +1,247 @@
1
- Use Tempo when investigating latency or performance issues. Tempo provides traces information for application running on the cluster.
1
+ Grafana Tempo provides distributed tracing data through its REST API. Each tool maps directly to a specific Tempo API endpoint.
2
+
2
3
  Assume every application provides tempo traces.
3
- 1. Start by identifying an initial filter to use. This can be a pod name, a deployment name or a service name
4
- 2. Call fetch_tempo_traces_comparative_sample first when investigating performance issues via traces. This tool provides comprehensive analysis for identifying patterns. For other issues not related to performance, you can start with fetch_tempo_traces.
5
- 3. Use `fetch_tempo_traces` setting the appropriate query params
6
- - Use the min_duration filter to ensure you get traces that trigger the alert when you are investigating a performance issue
7
- - If possible, use start and end date to narrow down your search.
8
- - Use fetch_finding_by_id if you are provided with a finding/alert id. It will contain details about when the alert was triggered
9
- - Use at least one of the following argument to ensure you get relevant traces: `service_name`, `pod_name` or `deployment_name`.
10
- 4. When you have a specific trace ID to investigate, use `fetch_tempo_trace_by_id` to get detailed information about that trace.
11
- 5. Look at the duration of each span in any single trace and deduce any issues.
12
- 6. ALWAYS fetch the logs for a pod once you identify a span that is taking a long time. There may be an explanation for the slowness in the logs.
4
+
5
+ ## API Endpoints and Tool Mapping
6
+
7
+ 1. **Trace Search** (GET /api/search)
8
+ - `tempo_search_traces_by_query`: Use with 'q' parameter for TraceQL queries
9
+ - `tempo_search_traces_by_tags`: Use with 'tags' parameter for logfmt queries
10
+
11
+ 2. **Trace Details** (GET /api/v2/traces/{trace_id})
12
+ - `tempo_query_trace_by_id`: Retrieve full trace data
13
+
14
+ 3. **Tag Discovery**
15
+ - `tempo_search_tag_names` (GET /api/v2/search/tags): List available tags
16
+ - `tempo_search_tag_values` (GET /api/v2/search/tag/{tag}/values): Get values for a tag
17
+
18
+ 4. **TraceQL Metrics**
19
+ - `tempo_query_metrics_instant` (GET /api/metrics/query): Single value computation
20
+ - `tempo_query_metrics_range` (GET /api/metrics/query_range): Time series data
21
+
22
+ ## Usage Workflow
23
+
24
+ ### 1. Discovering Available Data
25
+ Start by understanding what tags and values exist:
26
+ - Use `tempo_search_tag_names` to discover available tags
27
+ - Use `tempo_search_tag_values` to see all values for a specific tag (e.g., service names)
28
+
29
+ ### 2. Searching for Traces
30
+
31
+ **TraceQL Search (recommended):**
32
+ Use `tempo_search_traces_by_query` with TraceQL syntax for powerful filtering.
33
+
34
+ **TraceQL Capabilities:**
35
+ TraceQL can select traces based on the following:
36
+ - **Span and resource attributes** - Filter by any attribute on spans or resources
37
+ - **Timing and duration** - Filter by trace/span duration
38
+ - **Basic aggregates** - Use aggregate functions to compute values across spans
39
+
40
+ **Supported Aggregate Functions:**
41
+ - `count()` - Count the number of spans matching the criteria
42
+ - `avg(attribute)` - Calculate average of a numeric attribute across spans
43
+ - `min(attribute)` - Find minimum value of a numeric attribute
44
+ - `max(attribute)` - Find maximum value of a numeric attribute
45
+ - `sum(attribute)` - Sum values of a numeric attribute across spans
46
+
47
+ **Aggregate Function Usage:**
48
+ Aggregates are used with the pipe operator `|` to filter traces based on computed values across their spans.
49
+
50
+ **Aggregate Examples:**
51
+ - `{ span.http.status_code = 200 } | count() > 3` - Find traces with more than 3 spans having HTTP 200 status
52
+ - `{ } | sum(span.bytesProcessed) > 1000000000` - Find traces where total processed bytes exceed 1 GB
53
+ - `{ status = error } | by(resource.service.name) | count() > 1` - Find services with more than 1 error
54
+
55
+ **Select Function:**
56
+ - `{ status = error } | select(span.http.status_code, span.http.url)` - Select specific attributes from error spans
57
+
58
+ **TraceQL Query Structure:**
59
+ TraceQL queries follow the pattern: `{span-selectors} | aggregate`
60
+
61
+ **TraceQL Query Examples (from official docs):**
62
+
63
+ 1. **Find traces of a specific operation:**
64
+ ```
65
+ {resource.service.name = "frontend" && name = "POST /api/orders"}
66
+ ```
67
+ ```
68
+ {
69
+ resource.service.namespace = "ecommerce" &&
70
+ resource.service.name = "frontend" &&
71
+ resource.deployment.environment = "production" &&
72
+ name = "POST /api/orders"
73
+ }
74
+ ```
75
+
76
+ 2. **Find traces with a particular outcome:**
77
+ ```
78
+ {
79
+ resource.service.name="frontend" &&
80
+ name = "POST /api/orders" &&
81
+ status = error
82
+ }
83
+ ```
84
+ ```
85
+ {
86
+ resource.service.name="frontend" &&
87
+ name = "POST /api/orders" &&
88
+ span.http.status_code >= 500
89
+ }
90
+ ```
91
+
92
+ 3. **Find traces with a particular behavior:**
93
+ ```
94
+ {span.service.name="frontend" && name = "GET /api/products/{id}"} && {span.db.system="postgresql"}
95
+ ```
96
+
97
+ 4. **Find traces that passed through both production and staging environments:**
98
+ ```
99
+ { resource.deployment.environment = "production" } && { resource.deployment.environment = "staging" }
100
+ ```
101
+
102
+ 5. **Structural operators (advanced):**
103
+ ```
104
+ { resource.service.name="frontend" } >> { status = error } # Error spans that are descendants of frontend spans
105
+ { } !< { resource.service.name = "productcatalogservice" } # Traces without productcatalog as child
106
+ { resource.service.name = "productcatalogservice" } ~ { resource.service.name="frontend" } # Sibling spans
107
+ ```
108
+
109
+ 6. **Additional operator examples:**
110
+ ```
111
+ { span.http.method = "GET" && status = ok } && { span.http.method = "DELETE" && status != ok } # && for multiple conditions
112
+ ```
113
+
114
+ ```
115
+ { resource.deployment.environment =~ "prod-.*" && span.http.status_code = 200 } # =~ regex match
116
+ { span.http.method =~ "DELETE|GET" } # Regex match multiple values
117
+ { trace:rootName !~ ".*perf.*" } # !~ negated regex
118
+ { resource.cloud.region = "us-east-1" } || { resource.cloud.region = "us-west-1" } # || OR operator
119
+ ```
120
+
121
+ ```
122
+ { span.http.status_code >= 400 && span.http.status_code < 500 } # Client errors (4xx)
123
+ { span.http.url = "/path/of/api" } >> { span.db.name = "db-shard-001" } # >> descendant
124
+ { span.http.status_code = 200 } | select(resource.service.name) # Select specific attributes
125
+ ```
126
+
127
+ **Common Attributes to Query:**
128
+ - `resource.service.name` - Service name
129
+ - `resource.k8s.*` - Kubernetes metadata (pod.name, namespace.name, deployment.name, etc.)
130
+ - `span.http.*` - HTTP attributes (status_code, method, route, url, etc.)
131
+ - `name` - Span name
132
+ - `status` - Span status (error, ok)
133
+ - `duration` - Span duration
134
+ - `kind` - Span kind (server, client, producer, consumer, internal)
135
+
136
+ **Tag-based Search (legacy):**
137
+ Use `tempo_search_traces_by_tags` with logfmt format when you need min/max duration filters:
138
+ - Example: `service.name="api" http.status_code="500"`
139
+ - Supports `min_duration` and `max_duration` parameters
140
+
141
+ ### 3. Analyzing Specific Traces
142
+ When you have trace IDs from search results:
143
+ - Use `tempo_query_trace_by_id` to get full trace details
144
+ - Examine spans for errors, slow operations, and bottlenecks
145
+
146
+ ### 4. Computing Metrics from Traces
147
+ **TraceQL metrics** compute aggregated metrics from your trace data, helping you answer critical questions like:
148
+ - How many database calls across all systems are downstream of your application?
149
+ - What services beneath a given endpoint are failing?
150
+ - What services beneath an endpoint are slow?
151
+
152
+ TraceQL metrics parse your traces in aggregate to provide RED (Rate, Error, Duration) metrics from trace data.
153
+
154
+ **Supported Functions:**
155
+ - `rate` - Calculate rate of spans/traces
156
+ - `count_over_time` - Count spans/traces over time
157
+ - `sum_over_time` - Sum span attributes
158
+ - `avg_over_time` - Average of span attributes
159
+ - `max_over_time` - Maximum value over time
160
+ - `min_over_time` - Minimum value over time
161
+ - `quantile_over_time` - Calculate quantiles
162
+ - `histogram_over_time` - Generate histogram data
163
+ - `compare` - Compare metrics between time periods
164
+
165
+ **Modifiers:**
166
+ - `topk` - Return top N results
167
+ - `bottomk` - Return bottom N results
168
+
169
+ **TraceQL Metrics Query Examples:**
170
+
171
+ 1. **rate** - Calculate the error rate for a service, grouped by HTTP route:
172
+ ```
173
+ { resource.service.name = "foo" && status = error } | rate() by (span.http.route)
174
+ ```
175
+
176
+ 2. **count_over_time** - Count spans by HTTP status code:
177
+ ```
178
+ { name = "GET /:endpoint" } | count_over_time() by (span.http.status_code)
179
+ ```
180
+
181
+ 3. **sum_over_time** - Sum HTTP response sizes by service:
182
+ ```
183
+ { name = "GET /:endpoint" } | sum_over_time(span.http.response.size) by (resource.service.name)
184
+ ```
185
+
186
+ 4. **avg_over_time** - Average duration by HTTP status code:
187
+ ```
188
+ { name = "GET /:endpoint" } | avg_over_time(duration) by (span.http.status_code)
189
+ ```
190
+
191
+ 5. **max_over_time** - Maximum response size by HTTP target:
192
+ ```
193
+ { name = "GET /:endpoint" } | max_over_time(span.http.response.size) by (span.http.target)
194
+ ```
195
+
196
+ 6. **min_over_time** - Minimum duration by HTTP target:
197
+ ```
198
+ { name = "GET /:endpoint" } | min_over_time(duration) by (span.http.target)
199
+ ```
200
+
201
+ 7. **quantile_over_time** - Calculate multiple percentiles (99th, 90th, 50th) with exemplars:
202
+ ```
203
+ { span:name = "GET /:endpoint" } | quantile_over_time(duration, .99, .9, .5) by (span.http.target) with (exemplars=true)
204
+ ```
205
+
206
+ 8. **histogram_over_time** - Build duration histogram grouped by custom attribute:
207
+ ```
208
+ { name = "GET /:endpoint" } | histogram_over_time(duration) by (span.foo)
209
+ ```
210
+
211
+ 9. **compare** - Compare error spans against baseline (10 attributes):
212
+ ```
213
+ { resource.service.name="a" && span.http.path="/myapi" } | compare({status=error}, 10)
214
+ ```
215
+
216
+ 10. **Using topk modifier** - Find top 10 endpoints by request rate:
217
+ ```
218
+ { resource.service.name = "foo" } | rate() by (span.http.url) | topk(10)
219
+ ```
220
+
221
+ **Choosing Between Instant and Range Queries:**
222
+
223
+ **Instant Metrics** (`tempo_query_metrics_instant`) - Returns a single aggregated value for the entire time range. Use this when:
224
+ - You need a total count or sum across the whole period
225
+ - You want a single metric value (e.g., total error count, average latency)
226
+ - You don't need to see how the metric changes over time
227
+ - You're computing a KPI or summary statistic
228
+
229
+ **Time Series Metrics** (`tempo_query_metrics_range`) - Returns values at regular intervals controlled by the 'step' parameter. Use this when:
230
+ - You need to graph metrics over time or analyze trends
231
+ - You want to see patterns, spikes, or changes in metrics
232
+ - You're troubleshooting time-based issues
233
+ - You need to correlate metrics with specific time periods
234
+
235
+ ## Special workflow for performance issues
236
+ When investigating performance issues in Kubernetes via traces, call `tempo_fetch_traces_comparative_sample`. This tool provides comprehensive analysis for identifying patterns.
237
+
238
+ ## Important Notes
239
+ - TraceQL is the modern query language - prefer it over tag-based search
240
+ - TraceQL metrics are computed from trace data, not traditional Prometheus metrics
241
+ - TraceQL metrics is an experimental feature that computes RED (Rate, Error, Duration) metrics from trace data
242
+ - Common attributes to use in queries: resource.service.name, span.http.route, span.http.status_code, span.http.target, status, name, duration
243
+ - All timestamps can be given as Unix epoch seconds or in RFC3339 format
244
+ - Use time filters (start/end) to improve query performance
245
+ - To get information about Kubernetes resources, try these attributes first: resource.service.name, resource.k8s.pod.name, resource.k8s.namespace.name, resource.k8s.deployment.name, resource.k8s.node.name, resource.k8s.container.name
246
+ - The TraceQL and TraceQL metrics languages are complex. If you get empty data, try to simplify your query and try again!
247
+ - IMPORTANT: TraceQL is not the same as 'TraceQL metrics' - Make sure you use the correct syntax and functions