holmesgpt 0.14.0a0__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of holmesgpt might be problematic. Click here for more details.

Files changed (82) hide show
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +15 -4
  3. holmes/common/env_vars.py +8 -1
  4. holmes/config.py +66 -139
  5. holmes/core/investigation.py +1 -2
  6. holmes/core/llm.py +295 -52
  7. holmes/core/models.py +2 -0
  8. holmes/core/safeguards.py +4 -4
  9. holmes/core/supabase_dal.py +14 -8
  10. holmes/core/tool_calling_llm.py +110 -102
  11. holmes/core/tools.py +260 -25
  12. holmes/core/tools_utils/data_types.py +81 -0
  13. holmes/core/tools_utils/tool_context_window_limiter.py +33 -0
  14. holmes/core/tools_utils/tool_executor.py +2 -2
  15. holmes/core/toolset_manager.py +150 -3
  16. holmes/core/transformers/__init__.py +23 -0
  17. holmes/core/transformers/base.py +62 -0
  18. holmes/core/transformers/llm_summarize.py +174 -0
  19. holmes/core/transformers/registry.py +122 -0
  20. holmes/core/transformers/transformer.py +31 -0
  21. holmes/main.py +5 -0
  22. holmes/plugins/prompts/_fetch_logs.jinja2 +10 -1
  23. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  24. holmes/plugins/toolsets/aks.yaml +64 -0
  25. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +17 -15
  26. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +8 -4
  27. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -3
  28. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -3
  29. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
  30. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -3
  31. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +4 -4
  32. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +7 -3
  33. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +7 -3
  34. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +7 -3
  35. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +7 -3
  36. holmes/plugins/toolsets/bash/bash_toolset.py +6 -6
  37. holmes/plugins/toolsets/bash/common/bash.py +7 -7
  38. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
  39. holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
  40. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
  41. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +344 -205
  42. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +189 -17
  43. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +95 -30
  44. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +10 -10
  45. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +20 -20
  46. holmes/plugins/toolsets/git.py +21 -21
  47. holmes/plugins/toolsets/grafana/common.py +2 -2
  48. holmes/plugins/toolsets/grafana/toolset_grafana.py +4 -4
  49. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +5 -4
  50. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +123 -23
  51. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +165 -307
  52. holmes/plugins/toolsets/internet/internet.py +3 -3
  53. holmes/plugins/toolsets/internet/notion.py +3 -3
  54. holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
  55. holmes/plugins/toolsets/kafka.py +18 -18
  56. holmes/plugins/toolsets/kubernetes.yaml +58 -0
  57. holmes/plugins/toolsets/kubernetes_logs.py +6 -6
  58. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  59. holmes/plugins/toolsets/logging_utils/logging_api.py +1 -1
  60. holmes/plugins/toolsets/mcp/toolset_mcp.py +4 -4
  61. holmes/plugins/toolsets/newrelic.py +5 -5
  62. holmes/plugins/toolsets/opensearch/opensearch.py +5 -5
  63. holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
  64. holmes/plugins/toolsets/opensearch/opensearch_traces.py +10 -10
  65. holmes/plugins/toolsets/prometheus/prometheus.py +841 -351
  66. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +39 -2
  67. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  68. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +6 -4
  69. holmes/plugins/toolsets/robusta/robusta.py +10 -10
  70. holmes/plugins/toolsets/runbook/runbook_fetcher.py +4 -4
  71. holmes/plugins/toolsets/servicenow/servicenow.py +6 -6
  72. holmes/plugins/toolsets/utils.py +88 -0
  73. holmes/utils/config_utils.py +91 -0
  74. holmes/utils/env.py +7 -0
  75. holmes/utils/holmes_status.py +2 -1
  76. holmes/utils/sentry_helper.py +41 -0
  77. holmes/utils/stream.py +9 -0
  78. {holmesgpt-0.14.0a0.dist-info → holmesgpt-0.14.1.dist-info}/METADATA +10 -14
  79. {holmesgpt-0.14.0a0.dist-info → holmesgpt-0.14.1.dist-info}/RECORD +82 -72
  80. {holmesgpt-0.14.0a0.dist-info → holmesgpt-0.14.1.dist-info}/LICENSE.txt +0 -0
  81. {holmesgpt-0.14.0a0.dist-info → holmesgpt-0.14.1.dist-info}/WHEEL +0 -0
  82. {holmesgpt-0.14.0a0.dist-info → holmesgpt-0.14.1.dist-info}/entry_points.txt +0 -0
@@ -5,43 +5,142 @@ Assume every application provides tempo traces.
5
5
  ## API Endpoints and Tool Mapping
6
6
 
7
7
  1. **Trace Search** (GET /api/search)
8
- - `search_traces_by_query`: Use with 'q' parameter for TraceQL queries
9
- - `search_traces_by_tags`: Use with 'tags' parameter for logfmt queries
8
+ - `tempo_search_traces_by_query`: Use with 'q' parameter for TraceQL queries
9
+ - `tempo_search_traces_by_tags`: Use with 'tags' parameter for logfmt queries
10
10
 
11
11
  2. **Trace Details** (GET /api/v2/traces/{trace_id})
12
- - `query_trace_by_id`: Retrieve full trace data
12
+ - `tempo_query_trace_by_id`: Retrieve full trace data
13
13
 
14
14
  3. **Tag Discovery**
15
- - `search_tag_names` (GET /api/v2/search/tags): List available tags
16
- - `search_tag_values` (GET /api/v2/search/tag/{tag}/values): Get values for a tag
15
+ - `tempo_search_tag_names` (GET /api/v2/search/tags): List available tags
16
+ - `tempo_search_tag_values` (GET /api/v2/search/tag/{tag}/values): Get values for a tag
17
17
 
18
18
  4. **TraceQL Metrics**
19
- - `query_metrics_instant` (GET /api/metrics/query): Single value computation
20
- - `query_metrics_range` (GET /api/metrics/query_range): Time series data
19
+ - `tempo_query_metrics_instant` (GET /api/metrics/query): Single value computation
20
+ - `tempo_query_metrics_range` (GET /api/metrics/query_range): Time series data
21
21
 
22
22
  ## Usage Workflow
23
23
 
24
24
  ### 1. Discovering Available Data
25
25
  Start by understanding what tags and values exist:
26
- - Use `search_tag_names` to discover available tags
27
- - Use `search_tag_values` to see all values for a specific tag (e.g., service names)
26
+ - Use `tempo_search_tag_names` to discover available tags
27
+ - Use `tempo_search_tag_values` to see all values for a specific tag (e.g., service names)
28
28
 
29
29
  ### 2. Searching for Traces
30
+
30
31
  **TraceQL Search (recommended):**
31
- Use `search_traces_by_query` with TraceQL syntax for powerful filtering:
32
- - Find errors: `{span.http.status_code>=400}`
33
- - Service traces: `{resource.service.name="api"}`
34
- - Slow traces: `{duration>100ms}`
35
- - Complex queries: `{resource.service.name="api" && span.http.status_code=500 && duration>1s}`
32
+ Use `tempo_search_traces_by_query` with TraceQL syntax for powerful filtering.
33
+
34
+ **TraceQL Capabilities:**
35
+ TraceQL can select traces based on the following:
36
+ - **Span and resource attributes** - Filter by any attribute on spans or resources
37
+ - **Timing and duration** - Filter by trace/span duration
38
+ - **Basic aggregates** - Use aggregate functions to compute values across spans
39
+
40
+ **Supported Aggregate Functions:**
41
+ - `count()` - Count the number of spans matching the criteria
42
+ - `avg(attribute)` - Calculate average of a numeric attribute across spans
43
+ - `min(attribute)` - Find minimum value of a numeric attribute
44
+ - `max(attribute)` - Find maximum value of a numeric attribute
45
+ - `sum(attribute)` - Sum values of a numeric attribute across spans
46
+
47
+ **Aggregate Function Usage:**
48
+ Aggregates are used with the pipe operator `|` to filter traces based on computed values across their spans.
49
+
50
+ **Aggregate Examples:**
51
+ - `{ span.http.status_code = 200 } | count() > 3` - Find traces with more than 3 spans having HTTP 200 status
52
+ - `{ } | sum(span.bytesProcessed) > 1000000000` - Find traces where total processed bytes exceed 1 GB
53
+ - `{ status = error } | by(resource.service.name) | count() > 1` - Find services with more than 1 error
54
+
55
+ **Select Function:**
56
+ - `{ status = error } | select(span.http.status_code, span.http.url)` - Select specific attributes from error spans
57
+
58
+ **TraceQL Query Structure:**
59
+ TraceQL queries follow the pattern: `{span-selectors} | aggregate`
60
+
61
+ **TraceQL Query Examples (from official docs):**
62
+
63
+ 1. **Find traces of a specific operation:**
64
+ ```
65
+ {resource.service.name = "frontend" && name = "POST /api/orders"}
66
+ ```
67
+ ```
68
+ {
69
+ resource.service.namespace = "ecommerce" &&
70
+ resource.service.name = "frontend" &&
71
+ resource.deployment.environment = "production" &&
72
+ name = "POST /api/orders"
73
+ }
74
+ ```
75
+
76
+ 2. **Find traces with a particular outcome:**
77
+ ```
78
+ {
79
+ resource.service.name="frontend" &&
80
+ name = "POST /api/orders" &&
81
+ status = error
82
+ }
83
+ ```
84
+ ```
85
+ {
86
+ resource.service.name="frontend" &&
87
+ name = "POST /api/orders" &&
88
+ span.http.status_code >= 500
89
+ }
90
+ ```
91
+
92
+ 3. **Find traces with a particular behavior:**
93
+ ```
94
+ {span.service.name="frontend" && name = "GET /api/products/{id}"} && {span.db.system="postgresql"}
95
+ ```
96
+
97
+ 4. **Find traces across environments:**
98
+ ```
99
+ { resource.deployment.environment = "production" } && { resource.deployment.environment = "staging" }
100
+ ```
101
+
102
+ 5. **Structural operators (advanced):**
103
+ ```
104
+ { resource.service.name="frontend" } >> { status = error } # >> descendant: error spans that are descendants of a frontend span
105
+ { } !< { resource.service.name = "productcatalogservice" } # !< not-parent: productcatalogservice spans that are not the parent of any span (leaf spans)
106
+ { resource.service.name = "productcatalogservice" } ~ { resource.service.name="frontend" } # Sibling spans
107
+ ```
108
+
109
+ 6. **Additional operator examples:**
110
+ ```
111
+ { span.http.method = "GET" && status = ok } && { span.http.method = "DELETE" && status != ok } # && between spansets: the trace must contain spans matching both selectors
112
+ ```
113
+
114
+ ```
115
+ { resource.deployment.environment =~ "prod-.*" && span.http.status_code = 200 } # =~ regex match
116
+ { span.http.method =~ "DELETE|GET" } # Regex match multiple values
117
+ { trace:rootName !~ ".*perf.*" } # !~ negated regex
118
+ { resource.cloud.region = "us-east-1" } || { resource.cloud.region = "us-west-1" } # || OR operator
119
+ ```
120
+
121
+ ```
122
+ { span.http.status_code >= 400 && span.http.status_code < 500 } # Client errors (4xx)
123
+ { span.http.url = "/path/of/api" } >> { span.db.name = "db-shard-001" } # >> descendant
124
+ { span.http.status_code = 200 } | select(resource.service.name) # Select specific attributes
125
+ ```
126
+
127
+ **Common Attributes to Query:**
128
+ - `resource.service.name` - Service name
129
+ - `resource.k8s.*` - Kubernetes metadata (pod.name, namespace.name, deployment.name, etc.)
130
+ - `span.http.*` - HTTP attributes (status_code, method, route, url, etc.)
131
+ - `name` - Span name
132
+ - `status` - Span status (error, ok)
133
+ - `duration` - Span duration
134
+ - `kind` - Span kind (server, client, producer, consumer, internal)
36
135
 
37
136
  **Tag-based Search (legacy):**
38
- Use `search_traces_by_tags` with logfmt format when you need min/max duration filters:
39
- - Example: `resource.service.name="api" http.status_code="500"`
137
+ Use `tempo_search_traces_by_tags` with logfmt format when you need min/max duration filters:
138
+ - Example: `service.name="api" http.status_code="500"`
40
139
  - Supports `min_duration` and `max_duration` parameters
41
140
 
42
141
  ### 3. Analyzing Specific Traces
43
142
  When you have trace IDs from search results:
44
- - Use `query_trace_by_id` to get full trace details
143
+ - Use `tempo_query_trace_by_id` to get full trace details
45
144
  - Examine spans for errors, slow operations, and bottlenecks
46
145
 
47
146
  ### 4. Computing Metrics from Traces
@@ -115,26 +214,26 @@ TraceQL metrics parse your traces in aggregate to provide RED (Rate, Error, Dura
115
214
  ```
116
215
 
117
216
  10. **Using topk modifier** - Find top 10 endpoints by request rate:
118
- ```
119
- { resource.service.name = "foo" } | rate() by (span.http.url) | topk(10)
120
- ```
217
+ ```
218
+ { resource.service.name = "foo" } | rate() by (span.http.url) | topk(10)
219
+ ```
121
220
 
122
221
  **Choosing Between Instant and Range Queries:**
123
222
 
124
- **Instant Metrics** (`query_metrics_instant`) - Returns a single aggregated value for the entire time range. Use this when:
223
+ **Instant Metrics** (`tempo_query_metrics_instant`) - Returns a single aggregated value for the entire time range. Use this when:
125
224
  - You need a total count or sum across the whole period
126
225
  - You want a single metric value (e.g., total error count, average latency)
127
226
  - You don't need to see how the metric changes over time
128
227
  - You're computing a KPI or summary statistic
129
228
 
130
- **Time Series Metrics** (`query_metrics_range`) - Returns values at regular intervals controlled by the 'step' parameter. Use this when:
229
+ **Time Series Metrics** (`tempo_query_metrics_range`) - Returns values at regular intervals controlled by the 'step' parameter. Use this when:
131
230
  - You need to graph metrics over time or analyze trends
132
231
  - You want to see patterns, spikes, or changes in metrics
133
232
  - You're troubleshooting time-based issues
134
233
  - You need to correlate metrics with specific time periods
135
234
 
136
235
  ## Special workflow for performance issues
137
- When investigating performance issues in kubernetes via traces, call fetch_tempo_traces_comparative_sample. This tool provides comprehensive analysis for identifying patterns.
236
+ When investigating performance issues in Kubernetes via traces, call `tempo_fetch_traces_comparative_sample`. This tool provides comprehensive analysis for identifying patterns.
138
237
 
139
238
  ## Important Notes
140
239
  - TraceQL is the modern query language - prefer it over tag-based search
@@ -145,3 +244,4 @@ When investigating performance issues in kubernetes via traces, call fetch_tempo
145
244
  - Use time filters (start/end) to improve query performance
146
245
  - To get information about Kubernetes resources try these first: resource.service.name, resource.k8s.pod.name, resource.k8s.namespace.name, resource.k8s.deployment.name, resource.k8s.node.name, resource.k8s.container.name
147
246
  - The TraceQL and TraceQL metrics languages are complex. If you get empty data, try to simplify your query and try again!
247
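+ - IMPORTANT: TraceQL (trace search) is not the same as 'TraceQL metrics'. Use trace-search syntax with `tempo_search_traces_by_query`, and use metrics functions such as `rate()` only with `tempo_query_metrics_instant` or `tempo_query_metrics_range`.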