holmesgpt 0.13.2__py3-none-any.whl → 0.18.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- holmes/__init__.py +3 -5
- holmes/clients/robusta_client.py +20 -6
- holmes/common/env_vars.py +58 -3
- holmes/common/openshift.py +1 -1
- holmes/config.py +123 -148
- holmes/core/conversations.py +71 -15
- holmes/core/feedback.py +191 -0
- holmes/core/investigation.py +31 -39
- holmes/core/investigation_structured_output.py +3 -3
- holmes/core/issue.py +1 -1
- holmes/core/llm.py +508 -88
- holmes/core/models.py +108 -4
- holmes/core/openai_formatting.py +14 -1
- holmes/core/prompt.py +48 -3
- holmes/core/runbooks.py +1 -0
- holmes/core/safeguards.py +8 -6
- holmes/core/supabase_dal.py +295 -100
- holmes/core/tool_calling_llm.py +489 -428
- holmes/core/tools.py +325 -56
- holmes/core/tools_utils/token_counting.py +21 -0
- holmes/core/tools_utils/tool_context_window_limiter.py +40 -0
- holmes/core/tools_utils/tool_executor.py +0 -13
- holmes/core/tools_utils/toolset_utils.py +1 -0
- holmes/core/toolset_manager.py +191 -5
- holmes/core/tracing.py +19 -3
- holmes/core/transformers/__init__.py +23 -0
- holmes/core/transformers/base.py +63 -0
- holmes/core/transformers/llm_summarize.py +175 -0
- holmes/core/transformers/registry.py +123 -0
- holmes/core/transformers/transformer.py +32 -0
- holmes/core/truncation/compaction.py +94 -0
- holmes/core/truncation/dal_truncation_utils.py +23 -0
- holmes/core/truncation/input_context_window_limiter.py +219 -0
- holmes/interactive.py +228 -31
- holmes/main.py +23 -40
- holmes/plugins/interfaces.py +2 -1
- holmes/plugins/prompts/__init__.py +2 -1
- holmes/plugins/prompts/_fetch_logs.jinja2 +31 -6
- holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
- holmes/plugins/prompts/_runbook_instructions.jinja2 +24 -12
- holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
- holmes/plugins/prompts/conversation_history_compaction.jinja2 +89 -0
- holmes/plugins/prompts/generic_ask.jinja2 +0 -4
- holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -1
- holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -1
- holmes/plugins/prompts/generic_investigation.jinja2 +0 -1
- holmes/plugins/prompts/investigation_procedure.jinja2 +50 -1
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -1
- holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -1
- holmes/plugins/runbooks/__init__.py +145 -17
- holmes/plugins/runbooks/catalog.json +2 -0
- holmes/plugins/sources/github/__init__.py +4 -2
- holmes/plugins/sources/prometheus/models.py +1 -0
- holmes/plugins/toolsets/__init__.py +44 -27
- holmes/plugins/toolsets/aks-node-health.yaml +46 -0
- holmes/plugins/toolsets/aks.yaml +64 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +38 -47
- holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
- holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
- holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
- holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
- holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -13
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +15 -12
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +15 -12
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +11 -11
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +11 -9
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +15 -12
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +15 -15
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +11 -8
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +11 -8
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +11 -8
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +11 -8
- holmes/plugins/toolsets/azure_sql/utils.py +0 -32
- holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
- holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
- holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
- holmes/plugins/toolsets/bash/bash_toolset.py +11 -15
- holmes/plugins/toolsets/bash/common/bash.py +23 -13
- holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
- holmes/plugins/toolsets/bash/common/stringify.py +1 -1
- holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
- holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
- holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
- holmes/plugins/toolsets/bash/parse_command.py +12 -13
- holmes/plugins/toolsets/cilium.yaml +284 -0
- holmes/plugins/toolsets/connectivity_check.py +124 -0
- holmes/plugins/toolsets/coralogix/api.py +132 -119
- holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
- holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
- holmes/plugins/toolsets/coralogix/utils.py +15 -79
- holmes/plugins/toolsets/datadog/datadog_api.py +525 -26
- holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +55 -11
- holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
- holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
- holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
- holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +417 -241
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +234 -214
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +167 -79
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +374 -363
- holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
- holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
- holmes/plugins/toolsets/elasticsearch/opensearch_ppl_query_docs.jinja2 +1616 -0
- holmes/plugins/toolsets/elasticsearch/opensearch_query_assist.py +78 -0
- holmes/plugins/toolsets/elasticsearch/opensearch_query_assist_instructions.jinja2 +223 -0
- holmes/plugins/toolsets/git.py +54 -50
- holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
- holmes/plugins/toolsets/grafana/common.py +13 -29
- holmes/plugins/toolsets/grafana/grafana_tempo_api.py +455 -0
- holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +25 -0
- holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +191 -0
- holmes/plugins/toolsets/grafana/loki_api.py +4 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +293 -89
- holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +49 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +820 -292
- holmes/plugins/toolsets/grafana/trace_parser.py +4 -3
- holmes/plugins/toolsets/internet/internet.py +15 -16
- holmes/plugins/toolsets/internet/notion.py +9 -11
- holmes/plugins/toolsets/investigator/core_investigation.py +44 -36
- holmes/plugins/toolsets/investigator/model.py +3 -1
- holmes/plugins/toolsets/json_filter_mixin.py +134 -0
- holmes/plugins/toolsets/kafka.py +36 -42
- holmes/plugins/toolsets/kubernetes.yaml +317 -113
- holmes/plugins/toolsets/kubernetes_logs.py +9 -9
- holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
- holmes/plugins/toolsets/logging_utils/logging_api.py +94 -8
- holmes/plugins/toolsets/mcp/toolset_mcp.py +218 -64
- holmes/plugins/toolsets/newrelic/new_relic_api.py +165 -0
- holmes/plugins/toolsets/newrelic/newrelic.jinja2 +65 -0
- holmes/plugins/toolsets/newrelic/newrelic.py +320 -0
- holmes/plugins/toolsets/openshift.yaml +283 -0
- holmes/plugins/toolsets/prometheus/prometheus.py +1202 -421
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +54 -5
- holmes/plugins/toolsets/prometheus/utils.py +28 -0
- holmes/plugins/toolsets/rabbitmq/api.py +23 -4
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +13 -14
- holmes/plugins/toolsets/robusta/robusta.py +239 -68
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +157 -27
- holmes/plugins/toolsets/service_discovery.py +1 -1
- holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
- holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
- holmes/plugins/toolsets/utils.py +88 -0
- holmes/utils/config_utils.py +91 -0
- holmes/utils/connection_utils.py +31 -0
- holmes/utils/console/result.py +10 -0
- holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
- holmes/utils/env.py +7 -0
- holmes/utils/file_utils.py +2 -1
- holmes/utils/global_instructions.py +60 -11
- holmes/utils/holmes_status.py +6 -4
- holmes/utils/holmes_sync_toolsets.py +0 -2
- holmes/utils/krr_utils.py +188 -0
- holmes/utils/log.py +15 -0
- holmes/utils/markdown_utils.py +2 -3
- holmes/utils/memory_limit.py +58 -0
- holmes/utils/sentry_helper.py +64 -0
- holmes/utils/stream.py +69 -8
- holmes/utils/tags.py +4 -3
- holmes/version.py +37 -15
- holmesgpt-0.18.4.dist-info/LICENSE +178 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +35 -31
- holmesgpt-0.18.4.dist-info/RECORD +258 -0
- holmes/core/performance_timing.py +0 -72
- holmes/plugins/toolsets/aws.yaml +0 -80
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -112
- holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -739
- holmes/plugins/toolsets/grafana/grafana_api.py +0 -42
- holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
- holmes/plugins/toolsets/newrelic.py +0 -231
- holmes/plugins/toolsets/opensearch/opensearch.py +0 -257
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -218
- holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
- holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
- holmes/plugins/toolsets/servicenow/install.md +0 -37
- holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
- holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
- holmes/utils/keygen_utils.py +0 -6
- holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
- holmesgpt-0.13.2.dist-info/RECORD +0 -234
- /holmes/plugins/toolsets/{opensearch → newrelic}/__init__.py +0 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
---
|
|
2
|
+
## **IMPORTANT: Handling Grafana Dashboard URLs**
|
|
3
|
+
|
|
4
|
+
**When the user provides a Grafana dashboard URL (e.g., http://some-domain.com/d/09ec8aa1e996d6ffcd6817bbaff4db1b/kubernetes-api-server):**
|
|
5
|
+
|
|
6
|
+
1. **DO NOT use the internet/fetch_webpage tool** - Grafana URLs should always be handled by the Grafana dashboards toolset
|
|
7
|
+
2. **Extract the dashboard parameters from the URL:**
|
|
8
|
+
- Pattern: `/d/{uid}/{dashboard-name}`
|
|
9
|
+
- Example: `/d/09ec8aa1e996d6ffcd6817bbaff4db1b/kubernetes-api-server` → UID is `09ec8aa1e996d6ffcd6817bbaff4db1b`
|
|
10
|
+
3. **Use `grafana_get_dashboard_by_uid` with the extracted UID** to fetch the dashboard definition
|
|
11
|
+
4. **Extract queries from the dashboard panels** and execute them with the appropriate toolset (Prometheus, Loki, etc.)
|
|
12
|
+
|
|
13
|
+
### **Example Workflow:**
|
|
14
|
+
User: "Look at this graph: http://localhost:3000/d/abc123/my-dashboard?from=now-1h&to=now"
|
|
15
|
+
↓
|
|
16
|
+
1. Recognize this is a Grafana URL (contains /d/)
|
|
17
|
+
2. Extract UID: abc123
|
|
18
|
+
3. Call grafana_get_dashboard_by_uid(uid="abc123")
|
|
19
|
+
4. Analyze dashboard panels and their queries
|
|
20
|
+
5. Execute relevant queries using the time range from the URL (from=now-1h, to=now)
|
|
21
|
+
|
|
22
|
+
**Key Point:** Always prefer the Grafana toolset for any URL pointing to a Grafana instance. This gives you access to the dashboard structure, panel queries, and metadata - not just HTML content.
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
## **Instruction: Prometheus Dashboard Query Execution**
|
|
26
|
+
|
|
27
|
+
**When running Prometheus queries from Grafana dashboards:**
|
|
28
|
+
|
|
29
|
+
### **Key Rules:**
|
|
30
|
+
1. **Always run as range queries** (not instant queries) to match dashboard behavior
|
|
31
|
+
2. **Use dashboard's time range** from JSON: `"time": {"from": "now-1h", "to": "now"}`
|
|
32
|
+
3. **Substitute variables:**
|
|
33
|
+
- `$__rate_interval` → `1m` (1-hour range) or `5m` (3+ hour range)
|
|
34
|
+
- `$cluster` → omit if empty
|
|
35
|
+
4. **Copy query exactly** including `ceil()`, `sort_desc()`, etc.
|
|
36
|
+
|
|
37
|
+
### **Example:**
|
|
38
|
+
```
|
|
39
|
+
Dashboard: "now-1h" + query with [$__rate_interval]
|
|
40
|
+
↓
|
|
41
|
+
Execute: start="-3600" + query with [1m]
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### **Ignore:**
|
|
45
|
+
- `"refresh": "10s"` (UI setting, not query-related)
|
|
46
|
+
|
|
47
|
+
**This ensures you see the same time series data and historical spikes that users see on their dashboards.**
|
|
48
|
+
|
|
49
|
+
---
|
|
@@ -1,12 +1,247 @@
|
|
|
1
|
-
|
|
1
|
+
Grafana Tempo provides distributed tracing data through its REST API. Each tool maps directly to a specific Tempo API endpoint.
|
|
2
|
+
|
|
2
3
|
Assume every application provides tempo traces.
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
4
|
+
|
|
5
|
+
## API Endpoints and Tool Mapping
|
|
6
|
+
|
|
7
|
+
1. **Trace Search** (GET /api/search)
|
|
8
|
+
- `tempo_search_traces_by_query`: Use with 'q' parameter for TraceQL queries
|
|
9
|
+
- `tempo_search_traces_by_tags`: Use with 'tags' parameter for logfmt queries
|
|
10
|
+
|
|
11
|
+
2. **Trace Details** (GET /api/v2/traces/{trace_id})
|
|
12
|
+
- `tempo_query_trace_by_id`: Retrieve full trace data
|
|
13
|
+
|
|
14
|
+
3. **Tag Discovery**
|
|
15
|
+
- `tempo_search_tag_names` (GET /api/v2/search/tags): List available tags
|
|
16
|
+
- `tempo_search_tag_values` (GET /api/v2/search/tag/{tag}/values): Get values for a tag
|
|
17
|
+
|
|
18
|
+
4. **TraceQL Metrics**
|
|
19
|
+
- `tempo_query_metrics_instant` (GET /api/metrics/query): Single value computation
|
|
20
|
+
- `tempo_query_metrics_range` (GET /api/metrics/query_range): Time series data
|
|
21
|
+
|
|
22
|
+
## Usage Workflow
|
|
23
|
+
|
|
24
|
+
### 1. Discovering Available Data
|
|
25
|
+
Start by understanding what tags and values exist:
|
|
26
|
+
- Use `tempo_search_tag_names` to discover available tags
|
|
27
|
+
- Use `tempo_search_tag_values` to see all values for a specific tag (e.g., service names)
|
|
28
|
+
|
|
29
|
+
### 2. Searching for Traces
|
|
30
|
+
|
|
31
|
+
**TraceQL Search (recommended):**
|
|
32
|
+
Use `tempo_search_traces_by_query` with TraceQL syntax for powerful filtering.
|
|
33
|
+
|
|
34
|
+
**TraceQL Capabilities:**
|
|
35
|
+
TraceQL can select traces based on the following:
|
|
36
|
+
- **Span and resource attributes** - Filter by any attribute on spans or resources
|
|
37
|
+
- **Timing and duration** - Filter by trace/span duration
|
|
38
|
+
- **Basic aggregates** - Use aggregate functions to compute values across spans
|
|
39
|
+
|
|
40
|
+
**Supported Aggregate Functions:**
|
|
41
|
+
- `count()` - Count the number of spans matching the criteria
|
|
42
|
+
- `avg(attribute)` - Calculate average of a numeric attribute across spans
|
|
43
|
+
- `min(attribute)` - Find minimum value of a numeric attribute
|
|
44
|
+
- `max(attribute)` - Find maximum value of a numeric attribute
|
|
45
|
+
- `sum(attribute)` - Sum values of a numeric attribute across spans
|
|
46
|
+
|
|
47
|
+
**Aggregate Function Usage:**
|
|
48
|
+
Aggregates are used with the pipe operator `|` to filter traces based on computed values across their spans.
|
|
49
|
+
|
|
50
|
+
**Aggregate Examples:**
|
|
51
|
+
- `{ span.http.status_code = 200 } | count() > 3` - Find traces with more than 3 spans having HTTP 200 status
|
|
52
|
+
- `{ } | sum(span.bytesProcessed) > 1000000000` - Find traces where total processed bytes exceed 1 GB
|
|
53
|
+
- `{ status = error } | by(resource.service.name) | count() > 1` - Find services with more than 1 error
|
|
54
|
+
|
|
55
|
+
**Select Function:**
|
|
56
|
+
- `{ status = error } | select(span.http.status_code, span.http.url)` - Select specific attributes from error spans
|
|
57
|
+
|
|
58
|
+
**TraceQL Query Structure:**
|
|
59
|
+
TraceQL queries follow the pattern: `{span-selectors} | aggregate`
|
|
60
|
+
|
|
61
|
+
**TraceQL Query Examples (from official docs):**
|
|
62
|
+
|
|
63
|
+
1. **Find traces of a specific operation:**
|
|
64
|
+
```
|
|
65
|
+
{resource.service.name = "frontend" && name = "POST /api/orders"}
|
|
66
|
+
```
|
|
67
|
+
```
|
|
68
|
+
{
|
|
69
|
+
resource.service.namespace = "ecommerce" &&
|
|
70
|
+
resource.service.name = "frontend" &&
|
|
71
|
+
resource.deployment.environment = "production" &&
|
|
72
|
+
name = "POST /api/orders"
|
|
73
|
+
}
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
2. **Find traces with a particular outcome:**
|
|
77
|
+
```
|
|
78
|
+
{
|
|
79
|
+
resource.service.name="frontend" &&
|
|
80
|
+
name = "POST /api/orders" &&
|
|
81
|
+
status = error
|
|
82
|
+
}
|
|
83
|
+
```
|
|
84
|
+
```
|
|
85
|
+
{
|
|
86
|
+
resource.service.name="frontend" &&
|
|
87
|
+
name = "POST /api/orders" &&
|
|
88
|
+
span.http.status_code >= 500
|
|
89
|
+
}
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
3. **Find traces with a particular behavior:**
|
|
93
|
+
```
|
|
94
|
+
{span.service.name="frontend" && name = "GET /api/products/{id}"} && {span.db.system="postgresql"}
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
4. **Find traces across environments:**
|
|
98
|
+
```
|
|
99
|
+
{ resource.deployment.environment = "production" } && { resource.deployment.environment = "staging" }
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
5. **Structural operators (advanced):**
|
|
103
|
+
```
|
|
104
|
+
{ resource.service.name="frontend" } >> { status = error } # Error spans that are descendants of frontend spans
|
|
105
|
+
{ } !< { resource.service.name = "productcatalogservice" } # Traces without productcatalog as child
|
|
106
|
+
{ resource.service.name = "productcatalogservice" } ~ { resource.service.name="frontend" } # Sibling spans
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
6. **Additional operator examples:**
|
|
110
|
+
```
|
|
111
|
+
{ span.http.method = "GET" && status = ok } && { span.http.method = "DELETE" && status != ok } # && for multiple conditions
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
```
|
|
115
|
+
{ resource.deployment.environment =~ "prod-.*" && span.http.status_code = 200 } # =~ regex match
|
|
116
|
+
{ span.http.method =~ "DELETE|GET" } # Regex match multiple values
|
|
117
|
+
{ trace:rootName !~ ".*perf.*" } # !~ negated regex
|
|
118
|
+
{ resource.cloud.region = "us-east-1" } || { resource.cloud.region = "us-west-1" } # || OR operator
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
{ span.http.status_code >= 400 && span.http.status_code < 500 } # Client errors (4xx)
|
|
123
|
+
{ span.http.url = "/path/of/api" } >> { span.db.name = "db-shard-001" } # >> descendant
|
|
124
|
+
{ span.http.status_code = 200 } | select(resource.service.name) # Select specific attributes
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
**Common Attributes to Query:**
|
|
128
|
+
- `resource.service.name` - Service name
|
|
129
|
+
- `resource.k8s.*` - Kubernetes metadata (pod.name, namespace.name, deployment.name, etc.)
|
|
130
|
+
- `span.http.*` - HTTP attributes (status_code, method, route, url, etc.)
|
|
131
|
+
- `name` - Span name
|
|
132
|
+
- `status` - Span status (error, ok)
|
|
133
|
+
- `duration` - Span duration
|
|
134
|
+
- `kind` - Span kind (server, client, producer, consumer, internal)
|
|
135
|
+
|
|
136
|
+
**Tag-based Search (legacy):**
|
|
137
|
+
Use `tempo_search_traces_by_tags` with logfmt format when you need min/max duration filters:
|
|
138
|
+
- Example: `service.name="api" http.status_code="500"`
|
|
139
|
+
- Supports `min_duration` and `max_duration` parameters
|
|
140
|
+
|
|
141
|
+
### 3. Analyzing Specific Traces
|
|
142
|
+
When you have trace IDs from search results:
|
|
143
|
+
- Use `tempo_query_trace_by_id` to get full trace details
|
|
144
|
+
- Examine spans for errors, slow operations, and bottlenecks
|
|
145
|
+
|
|
146
|
+
### 4. Computing Metrics from Traces
|
|
147
|
+
**TraceQL metrics** compute aggregated metrics from your trace data, helping you answer critical questions like:
|
|
148
|
+
- How many database calls across all systems are downstream of your application?
|
|
149
|
+
- What services beneath a given endpoint are failing?
|
|
150
|
+
- What services beneath an endpoint are slow?
|
|
151
|
+
|
|
152
|
+
TraceQL metrics parse your traces in aggregate to provide RED (Rate, Error, Duration) metrics from trace data.
|
|
153
|
+
|
|
154
|
+
**Supported Functions:**
|
|
155
|
+
- `rate` - Calculate rate of spans/traces
|
|
156
|
+
- `count_over_time` - Count spans/traces over time
|
|
157
|
+
- `sum_over_time` - Sum span attributes
|
|
158
|
+
- `avg_over_time` - Average of span attributes
|
|
159
|
+
- `max_over_time` - Maximum value over time
|
|
160
|
+
- `min_over_time` - Minimum value over time
|
|
161
|
+
- `quantile_over_time` - Calculate quantiles
|
|
162
|
+
- `histogram_over_time` - Generate histogram data
|
|
163
|
+
- `compare` - Compare metrics between time periods
|
|
164
|
+
|
|
165
|
+
**Modifiers:**
|
|
166
|
+
- `topk` - Return top N results
|
|
167
|
+
- `bottomk` - Return bottom N results
|
|
168
|
+
|
|
169
|
+
**TraceQL Metrics Query Examples:**
|
|
170
|
+
|
|
171
|
+
1. **rate** - Calculate the error rate for a single service, grouped by HTTP route:
|
|
172
|
+
```
|
|
173
|
+
{ resource.service.name = "foo" && status = error } | rate() by (span.http.route)
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
2. **count_over_time** - Count spans by HTTP status code:
|
|
177
|
+
```
|
|
178
|
+
{ name = "GET /:endpoint" } | count_over_time() by (span.http.status_code)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
3. **sum_over_time** - Sum HTTP response sizes by service:
|
|
182
|
+
```
|
|
183
|
+
{ name = "GET /:endpoint" } | sum_over_time(span.http.response.size) by (resource.service.name)
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
4. **avg_over_time** - Average duration by HTTP status code:
|
|
187
|
+
```
|
|
188
|
+
{ name = "GET /:endpoint" } | avg_over_time(duration) by (span.http.status_code)
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
5. **max_over_time** - Maximum response size by HTTP target:
|
|
192
|
+
```
|
|
193
|
+
{ name = "GET /:endpoint" } | max_over_time(span.http.response.size) by (span.http.target)
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
6. **min_over_time** - Minimum duration by HTTP target:
|
|
197
|
+
```
|
|
198
|
+
{ name = "GET /:endpoint" } | min_over_time(duration) by (span.http.target)
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
7. **quantile_over_time** - Calculate multiple percentiles (99th, 90th, 50th) with exemplars:
|
|
202
|
+
```
|
|
203
|
+
{ span:name = "GET /:endpoint" } | quantile_over_time(duration, .99, .9, .5) by (span.http.target) with (exemplars=true)
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
8. **histogram_over_time** - Build duration histogram grouped by custom attribute:
|
|
207
|
+
```
|
|
208
|
+
{ name = "GET /:endpoint" } | histogram_over_time(duration) by (span.foo)
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
9. **compare** - Compare error spans against baseline (10 attributes):
|
|
212
|
+
```
|
|
213
|
+
{ resource.service.name="a" && span.http.path="/myapi" } | compare({status=error}, 10)
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
10. **Using topk modifier** - Find top 10 endpoints by request rate:
|
|
217
|
+
```
|
|
218
|
+
{ resource.service.name = "foo" } | rate() by (span.http.url) | topk(10)
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
**Choosing Between Instant and Range Queries:**
|
|
222
|
+
|
|
223
|
+
**Instant Metrics** (`tempo_query_metrics_instant`) - Returns a single aggregated value for the entire time range. Use this when:
|
|
224
|
+
- You need a total count or sum across the whole period
|
|
225
|
+
- You want a single metric value (e.g., total error count, average latency)
|
|
226
|
+
- You don't need to see how the metric changes over time
|
|
227
|
+
- You're computing a KPI or summary statistic
|
|
228
|
+
|
|
229
|
+
**Time Series Metrics** (`tempo_query_metrics_range`) - Returns values at regular intervals controlled by the 'step' parameter. Use this when:
|
|
230
|
+
- You need to graph metrics over time or analyze trends
|
|
231
|
+
- You want to see patterns, spikes, or changes in metrics
|
|
232
|
+
- You're troubleshooting time-based issues
|
|
233
|
+
- You need to correlate metrics with specific time periods
|
|
234
|
+
|
|
235
|
+
## Special workflow for performance issues
|
|
236
|
+
When investigating performance issues in Kubernetes via traces, call `tempo_fetch_traces_comparative_sample`. This tool provides comprehensive analysis for identifying patterns.
|
|
237
|
+
|
|
238
|
+
## Important Notes
|
|
239
|
+
- TraceQL is the modern query language - prefer it over tag-based search
|
|
240
|
+
- TraceQL metrics are computed from trace data, not traditional Prometheus metrics
|
|
241
|
+
- TraceQL metrics is an experimental feature that computes RED (Rate, Error, Duration) metrics from trace data
|
|
242
|
+
- Common attributes to use in queries: resource.service.name, span.http.route, span.http.status_code, span.http.target, status, name, duration
|
|
243
|
+
- All timestamps can be given either as Unix epoch seconds or in RFC3339 format
|
|
244
|
+
- Use time filters (start/end) to improve query performance
|
|
245
|
+
- To get information about Kubernetes resources try these first: resource.service.name, resource.k8s.pod.name, resource.k8s.namespace.name, resource.k8s.deployment.name, resource.k8s.node.name, resource.k8s.container.name
|
|
246
|
+
- The TraceQL and TraceQL metrics languages are complex. If you get empty data, try to simplify your query and try again!
|
|
247
|
+
- IMPORTANT: TraceQL is not the same as 'TraceQL metrics' - Make sure you use the correct syntax and functions
|