holmesgpt 0.14.0a0__py3-none-any.whl → 0.14.1a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of holmesgpt might be problematic.
- holmes/__init__.py +1 -1
- holmes/clients/robusta_client.py +10 -2
- holmes/common/env_vars.py +8 -1
- holmes/config.py +66 -139
- holmes/core/investigation.py +1 -2
- holmes/core/llm.py +256 -51
- holmes/core/models.py +2 -0
- holmes/core/safeguards.py +4 -4
- holmes/core/supabase_dal.py +14 -8
- holmes/core/tool_calling_llm.py +101 -101
- holmes/core/tools.py +260 -25
- holmes/core/tools_utils/data_types.py +81 -0
- holmes/core/tools_utils/tool_context_window_limiter.py +33 -0
- holmes/core/tools_utils/tool_executor.py +2 -2
- holmes/core/toolset_manager.py +150 -3
- holmes/core/transformers/__init__.py +23 -0
- holmes/core/transformers/base.py +62 -0
- holmes/core/transformers/llm_summarize.py +174 -0
- holmes/core/transformers/registry.py +122 -0
- holmes/core/transformers/transformer.py +31 -0
- holmes/main.py +5 -0
- holmes/plugins/toolsets/aks-node-health.yaml +46 -0
- holmes/plugins/toolsets/aks.yaml +64 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +17 -15
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +8 -4
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -3
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -3
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -3
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +4 -4
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +7 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +7 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +7 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +7 -3
- holmes/plugins/toolsets/bash/bash_toolset.py +6 -6
- holmes/plugins/toolsets/bash/common/bash.py +7 -7
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +15 -15
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +8 -8
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +20 -20
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +8 -8
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +17 -17
- holmes/plugins/toolsets/git.py +21 -21
- holmes/plugins/toolsets/grafana/common.py +2 -2
- holmes/plugins/toolsets/grafana/toolset_grafana.py +4 -4
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +3 -3
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +123 -23
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +165 -307
- holmes/plugins/toolsets/internet/internet.py +3 -3
- holmes/plugins/toolsets/internet/notion.py +3 -3
- holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
- holmes/plugins/toolsets/kafka.py +18 -18
- holmes/plugins/toolsets/kubernetes.yaml +58 -0
- holmes/plugins/toolsets/kubernetes_logs.py +6 -6
- holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
- holmes/plugins/toolsets/mcp/toolset_mcp.py +4 -4
- holmes/plugins/toolsets/newrelic.py +5 -5
- holmes/plugins/toolsets/opensearch/opensearch.py +5 -5
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +10 -10
- holmes/plugins/toolsets/prometheus/prometheus.py +172 -39
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +25 -0
- holmes/plugins/toolsets/prometheus/utils.py +28 -0
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +6 -4
- holmes/plugins/toolsets/robusta/robusta.py +10 -10
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +4 -4
- holmes/plugins/toolsets/servicenow/servicenow.py +6 -6
- holmes/plugins/toolsets/utils.py +88 -0
- holmes/utils/config_utils.py +91 -0
- holmes/utils/env.py +7 -0
- holmes/utils/holmes_status.py +2 -1
- holmes/utils/sentry_helper.py +41 -0
- holmes/utils/stream.py +9 -0
- {holmesgpt-0.14.0a0.dist-info → holmesgpt-0.14.1a0.dist-info}/METADATA +9 -13
- {holmesgpt-0.14.0a0.dist-info → holmesgpt-0.14.1a0.dist-info}/RECORD +78 -68
- {holmesgpt-0.14.0a0.dist-info → holmesgpt-0.14.1a0.dist-info}/LICENSE.txt +0 -0
- {holmesgpt-0.14.0a0.dist-info → holmesgpt-0.14.1a0.dist-info}/WHEEL +0 -0
- {holmesgpt-0.14.0a0.dist-info → holmesgpt-0.14.1a0.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/git.py
CHANGED
@@ -4,7 +4,7 @@ import requests # type: ignore
 import os
 from typing import Any, Optional, Dict, List, Tuple
 from pydantic import BaseModel
-from holmes.core.tools import StructuredToolResult,
+from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus

 from holmes.core.tools import (
     Toolset,

@@ -259,7 +259,7 @@ class GitReadFileWithLineNumbers(Tool):
             resp = requests.get(url, headers=headers)
             if resp.status_code != 200:
                 return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.ERROR,
                     data=self.toolset._sanitize_error(
                         f"Error fetching file: {resp.text}"
                     ),

@@ -268,13 +268,13 @@ class GitReadFileWithLineNumbers(Tool):
             content = base64.b64decode(resp.json()["content"]).decode().splitlines()
             numbered = "\n".join(f"{i+1}: {line}" for i, line in enumerate(content))
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.SUCCESS,
                 data=numbered,
                 params=params,
             )
         except Exception as e:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 data=self.toolset._sanitize_error(str(e)),
                 params=params,
             )

@@ -304,7 +304,7 @@ class GitListFiles(Tool):
             resp = requests.get(url, headers=headers)
             if resp.status_code != 200:
                 return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.ERROR,
                     data=self.toolset._sanitize_error(
                         f"Error listing files: {resp.text}"
                     ),

@@ -312,13 +312,13 @@ class GitListFiles(Tool):
                 )
             paths = [entry["path"] for entry in resp.json()["tree"]]
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.SUCCESS,
                 data=paths,
                 params=params,
             )
         except Exception as e:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 data=self.toolset._sanitize_error(str(e)),
                 params=params,
             )

@@ -353,13 +353,13 @@ class GitListOpenPRs(Tool):
                 for pr in prs
             ]
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.SUCCESS,
                 data=formatted,
                 params=params,
             )
         except Exception as e:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 data=self.toolset._sanitize_error(str(e)),
                 params=params,
             )

@@ -413,14 +413,14 @@ class GitExecuteChanges(Tool):
     ) -> StructuredToolResult:
         def error(msg: str) -> StructuredToolResult:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 data=self.toolset._sanitize_error(msg),
                 params=params,
             )

         def success(msg: Any) -> StructuredToolResult:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.SUCCESS, data=msg, params=params
             )

         def modify_lines(lines: List[str]) -> List[str]:

@@ -643,24 +643,24 @@ class GitUpdatePR(Tool):
         # Validate inputs
         if not commit_message.strip():
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Tool call failed to run: Commit message cannot be empty",
             )
         if not filename.strip():
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Tool call failed to run: Filename cannot be empty",
             )
         if line < 1:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Tool call failed to run: Line number must be positive",
             )

         # Verify this is a PR created by our tool
         if not self.toolset.is_created_pr(pr_number):
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Tool call failed to run: PR #{pr_number} was not created by this tool. Only PRs created using git_execute_changes can be updated.",
             )

@@ -714,7 +714,7 @@ class GitUpdatePR(Tool):
                 del content_lines[line - 1]
             else:
                 return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.ERROR,
                     error=f"Tool call failed to run: Invalid command: {command}",
                 )

@@ -722,7 +722,7 @@ class GitUpdatePR(Tool):

             if dry_run:
                 return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=f"DRY RUN: Updated content for PR #{pr_number}:\n\n{updated_content}",
                 )

@@ -731,13 +731,13 @@ class GitUpdatePR(Tool):
                 pr_number, filename, updated_content, commit_message
             )
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.SUCCESS,
                 data=f"Added commit to PR #{pr_number} successfully",
             )

         except Exception as e:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=self.toolset._sanitize_error(
                     f"Tool call failed to run: Error updating PR: {str(e)}"
                 ),

@@ -745,14 +745,14 @@ class GitUpdatePR(Tool):

         except requests.exceptions.RequestException as e:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=self.toolset._sanitize_error(
                     f"Tool call failed to run: Network error: {str(e)}"
                 ),
             )
         except Exception as e:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=self.toolset._sanitize_error(
                     f"Tool call failed to run: Unexpected error: {str(e)}"
                 ),
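The change across these hunks is uniform: every tool result now sets its status through the `StructuredToolResultStatus` enum imported from `holmes.core.tools` (the diff viewer elides the old value on the removed lines). A minimal sketch of the resulting pattern, assuming only the enum members and keyword arguments visible in the diff (`SUCCESS`, `ERROR`, `status`, `data`, `params`); `fetch_file` and `sanitize_error` are hypothetical stand-ins for the toolset's own helpers:

```python
# Sketch only: illustrates the status pattern this diff migrates to.
# StructuredToolResult / StructuredToolResultStatus come from holmes.core.tools;
# fetch_file and sanitize_error are hypothetical stand-ins, not part of the package.
from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus


def read_file_result(fetch_file, sanitize_error, params: dict) -> StructuredToolResult:
    try:
        content = fetch_file(params["path"])  # e.g. an HTTP call to the Git provider
        return StructuredToolResult(
            status=StructuredToolResultStatus.SUCCESS,
            data=content,
            params=params,
        )
    except Exception as e:
        # Failures carry the enum's ERROR member plus a sanitized message.
        return StructuredToolResult(
            status=StructuredToolResultStatus.ERROR,
            data=sanitize_error(str(e)),
            params=params,
        )
```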
holmes/plugins/toolsets/grafana/common.py
CHANGED

@@ -3,7 +3,7 @@ from typing import Dict, Optional
 from pydantic import BaseModel
 import datetime

-from holmes.core.tools import StructuredToolResult,
+from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus


 class GrafanaConfig(BaseModel):

@@ -61,7 +61,7 @@ def ensure_grafana_uid_or_return_error_result(
 ) -> Optional[StructuredToolResult]:
     if not config.grafana_datasource_uid:
         return StructuredToolResult(
-            status=
+            status=StructuredToolResultStatus.ERROR,
             error="This tool only works when the toolset is configued ",
         )
     else:

holmes/plugins/toolsets/grafana/toolset_grafana.py
CHANGED

@@ -4,7 +4,7 @@ from holmes.core.tools import (
     StructuredToolResult,
     Tool,
     ToolParameter,
-
+    StructuredToolResultStatus,
 )
 from holmes.plugins.toolsets.grafana.base_grafana_toolset import BaseGrafanaToolset
 import requests # type: ignore

@@ -90,9 +90,9 @@ class ListAndBuildGrafanaDashboardURLs(Tool):
             )

             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.SUCCESS
                 if formatted_dashboards
-                else
+                else StructuredToolResultStatus.NO_DATA,
                 data="\n".join(formatted_dashboards)
                 if formatted_dashboards
                 else "No dashboards found.",

@@ -102,7 +102,7 @@ class ListAndBuildGrafanaDashboardURLs(Tool):
         except requests.RequestException as e:
             logging.error(f"Error fetching dashboards: {str(e)}")
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Error fetching dashboards: {str(e)}",
                 url=url,
                 params=params,

holmes/plugins/toolsets/grafana/toolset_grafana_loki.py
CHANGED

@@ -22,7 +22,7 @@ from holmes.plugins.toolsets.utils import (
 from holmes.plugins.toolsets.grafana.loki_api import (
     query_loki_logs_by_label,
 )
-from holmes.core.tools import StructuredToolResult,
+from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus


 class GrafanaLokiLabelsConfig(BaseModel):

@@ -99,12 +99,12 @@ class GrafanaLokiToolset(BasePodLoggingToolset):
         if logs:
             logs.sort(key=lambda x: x["timestamp"])
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.SUCCESS,
                 data="\n".join([format_log(log) for log in logs]),
                 params=params.model_dump(),
             )
         else:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.NO_DATA,
                 params=params.model_dump(),
            )
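The Grafana and Loki hunks above add a third outcome: `SUCCESS` when a query returns data, `NO_DATA` when it returns nothing, `ERROR` only for failures. A minimal sketch of that split, assuming the same enum and treating the log formatting as a placeholder for the toolset's own `format_log` helper:

```python
# Sketch of the SUCCESS / NO_DATA split used by the Grafana and Loki hunks above.
# The enum comes from holmes.core.tools; str(entry) stands in for the real format_log.
from typing import Any, Dict, List

from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus


def logs_to_result(logs: List[Dict[str, Any]], params: Dict[str, Any]) -> StructuredToolResult:
    if logs:
        logs.sort(key=lambda entry: entry["timestamp"])
        return StructuredToolResult(
            status=StructuredToolResultStatus.SUCCESS,
            data="\n".join(str(entry) for entry in logs),
            params=params,
        )
    # An empty result set is reported as NO_DATA rather than an error.
    return StructuredToolResult(
        status=StructuredToolResultStatus.NO_DATA,
        params=params,
    )
```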
holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2
CHANGED

@@ -5,43 +5,142 @@ Assume every application provides tempo traces.
 ## API Endpoints and Tool Mapping

 1. **Trace Search** (GET /api/search)
-   - `
-   - `
+   - `tempo_search_traces_by_query`: Use with 'q' parameter for TraceQL queries
+   - `tempo_search_traces_by_tags`: Use with 'tags' parameter for logfmt queries

 2. **Trace Details** (GET /api/v2/traces/{trace_id})
-   - `
+   - `tempo_query_trace_by_id`: Retrieve full trace data

 3. **Tag Discovery**
-   - `
-   - `
+   - `tempo_search_tag_names` (GET /api/v2/search/tags): List available tags
+   - `tempo_search_tag_values` (GET /api/v2/search/tag/{tag}/values): Get values for a tag

 4. **TraceQL Metrics**
-   - `
-   - `
+   - `tempo_query_metrics_instant` (GET /api/metrics/query): Single value computation
+   - `tempo_query_metrics_range` (GET /api/metrics/query_range): Time series data

 ## Usage Workflow

 ### 1. Discovering Available Data
 Start by understanding what tags and values exist:
-- Use `
-- Use `
+- Use `tempo_search_tag_names` to discover available tags
+- Use `tempo_search_tag_values` to see all values for a specific tag (e.g., service names)

 ### 2. Searching for Traces
+
 **TraceQL Search (recommended):**
-Use `
-
-
-
--
+Use `tempo_search_traces_by_query` with TraceQL syntax for powerful filtering.
+
+**TraceQL Capabilities:**
+TraceQL can select traces based on the following:
+- **Span and resource attributes** - Filter by any attribute on spans or resources
+- **Timing and duration** - Filter by trace/span duration
+- **Basic aggregates** - Use aggregate functions to compute values across spans
+
+**Supported Aggregate Functions:**
+- `count()` - Count the number of spans matching the criteria
+- `avg(attribute)` - Calculate average of a numeric attribute across spans
+- `min(attribute)` - Find minimum value of a numeric attribute
+- `max(attribute)` - Find maximum value of a numeric attribute
+- `sum(attribute)` - Sum values of a numeric attribute across spans
+
+**Aggregate Function Usage:**
+Aggregates are used with the pipe operator `|` to filter traces based on computed values across their spans.
+
+**Aggregate Examples:**
+- `{ span.http.status_code = 200 } | count() > 3` - Find traces with more than 3 spans having HTTP 200 status
+- `{ } | sum(span.bytesProcessed) > 1000000000` - Find traces where total processed bytes exceed 1 GB
+- `{ status = error } | by(resource.service.name) | count() > 1` - Find services with more than 1 error
+
+**Select Function:**
+- `{ status = error } | select(span.http.status_code, span.http.url)` - Select specific attributes from error spans
+
+**TraceQL Query Structure:**
+TraceQL queries follow the pattern: `{span-selectors} | aggregate`
+
+**TraceQL Query Examples (from official docs):**
+
+1. **Find traces of a specific operation:**
+   ```
+   {resource.service.name = "frontend" && name = "POST /api/orders"}
+   ```
+   ```
+   {
+     resource.service.namespace = "ecommerce" &&
+     resource.service.name = "frontend" &&
+     resource.deployment.environment = "production" &&
+     name = "POST /api/orders"
+   }
+   ```
+
+2. **Find traces with a particular outcome:**
+   ```
+   {
+     resource.service.name="frontend" &&
+     name = "POST /api/orders" &&
+     status = error
+   }
+   ```
+   ```
+   {
+     resource.service.name="frontend" &&
+     name = "POST /api/orders" &&
+     span.http.status_code >= 500
+   }
+   ```
+
+3. **Find traces with a particular behavior:**
+   ```
+   {span.service.name="frontend" && name = "GET /api/products/{id}"} && {span.db.system="postgresql"}
+   ```
+
+4. **Find traces across environments:**
+   ```
+   { resource.deployment.environment = "production" } && { resource.deployment.environment = "staging" }
+   ```
+
+5. **Structural operators (advanced):**
+   ```
+   { resource.service.name="frontend" } >> { status = error }  # Frontend spans followed by errors
+   { } !< { resource.service.name = "productcatalogservice" }  # Traces without productcatalog as child
+   { resource.service.name = "productcatalogservice" } ~ { resource.service.name="frontend" }  # Sibling spans
+   ```
+
+6. **Additional operator examples:**
+   ```
+   { span.http.method = "GET" && status = ok } && { span.http.method = "DELETE" && status != ok }  # && for multiple conditions
+   ```
+
+   ```
+   { resource.deployment.environment =~ "prod-.*" && span.http.status_code = 200 }  # =~ regex match
+   { span.http.method =~ "DELETE|GET" }  # Regex match multiple values
+   { trace:rootName !~ ".*perf.*" }  # !~ negated regex
+   { resource.cloud.region = "us-east-1" } || { resource.cloud.region = "us-west-1" }  # || OR operator
+   ```
+
+   ```
+   { span.http.status_code >= 400 && span.http.status_code < 500 }  # Client errors (4xx)
+   { span.http.url = "/path/of/api" } >> { span.db.name = "db-shard-001" }  # >> descendant
+   { span.http.status_code = 200 } | select(resource.service.name)  # Select specific attributes
+   ```
+
+**Common Attributes to Query:**
+- `resource.service.name` - Service name
+- `resource.k8s.*` - Kubernetes metadata (pod.name, namespace.name, deployment.name, etc.)
+- `span.http.*` - HTTP attributes (status_code, method, route, url, etc.)
+- `name` - Span name
+- `status` - Span status (error, ok)
+- `duration` - Span duration
+- `kind` - Span kind (server, client, producer, consumer, internal)

 **Tag-based Search (legacy):**
-Use `
-- Example: `
+Use `tempo_search_traces_by_tags` with logfmt format when you need min/max duration filters:
+- Example: `service.name="api" http.status_code="500"`
 - Supports `min_duration` and `max_duration` parameters

 ### 3. Analyzing Specific Traces
 When you have trace IDs from search results:
-- Use `
+- Use `tempo_query_trace_by_id` to get full trace details
 - Examine spans for errors, slow operations, and bottlenecks

 ### 4. Computing Metrics from Traces

@@ -115,26 +214,26 @@ TraceQL metrics parse your traces in aggregate to provide RED (Rate, Error, Duration)
 ```

 10. **Using topk modifier** - Find top 10 endpoints by request rate:
-
-
-
+    ```
+    { resource.service.name = "foo" } | rate() by (span.http.url) | topk(10)
+    ```

 **Choosing Between Instant and Range Queries:**

-**Instant Metrics** (`
+**Instant Metrics** (`tempo_query_metrics_instant`) - Returns a single aggregated value for the entire time range. Use this when:
 - You need a total count or sum across the whole period
 - You want a single metric value (e.g., total error count, average latency)
 - You don't need to see how the metric changes over time
 - You're computing a KPI or summary statistic

-**Time Series Metrics** (`
+**Time Series Metrics** (`tempo_query_metrics_range`) - Returns values at regular intervals controlled by the 'step' parameter. Use this when:
 - You need to graph metrics over time or analyze trends
 - You want to see patterns, spikes, or changes in metrics
 - You're troubleshooting time-based issues
 - You need to correlate metrics with specific time periods

 ## Special workflow for performance issues
-When investigating performance issues in kubernetes via traces, call
+When investigating performance issues in kubernetes via traces, call tempo_fetch_traces_comparative_sample. This tool provides comprehensive analysis for identifying patterns.

 ## Important Notes
 - TraceQL is the modern query language - prefer it over tag-based search

@@ -145,3 +244,4 @@ When investigating performance issues in kubernetes via traces, call fetch_tempo
 - Use time filters (start/end) to improve query performance
 - To get information about Kubernetes resources try these first: resource.service.name, resource.k8s.pod.name, resource.k8s.namespace.name, resource.k8s.deployment.name, resource.k8s.node.name, resource.k8s.container.name
 - TraceQL and TraceQL metrics language are complex. If you get empty data, try to simplify your query and try again!
+- IMPORTANT: TraceQL is not the same as 'TraceQL metrics' - Make sure you use the correct syntax and functions