PyPI - holmesgpt - Versions diffs - 0.13.3a0__py3-none-any.whl → 0.14.0a0__py3-none-any.whl - Mend - Supply Chain Defender

holmesgpt 0.13.3a0py3-none-any.whl → 0.14.0a0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of holmesgpt might be problematic. Click here for more details.

Files changed (20) hide show

holmes/plugins/toolsets/grafana/grafana_tempo_api.py ADDED Viewed

@@ -0,0 +1,473 @@
+"""Grafana Tempo API wrapper for querying traces and metrics."""
+import logging
+from typing import Any, Dict, Optional, Union
+from urllib.parse import quote
+import backoff
+import requests  # type: ignore
+from holmes.plugins.toolsets.grafana.common import (
+    GrafanaTempoConfig,
+    build_headers,
+    get_base_url,
+)
+logger = logging.getLogger(__name__)
+class TempoAPIError(Exception):
+    """Custom exception for Tempo API errors with detailed response information."""
+    def __init__(self, status_code: int, response_text: str, url: str):
+        self.status_code = status_code
+        self.response_text = response_text
+        self.url = url
+        # Try to extract error message from JSON response
+        try:
+            import json
+            error_data = json.loads(response_text)
+            # Tempo may return errors in different formats
+            error_message = (
+                error_data.get("error")
+                or error_data.get("message")
+                or error_data.get("errorType")
+                or response_text
+            )
+        except (json.JSONDecodeError, TypeError):
+            error_message = response_text
+        super().__init__(f"Tempo API error {status_code}: {error_message}")
+class GrafanaTempoAPI:
+    """Python wrapper for Grafana Tempo REST API.
+    This class provides a clean interface to all Tempo API endpoints,
+    supporting both GET and POST methods based on configuration.
+    """
+    def __init__(self, config: GrafanaTempoConfig, use_post: bool = False):
+        """Initialize the Tempo API wrapper.
+        Args:
+            config: GrafanaTempoConfig instance with connection details
+            use_post: If True, use POST method for API calls. Defaults to False (GET).
+        """
+        self.config = config
+        self.base_url = get_base_url(config)
+        self.headers = build_headers(config.api_key, config.headers)
+        self.use_post = use_post
+    def _make_request(
+        self,
+        endpoint: str,
+        params: Optional[Dict[str, Any]] = None,
+        path_params: Optional[Dict[str, str]] = None,
+        timeout: int = 30,
+        retries: int = 3,
+    ) -> Dict[str, Any]:
+        """Make HTTP request to Tempo API with retry logic.
+        Args:
+            endpoint: API endpoint path (e.g., "/api/echo")
+            params: Query parameters (GET) or body parameters (POST)
+            path_params: Parameters to substitute in the endpoint path
+            timeout: Request timeout in seconds
+            retries: Number of retry attempts
+        Returns:
+            JSON response from the API
+        Raises:
+            Exception: If the request fails after all retries
+        """
+        # Format endpoint with path parameters
+        if path_params:
+            for key, value in path_params.items():
+                endpoint = endpoint.replace(f"{{{key}}}", quote(str(value), safe=""))
+        url = f"{self.base_url}{endpoint}"
+        @backoff.on_exception(
+            backoff.expo,
+            requests.exceptions.RequestException,
+            max_tries=retries,
+            giveup=lambda e: isinstance(e, requests.exceptions.HTTPError)
+            and getattr(e, "response", None) is not None
+            and e.response.status_code < 500,
+        )
+        def make_request():
+            if self.use_post:
+                # POST request with JSON body
+                response = requests.post(
+                    url,
+                    headers=self.headers,
+                    json=params or {},
+                    timeout=timeout,
+                )
+            else:
+                # GET request with query parameters
+                response = requests.get(
+                    url,
+                    headers=self.headers,
+                    params=params,
+                    timeout=timeout,
+                )
+            response.raise_for_status()
+            return response.json()
+        try:
+            return make_request()
+        except requests.exceptions.HTTPError as e:
+            # Extract detailed error message from response
+            response = e.response
+            if response is not None:
+                logger.error(
+                    f"HTTP error {response.status_code} for {url}: {response.text}"
+                )
+                raise TempoAPIError(
+                    status_code=response.status_code,
+                    response_text=response.text,
+                    url=url,
+                )
+            else:
+                logger.error(f"Request failed for {url}: {e}")
+                raise
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Request failed for {url}: {e}")
+            raise
+    def query_echo_endpoint(self) -> bool:
+        """Query the echo endpoint to check Tempo status.
+        API Endpoint: GET /api/echo
+        HTTP Method: GET (or POST if use_post=True)
+        Returns:
+            bool: True if endpoint returns 200 status code, False otherwise
+        """
+        url = f"{self.base_url}/api/echo"
+        try:
+            if self.use_post:
+                response = requests.post(
+                    url,
+                    headers=self.headers,
+                    timeout=30,
+                )
+            else:
+                response = requests.get(
+                    url,
+                    headers=self.headers,
+                    timeout=30,
+                )
+            # Just check status code, don't try to parse JSON
+            return response.status_code == 200
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Request failed for {url}: {e}")
+            return False
+    def query_trace_by_id_v2(
+        self,
+        trace_id: str,
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """Query a trace by its ID.
+        API Endpoint: GET /api/v2/traces/{trace_id}
+        HTTP Method: GET (or POST if use_post=True)
+        Args:
+            trace_id: The trace ID to retrieve
+            start: Optional start time in Unix epoch seconds
+            end: Optional end time in Unix epoch seconds
+        Returns:
+            dict: OpenTelemetry format trace data
+        """
+        params = {}
+        if start is not None:
+            params["start"] = str(start)
+        if end is not None:
+            params["end"] = str(end)
+        return self._make_request(
+            "/api/v2/traces/{trace_id}",
+            params=params,
+            path_params={"trace_id": trace_id},
+        )
+    def _search_traces_common(
+        self,
+        search_params: Dict[str, Any],
+        limit: Optional[int] = None,
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+        spss: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """Common search implementation for both tag and TraceQL searches.
+        Args:
+            search_params: The search-specific parameters (tags or q)
+            limit: Optional max number of traces to return
+            start: Optional start time in Unix epoch seconds
+            end: Optional end time in Unix epoch seconds
+            spss: Optional spans per span set
+        Returns:
+            dict: Search results with trace metadata
+        """
+        params = search_params.copy()
+        if limit is not None:
+            params["limit"] = str(limit)
+        if start is not None:
+            params["start"] = str(start)
+        if end is not None:
+            params["end"] = str(end)
+        if spss is not None:
+            params["spss"] = str(spss)
+        return self._make_request("/api/search", params=params)
+    def search_traces_by_tags(
+        self,
+        tags: str,
+        min_duration: Optional[str] = None,
+        max_duration: Optional[str] = None,
+        limit: Optional[int] = None,
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+        spss: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """Search for traces using tag-based search.
+        API Endpoint: GET /api/search
+        HTTP Method: GET (or POST if use_post=True)
+        Args:
+            tags: logfmt-encoded span/process attributes (required)
+            min_duration: Optional minimum trace duration (e.g., "5s")
+            max_duration: Optional maximum trace duration
+            limit: Optional max number of traces to return
+            start: Optional start time in Unix epoch seconds
+            end: Optional end time in Unix epoch seconds
+            spss: Optional spans per span set
+        Returns:
+            dict: Search results with trace metadata
+        """
+        search_params = {"tags": tags}
+        # minDuration and maxDuration are only supported with tag-based search
+        if min_duration is not None:
+            search_params["minDuration"] = min_duration
+        if max_duration is not None:
+            search_params["maxDuration"] = max_duration
+        return self._search_traces_common(
+            search_params=search_params,
+            limit=limit,
+            start=start,
+            end=end,
+            spss=spss,
+        )
+    def search_traces_by_query(
+        self,
+        q: str,
+        limit: Optional[int] = None,
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+        spss: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """Search for traces using TraceQL query.
+        API Endpoint: GET /api/search
+        HTTP Method: GET (or POST if use_post=True)
+        Note: minDuration and maxDuration are not supported with TraceQL queries.
+        Use the TraceQL query syntax to filter by duration instead.
+        Args:
+            q: TraceQL query (required)
+            limit: Optional max number of traces to return
+            start: Optional start time in Unix epoch seconds
+            end: Optional end time in Unix epoch seconds
+            spss: Optional spans per span set
+        Returns:
+            dict: Search results with trace metadata
+        """
+        return self._search_traces_common(
+            search_params={"q": q},
+            limit=limit,
+            start=start,
+            end=end,
+            spss=spss,
+        )
+    def search_tag_names_v2(
+        self,
+        scope: Optional[str] = None,
+        q: Optional[str] = None,
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+        limit: Optional[int] = None,
+        max_stale_values: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """Search for available tag names.
+        API Endpoint: GET /api/v2/search/tags
+        HTTP Method: GET (or POST if use_post=True)
+        Args:
+            scope: Optional scope filter ("resource", "span", or "intrinsic")
+            q: Optional TraceQL query to filter tags
+            start: Optional start time in Unix epoch seconds
+            end: Optional end time in Unix epoch seconds
+            limit: Optional max number of tag names
+            max_stale_values: Optional max stale values parameter
+        Returns:
+            dict: Available tag names organized by scope
+        """
+        params = {}
+        if scope is not None:
+            params["scope"] = scope
+        if q is not None:
+            params["q"] = q
+        if start is not None:
+            params["start"] = str(start)
+        if end is not None:
+            params["end"] = str(end)
+        if limit is not None:
+            params["limit"] = str(limit)
+        if max_stale_values is not None:
+            params["maxStaleValues"] = str(max_stale_values)
+        return self._make_request("/api/v2/search/tags", params=params)
+    def search_tag_values_v2(
+        self,
+        tag: str,
+        q: Optional[str] = None,
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+        limit: Optional[int] = None,
+        max_stale_values: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """Search for values of a specific tag with optional TraceQL filtering.
+        API Endpoint: GET /api/v2/search/tag/{tag}/values
+        HTTP Method: GET (or POST if use_post=True)
+        Args:
+            tag: The tag name to get values for (required)
+            q: Optional TraceQL query to filter tag values (e.g., '{resource.cluster="us-east-1"}')
+            start: Optional start time in Unix epoch seconds
+            end: Optional end time in Unix epoch seconds
+            limit: Optional max number of values
+            max_stale_values: Optional max stale values parameter
+        Returns:
+            dict: List of discovered values for the tag
+        """
+        params = {}
+        if q is not None:
+            params["q"] = q
+        if start is not None:
+            params["start"] = str(start)
+        if end is not None:
+            params["end"] = str(end)
+        if limit is not None:
+            params["limit"] = str(limit)
+        if max_stale_values is not None:
+            params["maxStaleValues"] = str(max_stale_values)
+        return self._make_request(
+            "/api/v2/search/tag/{tag}/values",
+            params=params,
+            path_params={"tag": tag},
+        )
+    def query_metrics_instant(
+        self,
+        q: str,
+        start: Optional[Union[int, str]] = None,
+        end: Optional[Union[int, str]] = None,
+        since: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """Query TraceQL metrics for an instant value.
+        Computes a single value across the entire time range.
+        API Endpoint: GET /api/metrics/query
+        HTTP Method: GET (or POST if use_post=True)
+        Args:
+            q: TraceQL metrics query (required)
+            start: Optional start time (Unix seconds/nanoseconds/RFC3339)
+            end: Optional end time (Unix seconds/nanoseconds/RFC3339)
+            since: Optional duration string (e.g., "1h")
+        Returns:
+            dict: Single computed metric value
+        """
+        params = {"q": q}
+        if start is not None:
+            params["start"] = str(start)
+        if end is not None:
+            params["end"] = str(end)
+        if since is not None:
+            params["since"] = since
+        return self._make_request("/api/metrics/query", params=params)
+    def query_metrics_range(
+        self,
+        q: str,
+        step: Optional[str] = None,
+        start: Optional[Union[int, str]] = None,
+        end: Optional[Union[int, str]] = None,
+        since: Optional[str] = None,
+        exemplars: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """Query TraceQL metrics for a time series range.
+        Returns metrics computed at regular intervals over the time range.
+        API Endpoint: GET /api/metrics/query_range
+        HTTP Method: GET (or POST if use_post=True)
+        Args:
+            q: TraceQL metrics query (required)
+            step: Optional time series granularity (e.g., "1m", "5m")
+            start: Optional start time (Unix seconds/nanoseconds/RFC3339)
+            end: Optional end time (Unix seconds/nanoseconds/RFC3339)
+            since: Optional duration string (e.g., "3h")
+            exemplars: Optional maximum number of exemplars to return
+        Returns:
+            dict: Time series of metric values
+        """
+        params = {"q": q}
+        if step is not None:
+            params["step"] = step
+        if start is not None:
+            params["start"] = str(start)
+        if end is not None:
+            params["end"] = str(end)
+        if since is not None:
+            params["since"] = since
+        if exemplars is not None:
+            params["exemplars"] = str(exemplars)
+        return self._make_request("/api/metrics/query_range", params=params)

holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 CHANGED Viewed

@@ -1,12 +1,147 @@
-Use Tempo when investigating latency or performance issues. Tempo provides traces information for application running on the cluster.
+Grafana Tempo provides distributed tracing data through its REST API. Each tool maps directly to a specific Tempo API endpoint.
 Assume every application provides tempo traces.
-1. Start by identifying an initial filter to use. This can be a pod name, a deployment name or a service name
-2. Call fetch_tempo_traces_comparative_sample first when investigating performance issues via traces. This tool provides comprehensive analysis for identifying patterns. For other issues not related to performance, you can start with fetch_tempo_traces.
-3. Use `fetch_tempo_traces` setting the appropriate query params
-    - Use the min_duration filter to ensure you get traces that trigger the alert when you are investigating a performance issue
-    - If possible, use start and end date to narrow down your search.
-        - Use fetch_finding_by_id if you are provided with a finding/alert id. It will contain details about when the alert was triggered
-    - Use at least one of the following argument to ensure you get relevant traces: `service_name`, `pod_name` or `deployment_name`.
-4. When you have a specific trace ID to investigate, use `fetch_tempo_trace_by_id` to get detailed information about that trace.
-5. Look at the duration of each span in any single trace and deduce any issues.
-6. ALWAYS fetch the logs for a pod once you identify a span that is taking a long time. There may be an explanation for the slowness in the logs.
+## API Endpoints and Tool Mapping
+1. **Trace Search** (GET /api/search)
+   - `search_traces_by_query`: Use with 'q' parameter for TraceQL queries
+   - `search_traces_by_tags`: Use with 'tags' parameter for logfmt queries
+2. **Trace Details** (GET /api/v2/traces/{trace_id})
+   - `query_trace_by_id`: Retrieve full trace data
+3. **Tag Discovery**
+   - `search_tag_names` (GET /api/v2/search/tags): List available tags
+   - `search_tag_values` (GET /api/v2/search/tag/{tag}/values): Get values for a tag
+4. **TraceQL Metrics**
+   - `query_metrics_instant` (GET /api/metrics/query): Single value computation
+   - `query_metrics_range` (GET /api/metrics/query_range): Time series data
+## Usage Workflow
+### 1. Discovering Available Data
+Start by understanding what tags and values exist:
+- Use `search_tag_names` to discover available tags
+- Use `search_tag_values` to see all values for a specific tag (e.g., service names)
+### 2. Searching for Traces
+**TraceQL Search (recommended):**
+Use `search_traces_by_query` with TraceQL syntax for powerful filtering:
+- Find errors: `{span.http.status_code>=400}`
+- Service traces: `{resource.service.name="api"}`
+- Slow traces: `{duration>100ms}`
+- Complex queries: `{resource.service.name="api" && span.http.status_code=500 && duration>1s}`
+**Tag-based Search (legacy):**
+Use `search_traces_by_tags` with logfmt format when you need min/max duration filters:
+- Example: `resource.service.name="api" http.status_code="500"`
+- Supports `min_duration` and `max_duration` parameters
+### 3. Analyzing Specific Traces
+When you have trace IDs from search results:
+- Use `query_trace_by_id` to get full trace details
+- Examine spans for errors, slow operations, and bottlenecks
+### 4. Computing Metrics from Traces
+**TraceQL metrics** compute aggregated metrics from your trace data, helping you answer critical questions like:
+- How many database calls across all systems are downstream of your application?
+- What services beneath a given endpoint are failing?
+- What services beneath an endpoint are slow?
+TraceQL metrics parse your traces in aggregate to provide RED (Rate, Error, Duration) metrics from trace data.
+**Supported Functions:**
+- `rate` - Calculate rate of spans/traces
+- `count_over_time` - Count spans/traces over time
+- `sum_over_time` - Sum span attributes
+- `avg_over_time` - Average of span attributes
+- `max_over_time` - Maximum value over time
+- `min_over_time` - Minimum value over time
+- `quantile_over_time` - Calculate quantiles
+- `histogram_over_time` - Generate histogram data
+- `compare` - Compare metrics between time periods
+**Modifiers:**
+- `topk` - Return top N results
+- `bottomk` - Return bottom N results
+**TraceQL Metrics Query Examples:**
+1. **rate** - Calculate error rate by service and HTTP route:
+   ```
+   { resource.service.name = "foo" && status = error } | rate() by (span.http.route)
+   ```
+2. **count_over_time** - Count spans by HTTP status code:
+   ```
+   { name = "GET /:endpoint" } | count_over_time() by (span.http.status_code)
+   ```
+3. **sum_over_time** - Sum HTTP response sizes by service:
+   ```
+   { name = "GET /:endpoint" } | sum_over_time(span.http.response.size) by (resource.service.name)
+   ```
+4. **avg_over_time** - Average duration by HTTP status code:
+   ```
+   { name = "GET /:endpoint" } | avg_over_time(duration) by (span.http.status_code)
+   ```
+5. **max_over_time** - Maximum response size by HTTP target:
+   ```
+   { name = "GET /:endpoint" } | max_over_time(span.http.response.size) by (span.http.target)
+   ```
+6. **min_over_time** - Minimum duration by HTTP target:
+   ```
+   { name = "GET /:endpoint" } | min_over_time(duration) by (span.http.target)
+   ```
+7. **quantile_over_time** - Calculate multiple percentiles (99th, 90th, 50th) with exemplars:
+   ```
+   { span:name = "GET /:endpoint" } | quantile_over_time(duration, .99, .9, .5) by (span.http.target) with (exemplars=true)
+   ```
+8. **histogram_over_time** - Build duration histogram grouped by custom attribute:
+   ```
+   { name = "GET /:endpoint" } | histogram_over_time(duration) by (span.foo)
+   ```
+9. **compare** - Compare error spans against baseline (10 attributes):
+   ```
+   { resource.service.name="a" && span.http.path="/myapi" } | compare({status=error}, 10)
+   ```
+10. **Using topk modifier** - Find top 10 endpoints by request rate:
+    ```
+    { resource.service.name = "foo" } | rate() by (span.http.url) | topk(10)
+    ```
+**Choosing Between Instant and Range Queries:**
+**Instant Metrics** (`query_metrics_instant`) - Returns a single aggregated value for the entire time range. Use this when:
+- You need a total count or sum across the whole period
+- You want a single metric value (e.g., total error count, average latency)
+- You don't need to see how the metric changes over time
+- You're computing a KPI or summary statistic
+**Time Series Metrics** (`query_metrics_range`) - Returns values at regular intervals controlled by the 'step' parameter. Use this when:
+- You need to graph metrics over time or analyze trends
+- You want to see patterns, spikes, or changes in metrics
+- You're troubleshooting time-based issues
+- You need to correlate metrics with specific time periods
+## Special workflow for performance issues
+When investigating performance issues in kubernetes via traces, call fetch_tempo_traces_comparative_sample. This tool provides comprehensive analysis for identifying patterns.
+## Important Notes
+- TraceQL is the modern query language - prefer it over tag-based search
+- TraceQL metrics are computed from trace data, not traditional Prometheus metrics
+- TraceQL metrics is an experimental feature that computes RED (Rate, Error, Duration) metrics from trace data
+- Common attributes to use in queries: resource.service.name, span.http.route, span.http.status_code, span.http.target, status, name, duration
+- All timestamps can be Unix epoch seconds or RFC3339 format
+- Use time filters (start/end) to improve query performance
+- To get information about Kubernetes resources try these first: resource.service.name, resource.k8s.pod.name, resource.k8s.namespace.name, resource.k8s.deployment.name, resource.k8s.node.name, resource.k8s.container.name
+- TraceQL and TraceQL metrics language are complex. If you get empty data, try to simplify your query and try again!