holmesgpt 0.13.2__py3-none-any.whl → 0.18.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- holmes/__init__.py +3 -5
- holmes/clients/robusta_client.py +20 -6
- holmes/common/env_vars.py +58 -3
- holmes/common/openshift.py +1 -1
- holmes/config.py +123 -148
- holmes/core/conversations.py +71 -15
- holmes/core/feedback.py +191 -0
- holmes/core/investigation.py +31 -39
- holmes/core/investigation_structured_output.py +3 -3
- holmes/core/issue.py +1 -1
- holmes/core/llm.py +508 -88
- holmes/core/models.py +108 -4
- holmes/core/openai_formatting.py +14 -1
- holmes/core/prompt.py +48 -3
- holmes/core/runbooks.py +1 -0
- holmes/core/safeguards.py +8 -6
- holmes/core/supabase_dal.py +295 -100
- holmes/core/tool_calling_llm.py +489 -428
- holmes/core/tools.py +325 -56
- holmes/core/tools_utils/token_counting.py +21 -0
- holmes/core/tools_utils/tool_context_window_limiter.py +40 -0
- holmes/core/tools_utils/tool_executor.py +0 -13
- holmes/core/tools_utils/toolset_utils.py +1 -0
- holmes/core/toolset_manager.py +191 -5
- holmes/core/tracing.py +19 -3
- holmes/core/transformers/__init__.py +23 -0
- holmes/core/transformers/base.py +63 -0
- holmes/core/transformers/llm_summarize.py +175 -0
- holmes/core/transformers/registry.py +123 -0
- holmes/core/transformers/transformer.py +32 -0
- holmes/core/truncation/compaction.py +94 -0
- holmes/core/truncation/dal_truncation_utils.py +23 -0
- holmes/core/truncation/input_context_window_limiter.py +219 -0
- holmes/interactive.py +228 -31
- holmes/main.py +23 -40
- holmes/plugins/interfaces.py +2 -1
- holmes/plugins/prompts/__init__.py +2 -1
- holmes/plugins/prompts/_fetch_logs.jinja2 +31 -6
- holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
- holmes/plugins/prompts/_runbook_instructions.jinja2 +24 -12
- holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
- holmes/plugins/prompts/conversation_history_compaction.jinja2 +89 -0
- holmes/plugins/prompts/generic_ask.jinja2 +0 -4
- holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -1
- holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -1
- holmes/plugins/prompts/generic_investigation.jinja2 +0 -1
- holmes/plugins/prompts/investigation_procedure.jinja2 +50 -1
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -1
- holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -1
- holmes/plugins/runbooks/__init__.py +145 -17
- holmes/plugins/runbooks/catalog.json +2 -0
- holmes/plugins/sources/github/__init__.py +4 -2
- holmes/plugins/sources/prometheus/models.py +1 -0
- holmes/plugins/toolsets/__init__.py +44 -27
- holmes/plugins/toolsets/aks-node-health.yaml +46 -0
- holmes/plugins/toolsets/aks.yaml +64 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +38 -47
- holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
- holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
- holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
- holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
- holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -13
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +15 -12
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +15 -12
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +11 -11
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +11 -9
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +15 -12
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +15 -15
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +11 -8
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +11 -8
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +11 -8
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +11 -8
- holmes/plugins/toolsets/azure_sql/utils.py +0 -32
- holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
- holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
- holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
- holmes/plugins/toolsets/bash/bash_toolset.py +11 -15
- holmes/plugins/toolsets/bash/common/bash.py +23 -13
- holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
- holmes/plugins/toolsets/bash/common/stringify.py +1 -1
- holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
- holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
- holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
- holmes/plugins/toolsets/bash/parse_command.py +12 -13
- holmes/plugins/toolsets/cilium.yaml +284 -0
- holmes/plugins/toolsets/connectivity_check.py +124 -0
- holmes/plugins/toolsets/coralogix/api.py +132 -119
- holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
- holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
- holmes/plugins/toolsets/coralogix/utils.py +15 -79
- holmes/plugins/toolsets/datadog/datadog_api.py +525 -26
- holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +55 -11
- holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
- holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
- holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
- holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +417 -241
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +234 -214
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +167 -79
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +374 -363
- holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
- holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
- holmes/plugins/toolsets/elasticsearch/opensearch_ppl_query_docs.jinja2 +1616 -0
- holmes/plugins/toolsets/elasticsearch/opensearch_query_assist.py +78 -0
- holmes/plugins/toolsets/elasticsearch/opensearch_query_assist_instructions.jinja2 +223 -0
- holmes/plugins/toolsets/git.py +54 -50
- holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
- holmes/plugins/toolsets/grafana/common.py +13 -29
- holmes/plugins/toolsets/grafana/grafana_tempo_api.py +455 -0
- holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +25 -0
- holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +191 -0
- holmes/plugins/toolsets/grafana/loki_api.py +4 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +293 -89
- holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +49 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +820 -292
- holmes/plugins/toolsets/grafana/trace_parser.py +4 -3
- holmes/plugins/toolsets/internet/internet.py +15 -16
- holmes/plugins/toolsets/internet/notion.py +9 -11
- holmes/plugins/toolsets/investigator/core_investigation.py +44 -36
- holmes/plugins/toolsets/investigator/model.py +3 -1
- holmes/plugins/toolsets/json_filter_mixin.py +134 -0
- holmes/plugins/toolsets/kafka.py +36 -42
- holmes/plugins/toolsets/kubernetes.yaml +317 -113
- holmes/plugins/toolsets/kubernetes_logs.py +9 -9
- holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
- holmes/plugins/toolsets/logging_utils/logging_api.py +94 -8
- holmes/plugins/toolsets/mcp/toolset_mcp.py +218 -64
- holmes/plugins/toolsets/newrelic/new_relic_api.py +165 -0
- holmes/plugins/toolsets/newrelic/newrelic.jinja2 +65 -0
- holmes/plugins/toolsets/newrelic/newrelic.py +320 -0
- holmes/plugins/toolsets/openshift.yaml +283 -0
- holmes/plugins/toolsets/prometheus/prometheus.py +1202 -421
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +54 -5
- holmes/plugins/toolsets/prometheus/utils.py +28 -0
- holmes/plugins/toolsets/rabbitmq/api.py +23 -4
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +13 -14
- holmes/plugins/toolsets/robusta/robusta.py +239 -68
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +157 -27
- holmes/plugins/toolsets/service_discovery.py +1 -1
- holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
- holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
- holmes/plugins/toolsets/utils.py +88 -0
- holmes/utils/config_utils.py +91 -0
- holmes/utils/connection_utils.py +31 -0
- holmes/utils/console/result.py +10 -0
- holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
- holmes/utils/env.py +7 -0
- holmes/utils/file_utils.py +2 -1
- holmes/utils/global_instructions.py +60 -11
- holmes/utils/holmes_status.py +6 -4
- holmes/utils/holmes_sync_toolsets.py +0 -2
- holmes/utils/krr_utils.py +188 -0
- holmes/utils/log.py +15 -0
- holmes/utils/markdown_utils.py +2 -3
- holmes/utils/memory_limit.py +58 -0
- holmes/utils/sentry_helper.py +64 -0
- holmes/utils/stream.py +69 -8
- holmes/utils/tags.py +4 -3
- holmes/version.py +37 -15
- holmesgpt-0.18.4.dist-info/LICENSE +178 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +35 -31
- holmesgpt-0.18.4.dist-info/RECORD +258 -0
- holmes/core/performance_timing.py +0 -72
- holmes/plugins/toolsets/aws.yaml +0 -80
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -112
- holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -739
- holmes/plugins/toolsets/grafana/grafana_api.py +0 -42
- holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
- holmes/plugins/toolsets/newrelic.py +0 -231
- holmes/plugins/toolsets/opensearch/opensearch.py +0 -257
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -218
- holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
- holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
- holmes/plugins/toolsets/servicenow/install.md +0 -37
- holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
- holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
- holmes/utils/keygen_utils.py +0 -6
- holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
- holmesgpt-0.13.2.dist-info/RECORD +0 -234
- /holmes/plugins/toolsets/{opensearch → newrelic}/__init__.py +0 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/grafana/base_grafana_toolset.py

@@ -1,12 +1,11 @@
 import logging
+from abc import abstractmethod
 from typing import Any, ClassVar, Tuple, Type

 from holmes.core.tools import CallablePrerequisite, Tool, Toolset, ToolsetTag
 from holmes.plugins.toolsets.consts import TOOLSET_CONFIG_MISSING_ERROR
 from holmes.plugins.toolsets.grafana.common import GrafanaConfig

-from holmes.plugins.toolsets.grafana.grafana_api import grafana_health_check
-

 class BaseGrafanaToolset(Toolset):
     config_class: ClassVar[Type[GrafanaConfig]] = GrafanaConfig

@@ -39,16 +38,29 @@ class BaseGrafanaToolset(Toolset):

         try:
             self._grafana_config = self.config_class(**config)
-            return
+            return self.health_check()

         except Exception as e:
             logging.exception(f"Failed to set up grafana toolset {self.name}")
             return False, str(e)

+    @abstractmethod
+    def health_check(self) -> Tuple[bool, str]:
+        """
+        Check if the toolset is healthy and can connect to its data source.
+
+        Subclasses must implement this method to verify connectivity.
+        This method should NOT raise exceptions - catch them internally
+        and return (False, "error message") instead.
+
+        Returns:
+            Tuple[bool, str]: (True, "") on success, (False, "error message") on failure.
+        """
+        raise NotImplementedError("Subclasses must implement health_check()")
+
     def get_example_config(self):
         example_config = GrafanaConfig(
             api_key="YOUR API KEY",
             url="YOUR GRAFANA URL",
-            grafana_datasource_uid="UID OF DATASOURCE IN GRAFANA",
         )
         return example_config.model_dump()
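With this change, each concrete Grafana toolset must supply its own connectivity probe instead of relying on the removed grafana_health_check helper. A minimal sketch of a conforming subclass follows; the class name and the use of the Tempo echo endpoint are illustrative assumptions, not code from the package:

    # Hypothetical subclass sketch - illustrates the health_check() contract only.
    from typing import Tuple

    from holmes.plugins.toolsets.grafana.base_grafana_toolset import BaseGrafanaToolset
    from holmes.plugins.toolsets.grafana.grafana_tempo_api import GrafanaTempoAPI


    class ExampleTempoToolset(BaseGrafanaToolset):
        def health_check(self) -> Tuple[bool, str]:
            try:
                # Assumed wiring: probe Tempo's echo endpoint via the API wrapper added below.
                api = GrafanaTempoAPI(self._grafana_config)
                if api.query_echo_endpoint():
                    return True, ""
                return False, "Tempo /api/echo did not return HTTP 200"
            except Exception as e:
                # Per the docstring: never raise; return (False, "error message") instead.
                return False, str(e)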
holmes/plugins/toolsets/grafana/common.py

@@ -1,14 +1,11 @@
-import json
 from typing import Dict, Optional
-from pydantic import BaseModel
-import datetime

-from
+from pydantic import BaseModel


 class GrafanaConfig(BaseModel):
     """A config that represents one of the Grafana related tools like Loki or Tempo
-    If `grafana_datasource_uid` is set, then it is
+    If `grafana_datasource_uid` is set, then it is assumed that Holmes will proxy all
     requests through grafana. In this case `url` should be the grafana URL.
     If `grafana_datasource_uid` is not set, it is assumed that the `url` is the
     systems' URL

@@ -19,7 +16,7 @@ class GrafanaConfig(BaseModel):
     url: str
     grafana_datasource_uid: Optional[str] = None
     external_url: Optional[str] = None
-
+    verify_ssl: bool = True


 def build_headers(api_key: Optional[str], additional_headers: Optional[Dict[str, str]]):

@@ -36,19 +33,6 @@ def build_headers(api_key: Optional[str], additional_headers: Optional[Dict[str,
     return headers


-def format_log(log: Dict) -> str:
-    log_str = log.get("log", "")
-    timestamp_nanoseconds = log.get("timestamp")
-    if timestamp_nanoseconds:
-        timestamp_seconds = int(timestamp_nanoseconds) // 1_000_000_000
-        dt = datetime.datetime.fromtimestamp(timestamp_seconds)
-        log_str = dt.strftime("%Y-%m-%dT%H:%M:%SZ") + " " + log_str
-    else:
-        log_str = json.dumps(log)
-
-    return log_str
-
-
 def get_base_url(config: GrafanaConfig) -> str:
     if config.grafana_datasource_uid:
         return f"{config.url}/api/datasources/proxy/uid/{config.grafana_datasource_uid}"

@@ -56,13 +40,13 @@ def get_base_url(config: GrafanaConfig) -> str:
     return config.url


-
-
-
-
-
-
-
-
-
-
+class GrafanaTempoLabelsConfig(BaseModel):
+    pod: str = "k8s.pod.name"
+    namespace: str = "k8s.namespace.name"
+    deployment: str = "k8s.deployment.name"
+    node: str = "k8s.node.name"
+    service: str = "service.name"
+
+
+class GrafanaTempoConfig(GrafanaConfig):
+    labels: GrafanaTempoLabelsConfig = GrafanaTempoLabelsConfig()
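Because verify_ssl and the Tempo label mapping are ordinary Pydantic fields, a Tempo configuration can be constructed directly. A hedged example with placeholder values (field names are taken from the classes above; api_key and headers are inherited from the GrafanaConfig base):

    from holmes.plugins.toolsets.grafana.common import (
        GrafanaTempoConfig,
        GrafanaTempoLabelsConfig,
    )

    # Placeholder values. With grafana_datasource_uid set, url should be the Grafana URL
    # and requests are proxied through Grafana, per the docstring above.
    tempo_config = GrafanaTempoConfig(
        api_key="YOUR API KEY",
        url="https://grafana.example.com",
        grafana_datasource_uid="tempo-datasource-uid",
        verify_ssl=True,
        labels=GrafanaTempoLabelsConfig(service="service.name"),
    )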
holmes/plugins/toolsets/grafana/grafana_tempo_api.py (new file)

@@ -0,0 +1,455 @@
+"""Grafana Tempo API wrapper for querying traces and metrics."""
+
+import logging
+from typing import Any, Dict, Optional, Union
+from urllib.parse import quote
+
+import backoff
+import requests  # type: ignore
+
+from holmes.plugins.toolsets.grafana.common import (
+    GrafanaTempoConfig,
+    build_headers,
+    get_base_url,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class TempoAPIError(Exception):
+    """Custom exception for Tempo API errors with detailed response information."""
+
+    def __init__(self, status_code: int, response_text: str, url: str):
+        self.status_code = status_code
+        self.response_text = response_text
+        self.url = url
+
+        # Try to extract error message from JSON response
+        try:
+            import json
+
+            error_data = json.loads(response_text)
+            # Tempo may return errors in different formats
+            error_message = (
+                error_data.get("error")
+                or error_data.get("message")
+                or error_data.get("errorType")
+                or response_text
+            )
+        except (json.JSONDecodeError, TypeError):
+            error_message = response_text
+
+        super().__init__(f"Tempo API error {status_code}: {error_message}")
+
+
+class GrafanaTempoAPI:
+    """Python wrapper for Grafana Tempo REST API.
+
+    This class provides a clean interface to all Tempo API endpoints.
+    """
+
+    def __init__(self, config: GrafanaTempoConfig):
+        """Initialize the Tempo API wrapper.
+
+        Args:
+            config: GrafanaTempoConfig instance with connection details
+        """
+        self.config = config
+        self.base_url = get_base_url(config)
+        self.headers = build_headers(config.api_key, config.headers)
+
+    def _make_request(
+        self,
+        endpoint: str,
+        params: Optional[Dict[str, Any]] = None,
+        path_params: Optional[Dict[str, str]] = None,
+        timeout: int = 30,
+        retries: int = 3,
+    ) -> Dict[str, Any]:
+        """Make HTTP request to Tempo API with retry logic.
+
+        Args:
+            endpoint: API endpoint path (e.g., "/api/echo")
+            params: Query parameters
+            path_params: Parameters to substitute in the endpoint path
+            timeout: Request timeout in seconds
+            retries: Number of retry attempts
+
+        Returns:
+            JSON response from the API
+
+        Raises:
+            Exception: If the request fails after all retries
+        """
+        # Format endpoint with path parameters
+        if path_params:
+            for key, value in path_params.items():
+                endpoint = endpoint.replace(f"{{{key}}}", quote(str(value), safe=""))
+
+        url = f"{self.base_url}{endpoint}"
+
+        @backoff.on_exception(
+            backoff.expo,
+            requests.exceptions.RequestException,
+            max_tries=retries,
+            giveup=lambda e: isinstance(e, requests.exceptions.HTTPError)
+            and getattr(e, "response", None) is not None
+            and e.response.status_code < 500,
+        )
+        def make_request():
+            # GET request with query parameters
+            response = requests.get(
+                url,
+                headers=self.headers,
+                params=params,
+                timeout=timeout,
+                verify=self.config.verify_ssl,
+            )
+            response.raise_for_status()
+            return response.json()
+
+        try:
+            return make_request()
+        except requests.exceptions.HTTPError as e:
+            # Extract detailed error message from response
+            response = e.response
+            if response is not None:
+                logger.error(
+                    f"HTTP error {response.status_code} for {url}: {response.text}"
+                )
+                raise TempoAPIError(
+                    status_code=response.status_code,
+                    response_text=response.text,
+                    url=url,
+                )
+            else:
+                logger.error(f"Request failed for {url}: {e}")
+                raise
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Request failed for {url}: {e}")
+            raise
+
+    def query_echo_endpoint(self) -> bool:
+        """Query the echo endpoint to check Tempo status.
+
+        API Endpoint: GET /api/echo
+        HTTP Method: GET
+
+        Returns:
+            bool: True if endpoint returns 200 status code, False otherwise
+        """
+        url = f"{self.base_url}/api/echo"
+
+        try:
+            response = requests.get(
+                url,
+                headers=self.headers,
+                timeout=30,
+                verify=self.config.verify_ssl,
+            )
+
+            # Just check status code, don't try to parse JSON
+            return response.status_code == 200
+
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Request failed for {url}: {e}")
+            return False
+
+    def query_trace_by_id_v2(
+        self,
+        trace_id: str,
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """Query a trace by its ID.
+
+        API Endpoint: GET /api/v2/traces/{trace_id}
+        HTTP Method: GET
+
+        Args:
+            trace_id: The trace ID to retrieve
+            start: Optional start time in Unix epoch seconds
+            end: Optional end time in Unix epoch seconds
+
+        Returns:
+            dict: OpenTelemetry format trace data
+        """
+        params = {}
+        if start is not None:
+            params["start"] = str(start)
+        if end is not None:
+            params["end"] = str(end)
+
+        return self._make_request(
+            "/api/v2/traces/{trace_id}",
+            params=params,
+            path_params={"trace_id": trace_id},
+        )
+
+    def _search_traces_common(
+        self,
+        search_params: Dict[str, Any],
+        limit: Optional[int] = None,
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+        spss: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """Common search implementation for both tag and TraceQL searches.
+
+        Args:
+            search_params: The search-specific parameters (tags or q)
+            limit: Optional max number of traces to return
+            start: Optional start time in Unix epoch seconds
+            end: Optional end time in Unix epoch seconds
+            spss: Optional spans per span set
+
+        Returns:
+            dict: Search results with trace metadata
+        """
+        params = search_params.copy()
+
+        if limit is not None:
+            params["limit"] = str(limit)
+        if start is not None:
+            params["start"] = str(start)
+        if end is not None:
+            params["end"] = str(end)
+        if spss is not None:
+            params["spss"] = str(spss)
+
+        return self._make_request("/api/search", params=params)
+
+    def search_traces_by_tags(
+        self,
+        tags: str,
+        min_duration: Optional[str] = None,
+        max_duration: Optional[str] = None,
+        limit: Optional[int] = None,
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+        spss: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """Search for traces using tag-based search.
+
+        API Endpoint: GET /api/search
+        HTTP Method: GET
+
+        Args:
+            tags: logfmt-encoded span/process attributes (required)
+            min_duration: Optional minimum trace duration (e.g., "5s")
+            max_duration: Optional maximum trace duration
+            limit: Optional max number of traces to return
+            start: Optional start time in Unix epoch seconds
+            end: Optional end time in Unix epoch seconds
+            spss: Optional spans per span set
+
+        Returns:
+            dict: Search results with trace metadata
+        """
+        search_params = {"tags": tags}
+
+        # minDuration and maxDuration are only supported with tag-based search
+        if min_duration is not None:
+            search_params["minDuration"] = min_duration
+        if max_duration is not None:
+            search_params["maxDuration"] = max_duration
+
+        return self._search_traces_common(
+            search_params=search_params,
+            limit=limit,
+            start=start,
+            end=end,
+            spss=spss,
+        )
+
+    def search_traces_by_query(
+        self,
+        q: str,
+        limit: Optional[int] = None,
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+        spss: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """Search for traces using TraceQL query.
+
+        API Endpoint: GET /api/search
+        HTTP Method: GET
+
+        Note: minDuration and maxDuration are not supported with TraceQL queries.
+        Use the TraceQL query syntax to filter by duration instead.
+
+        Args:
+            q: TraceQL query (required)
+            limit: Optional max number of traces to return
+            start: Optional start time in Unix epoch seconds
+            end: Optional end time in Unix epoch seconds
+            spss: Optional spans per span set
+
+        Returns:
+            dict: Search results with trace metadata
+        """
+        return self._search_traces_common(
+            search_params={"q": q},
+            limit=limit,
+            start=start,
+            end=end,
+            spss=spss,
+        )
+
+    def search_tag_names_v2(
+        self,
+        scope: Optional[str] = None,
+        q: Optional[str] = None,
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+        limit: Optional[int] = None,
+        max_stale_values: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """Search for available tag names.
+
+        API Endpoint: GET /api/v2/search/tags
+        HTTP Method: GET
+
+        Args:
+            scope: Optional scope filter ("resource", "span", or "intrinsic")
+            q: Optional TraceQL query to filter tags
+            start: Optional start time in Unix epoch seconds
+            end: Optional end time in Unix epoch seconds
+            limit: Optional max number of tag names
+            max_stale_values: Optional max stale values parameter
+
+        Returns:
+            dict: Available tag names organized by scope
+        """
+        params = {}
+        if scope is not None:
+            params["scope"] = scope
+        if q is not None:
+            params["q"] = q
+        if start is not None:
+            params["start"] = str(start)
+        if end is not None:
+            params["end"] = str(end)
+        if limit is not None:
+            params["limit"] = str(limit)
+        if max_stale_values is not None:
+            params["maxStaleValues"] = str(max_stale_values)
+
+        return self._make_request("/api/v2/search/tags", params=params)
+
+    def search_tag_values_v2(
+        self,
+        tag: str,
+        q: Optional[str] = None,
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+        limit: Optional[int] = None,
+        max_stale_values: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """Search for values of a specific tag with optional TraceQL filtering.
+
+        API Endpoint: GET /api/v2/search/tag/{tag}/values
+        HTTP Method: GET
+
+        Args:
+            tag: The tag name to get values for (required)
+            q: Optional TraceQL query to filter tag values (e.g., '{resource.cluster="us-east-1"}')
+            start: Optional start time in Unix epoch seconds
+            end: Optional end time in Unix epoch seconds
+            limit: Optional max number of values
+            max_stale_values: Optional max stale values parameter
+
+        Returns:
+            dict: List of discovered values for the tag
+        """
+        params = {}
+        if q is not None:
+            params["q"] = q
+        if start is not None:
+            params["start"] = str(start)
+        if end is not None:
+            params["end"] = str(end)
+        if limit is not None:
+            params["limit"] = str(limit)
+        if max_stale_values is not None:
+            params["maxStaleValues"] = str(max_stale_values)
+
+        return self._make_request(
+            "/api/v2/search/tag/{tag}/values",
+            params=params,
+            path_params={"tag": tag},
+        )
+
+    def query_metrics_instant(
+        self,
+        q: str,
+        start: Optional[Union[int, str]] = None,
+        end: Optional[Union[int, str]] = None,
+        since: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """Query TraceQL metrics for an instant value.
+
+        Computes a single value across the entire time range.
+
+        API Endpoint: GET /api/metrics/query
+        HTTP Method: GET
+
+        Args:
+            q: TraceQL metrics query (required)
+            start: Optional start time (Unix seconds/nanoseconds/RFC3339)
+            end: Optional end time (Unix seconds/nanoseconds/RFC3339)
+            since: Optional duration string (e.g., "1h")
+
+        Returns:
+            dict: Single computed metric value
+        """
+        params = {"q": q}
+        if start is not None:
+            params["start"] = str(start)
+        if end is not None:
+            params["end"] = str(end)
+        if since is not None:
+            params["since"] = since
+
+        return self._make_request("/api/metrics/query", params=params)
+
+    def query_metrics_range(
+        self,
+        q: str,
+        step: Optional[str] = None,
+        start: Optional[Union[int, str]] = None,
+        end: Optional[Union[int, str]] = None,
+        since: Optional[str] = None,
+        exemplars: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """Query TraceQL metrics for a time series range.
+
+        Returns metrics computed at regular intervals over the time range.
+
+        API Endpoint: GET /api/metrics/query_range
+        HTTP Method: GET
+
+        Args:
+            q: TraceQL metrics query (required)
+            step: Optional time series granularity (e.g., "1m", "5m")
+            start: Optional start time (Unix seconds/nanoseconds/RFC3339)
+            end: Optional end time (Unix seconds/nanoseconds/RFC3339)
+            since: Optional duration string (e.g., "3h")
+            exemplars: Optional maximum number of exemplars to return
+
+        Returns:
+            dict: Time series of metric values
+        """
+        params = {"q": q}
+        if step is not None:
+            params["step"] = step
+        if start is not None:
+            params["start"] = str(start)
+        if end is not None:
+            params["end"] = str(end)
+        if since is not None:
+            params["since"] = since
+        if exemplars is not None:
+            params["exemplars"] = str(exemplars)
+
+        return self._make_request("/api/metrics/query_range", params=params)
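Taken together, the wrapper exposes one method per Tempo endpoint. A short usage sketch, assuming the search response follows Tempo's documented shape (a "traces" list whose entries carry a "traceID"); the service name, thresholds, and tempo_config are placeholders:

    import time

    from holmes.plugins.toolsets.grafana.grafana_tempo_api import GrafanaTempoAPI

    api = GrafanaTempoAPI(tempo_config)  # tempo_config as in the earlier config sketch

    # Connectivity probe (the same call a health_check() implementation could use).
    assert api.query_echo_endpoint()

    # TraceQL search for slow checkout traces over the last hour.
    now = int(time.time())
    results = api.search_traces_by_query(
        q='{resource.service.name="checkout" && duration > 2s}',
        start=now - 3600,
        end=now,
        limit=20,
    )
    for trace in results.get("traces", []):
        full_trace = api.query_trace_by_id_v2(trace["traceID"])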
holmes/plugins/toolsets/grafana/loki/instructions.jinja2 (new file)

@@ -0,0 +1,25 @@
+Grafana Loki is a multi-tenant log aggregation system designed to store and query logs from all your applications and infrastructure.
+
+**IMPORTANT WILDCARD USAGE:**
+- **ALWAYS use wildcards** when searching for pods unless you have the COMPLETE pod name with all suffixes
+- Kubernetes pod names include deployment hash + replica ID (e.g., `nginx-ingress-7b9899-x2km9`, `frontend-5f4d3b2a1-abc123`)
+- When user says "nginx pod" or "frontend pod", search for `nginx-*` or `frontend-*` NOT just `nginx` or `frontend`
+- Loki supports wildcards: `*` matches any characters (e.g., `nginx-*`, `*ingress*`, `*-x2km9`)
+- For partial matches, use wildcards on both sides: `*keyword*` to find logs from any pod containing "keyword"
+
+**When user provides what looks like a complete pod name** (e.g., `my-workload-5f9d8b7c4d-x2km9`):
+- Query Loki directly with that exact pod name
+- Do NOT try to verify if the pod exists in Kubernetes first
+- This allows querying historical pods that have been deleted/replaced
+
+* If asked to check for logs, you must always try 1-2 of the best queries you can construct to search for the logs.
+
+Loki indexes log lines using labels to help find relevant log lines.
+For example a default Kubernetes labels setup would look like that
+{namespace="prod", app="backend-api", container="api", pod="backend-api-68b7d9df9c-xyz12", stream="stdout"}
+
+
+### Time Parameters
+- Use RFC3339 format: `2023-03-01T10:30:00Z`
+- Or relative seconds: `-3600` for 1 hour ago
+- If no time range is specificed use last 4 hours as default time.