holmesgpt 0.13.2__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. holmes/__init__.py +3 -5
  2. holmes/clients/robusta_client.py +20 -6
  3. holmes/common/env_vars.py +58 -3
  4. holmes/common/openshift.py +1 -1
  5. holmes/config.py +123 -148
  6. holmes/core/conversations.py +71 -15
  7. holmes/core/feedback.py +191 -0
  8. holmes/core/investigation.py +31 -39
  9. holmes/core/investigation_structured_output.py +3 -3
  10. holmes/core/issue.py +1 -1
  11. holmes/core/llm.py +508 -88
  12. holmes/core/models.py +108 -4
  13. holmes/core/openai_formatting.py +14 -1
  14. holmes/core/prompt.py +48 -3
  15. holmes/core/runbooks.py +1 -0
  16. holmes/core/safeguards.py +8 -6
  17. holmes/core/supabase_dal.py +295 -100
  18. holmes/core/tool_calling_llm.py +489 -428
  19. holmes/core/tools.py +325 -56
  20. holmes/core/tools_utils/token_counting.py +21 -0
  21. holmes/core/tools_utils/tool_context_window_limiter.py +40 -0
  22. holmes/core/tools_utils/tool_executor.py +0 -13
  23. holmes/core/tools_utils/toolset_utils.py +1 -0
  24. holmes/core/toolset_manager.py +191 -5
  25. holmes/core/tracing.py +19 -3
  26. holmes/core/transformers/__init__.py +23 -0
  27. holmes/core/transformers/base.py +63 -0
  28. holmes/core/transformers/llm_summarize.py +175 -0
  29. holmes/core/transformers/registry.py +123 -0
  30. holmes/core/transformers/transformer.py +32 -0
  31. holmes/core/truncation/compaction.py +94 -0
  32. holmes/core/truncation/dal_truncation_utils.py +23 -0
  33. holmes/core/truncation/input_context_window_limiter.py +219 -0
  34. holmes/interactive.py +228 -31
  35. holmes/main.py +23 -40
  36. holmes/plugins/interfaces.py +2 -1
  37. holmes/plugins/prompts/__init__.py +2 -1
  38. holmes/plugins/prompts/_fetch_logs.jinja2 +31 -6
  39. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +24 -12
  41. holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
  42. holmes/plugins/prompts/conversation_history_compaction.jinja2 +89 -0
  43. holmes/plugins/prompts/generic_ask.jinja2 +0 -4
  44. holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -1
  45. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -1
  46. holmes/plugins/prompts/generic_investigation.jinja2 +0 -1
  47. holmes/plugins/prompts/investigation_procedure.jinja2 +50 -1
  48. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -1
  49. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -1
  50. holmes/plugins/runbooks/__init__.py +145 -17
  51. holmes/plugins/runbooks/catalog.json +2 -0
  52. holmes/plugins/sources/github/__init__.py +4 -2
  53. holmes/plugins/sources/prometheus/models.py +1 -0
  54. holmes/plugins/toolsets/__init__.py +44 -27
  55. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  56. holmes/plugins/toolsets/aks.yaml +64 -0
  57. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +38 -47
  58. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
  59. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  60. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
  61. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
  62. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
  63. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -13
  64. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +15 -12
  65. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +15 -12
  66. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +11 -11
  67. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +11 -9
  68. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +15 -12
  69. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +15 -15
  70. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +11 -8
  71. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +11 -8
  72. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +11 -8
  73. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +11 -8
  74. holmes/plugins/toolsets/azure_sql/utils.py +0 -32
  75. holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
  76. holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
  77. holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
  78. holmes/plugins/toolsets/bash/bash_toolset.py +11 -15
  79. holmes/plugins/toolsets/bash/common/bash.py +23 -13
  80. holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
  81. holmes/plugins/toolsets/bash/common/stringify.py +1 -1
  82. holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
  83. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
  84. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
  85. holmes/plugins/toolsets/bash/parse_command.py +12 -13
  86. holmes/plugins/toolsets/cilium.yaml +284 -0
  87. holmes/plugins/toolsets/connectivity_check.py +124 -0
  88. holmes/plugins/toolsets/coralogix/api.py +132 -119
  89. holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
  90. holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
  91. holmes/plugins/toolsets/coralogix/utils.py +15 -79
  92. holmes/plugins/toolsets/datadog/datadog_api.py +525 -26
  93. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +55 -11
  94. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
  95. holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
  96. holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
  97. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
  98. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +417 -241
  99. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +234 -214
  100. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +167 -79
  101. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +374 -363
  102. holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
  103. holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
  104. holmes/plugins/toolsets/elasticsearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  105. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist.py +78 -0
  106. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist_instructions.jinja2 +223 -0
  107. holmes/plugins/toolsets/git.py +54 -50
  108. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
  109. holmes/plugins/toolsets/grafana/common.py +13 -29
  110. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +455 -0
  111. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +25 -0
  112. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +191 -0
  113. holmes/plugins/toolsets/grafana/loki_api.py +4 -0
  114. holmes/plugins/toolsets/grafana/toolset_grafana.py +293 -89
  115. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +49 -0
  116. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  117. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +820 -292
  118. holmes/plugins/toolsets/grafana/trace_parser.py +4 -3
  119. holmes/plugins/toolsets/internet/internet.py +15 -16
  120. holmes/plugins/toolsets/internet/notion.py +9 -11
  121. holmes/plugins/toolsets/investigator/core_investigation.py +44 -36
  122. holmes/plugins/toolsets/investigator/model.py +3 -1
  123. holmes/plugins/toolsets/json_filter_mixin.py +134 -0
  124. holmes/plugins/toolsets/kafka.py +36 -42
  125. holmes/plugins/toolsets/kubernetes.yaml +317 -113
  126. holmes/plugins/toolsets/kubernetes_logs.py +9 -9
  127. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  128. holmes/plugins/toolsets/logging_utils/logging_api.py +94 -8
  129. holmes/plugins/toolsets/mcp/toolset_mcp.py +218 -64
  130. holmes/plugins/toolsets/newrelic/new_relic_api.py +165 -0
  131. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +65 -0
  132. holmes/plugins/toolsets/newrelic/newrelic.py +320 -0
  133. holmes/plugins/toolsets/openshift.yaml +283 -0
  134. holmes/plugins/toolsets/prometheus/prometheus.py +1202 -421
  135. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +54 -5
  136. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  137. holmes/plugins/toolsets/rabbitmq/api.py +23 -4
  138. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +13 -14
  139. holmes/plugins/toolsets/robusta/robusta.py +239 -68
  140. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  141. holmes/plugins/toolsets/runbook/runbook_fetcher.py +157 -27
  142. holmes/plugins/toolsets/service_discovery.py +1 -1
  143. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  144. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  145. holmes/plugins/toolsets/utils.py +88 -0
  146. holmes/utils/config_utils.py +91 -0
  147. holmes/utils/connection_utils.py +31 -0
  148. holmes/utils/console/result.py +10 -0
  149. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  150. holmes/utils/env.py +7 -0
  151. holmes/utils/file_utils.py +2 -1
  152. holmes/utils/global_instructions.py +60 -11
  153. holmes/utils/holmes_status.py +6 -4
  154. holmes/utils/holmes_sync_toolsets.py +0 -2
  155. holmes/utils/krr_utils.py +188 -0
  156. holmes/utils/log.py +15 -0
  157. holmes/utils/markdown_utils.py +2 -3
  158. holmes/utils/memory_limit.py +58 -0
  159. holmes/utils/sentry_helper.py +64 -0
  160. holmes/utils/stream.py +69 -8
  161. holmes/utils/tags.py +4 -3
  162. holmes/version.py +37 -15
  163. holmesgpt-0.18.4.dist-info/LICENSE +178 -0
  164. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +35 -31
  165. holmesgpt-0.18.4.dist-info/RECORD +258 -0
  166. holmes/core/performance_timing.py +0 -72
  167. holmes/plugins/toolsets/aws.yaml +0 -80
  168. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -112
  169. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
  170. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -739
  171. holmes/plugins/toolsets/grafana/grafana_api.py +0 -42
  172. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  173. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  174. holmes/plugins/toolsets/newrelic.py +0 -231
  175. holmes/plugins/toolsets/opensearch/opensearch.py +0 -257
  176. holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
  177. holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -218
  178. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
  179. holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
  180. holmes/plugins/toolsets/servicenow/install.md +0 -37
  181. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  182. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  183. holmes/utils/keygen_utils.py +0 -6
  184. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  185. holmesgpt-0.13.2.dist-info/RECORD +0 -234
  186. /holmes/plugins/toolsets/{opensearch → newrelic}/__init__.py +0 -0
  187. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
  188. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/prometheus/prometheus.py
@@ -1,27 +1,41 @@
 import json
 import logging
 import os
-import re
 import time
-import dateutil.parser
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, Optional, Tuple, Type, Union
 from urllib.parse import urljoin
 
+import dateutil.parser
 import requests  # type: ignore
-from pydantic import BaseModel, field_validator, Field, model_validator
-from requests import RequestException
+from prometrix.auth import PrometheusAuthorization
 from prometrix.connect.aws_connect import AWSPrometheusConnect
+from prometrix.models.prometheus_config import (
+    AzurePrometheusConfig as PrometrixAzureConfig,
+)
 from prometrix.models.prometheus_config import PrometheusConfig as BasePrometheusConfig
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from requests import RequestException
+from requests.exceptions import SSLError  # type: ignore
+
+from holmes.common.env_vars import IS_OPENSHIFT, MAX_GRAPH_POINTS
+from holmes.common.openshift import load_openshift_token
 from holmes.core.tools import (
     CallablePrerequisite,
     StructuredToolResult,
+    StructuredToolResultStatus,
     Tool,
+    ToolInvokeContext,
     ToolParameter,
-    ToolResultStatus,
     Toolset,
     ToolsetTag,
 )
+from holmes.core.tools_utils.token_counting import count_tool_response_tokens
+from holmes.core.tools_utils.tool_context_window_limiter import get_pct_token_count
 from holmes.plugins.toolsets.consts import STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION
+from holmes.plugins.toolsets.logging_utils.logging_api import (
+    DEFAULT_GRAPH_TIME_SPAN_SECONDS,
+)
+from holmes.plugins.toolsets.prometheus.utils import parse_duration_to_seconds
 from holmes.plugins.toolsets.service_discovery import PrometheusDiscovery
 from holmes.plugins.toolsets.utils import (
     get_param_or_raise,
@@ -30,31 +44,76 @@ from holmes.plugins.toolsets.utils import (
     toolset_name_for_one_liner,
 )
 from holmes.utils.cache import TTLCache
-from holmes.common.env_vars import IS_OPENSHIFT, MAX_GRAPH_POINTS
-from holmes.common.openshift import load_openshift_token
-from holmes.plugins.toolsets.logging_utils.logging_api import (
-    DEFAULT_GRAPH_TIME_SPAN_SECONDS,
-)
-from holmes.utils.keygen_utils import generate_random_key
 
 PROMETHEUS_RULES_CACHE_KEY = "cached_prometheus_rules"
+PROMETHEUS_METADATA_API_LIMIT = 100  # Default limit for Prometheus metadata APIs (series, labels, metadata) to prevent overwhelming responses
+# Default timeout values for PromQL queries
+DEFAULT_QUERY_TIMEOUT_SECONDS = 20
+MAX_QUERY_TIMEOUT_SECONDS = 180
+# Default timeout for metadata API calls (discovery endpoints)
+DEFAULT_METADATA_TIMEOUT_SECONDS = 20
+MAX_METADATA_TIMEOUT_SECONDS = 60
+# Default time window for metadata APIs (in hours)
+DEFAULT_METADATA_TIME_WINDOW_HRS = 1
+
+
+def format_ssl_error_message(prometheus_url: str, error: SSLError) -> str:
+    """Format a clear SSL error message with remediation steps."""
+    return (
+        f"SSL certificate verification failed when connecting to Prometheus at {prometheus_url}. "
+        f"Error: {str(error)}. "
+        f"To disable SSL verification, set 'verify_ssl: false' in your configuration. "
+        f"For Helm deployments, add this to your values.yaml:\n"
+        f"  toolsets:\n"
+        f"    prometheus/metrics:\n"
+        f"      config:\n"
+        f"        verify_ssl: false"
+    )
 
 
 class PrometheusConfig(BaseModel):
+    """Prometheus toolset configuration.
+
+    Deprecated config names (still accepted but not in schema):
+    - default_metadata_time_window_hrs -> discover_metrics_from_last_hours
+    - default_query_timeout_seconds -> query_timeout_seconds_default
+    - max_query_timeout_seconds -> query_timeout_seconds_hard_max
+    - default_metadata_timeout_seconds -> metadata_timeout_seconds_default
+    - max_metadata_timeout_seconds -> metadata_timeout_seconds_hard_max
+    - metrics_labels_time_window_hrs -> discover_metrics_from_last_hours
+    - prometheus_ssl_enabled -> verify_ssl
+    - metrics_labels_cache_duration_hrs (no longer used)
+    - fetch_labels_with_labels_api (no longer used)
+    - fetch_metadata_with_series_api (no longer used)
+    """
+
+    model_config = ConfigDict(extra="allow")
+
     # URL is optional because it can be set with an env var
-    prometheus_url: Optional[str]
-    healthcheck: str = "-/healthy"
-    # Setting to None will remove the time window from the request for labels
-    metrics_labels_time_window_hrs: Union[int, None] = 48
-    # Setting to None will disable the cache
-    metrics_labels_cache_duration_hrs: Union[int, None] = 12
-    fetch_labels_with_labels_api: bool = False
-    fetch_metadata_with_series_api: bool = False
+    prometheus_url: Optional[str] = None
+
+    # Discovery API time window - only return metrics with data in the last N hours
+    discover_metrics_from_last_hours: int = DEFAULT_METADATA_TIME_WINDOW_HRS
+
+    # Query timeout configuration
+    query_timeout_seconds_default: int = DEFAULT_QUERY_TIMEOUT_SECONDS
+    query_timeout_seconds_hard_max: int = MAX_QUERY_TIMEOUT_SECONDS
+
+    # Metadata API timeout configuration
+    metadata_timeout_seconds_default: int = DEFAULT_METADATA_TIMEOUT_SECONDS
+    metadata_timeout_seconds_hard_max: int = MAX_METADATA_TIMEOUT_SECONDS
+
     tool_calls_return_data: bool = True
     headers: Dict = Field(default_factory=dict)
-    rules_cache_duration_seconds: Union[int, None] = 1800  # 30 minutes
+    rules_cache_duration_seconds: Optional[int] = 1800  # 30 minutes
    additional_labels: Optional[Dict[str, str]] = None
-    prometheus_ssl_enabled: bool = True
+    verify_ssl: bool = True
+
+    # Custom limit to the max number of tokens that a query result can take to proactively
+    # prevent token limit issues. Expressed in % of the model's context window.
+    # This limit only overrides the global limit for all tools (TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_PCT)
+    # if it is lower.
+    query_response_size_limit_pct: Optional[int] = None
 
     @field_validator("prometheus_url")
     def ensure_trailing_slash(cls, v: Optional[str]) -> Optional[str]:
@@ -64,11 +123,52 @@ class PrometheusConfig(BaseModel):
 
     @model_validator(mode="after")
     def validate_prom_config(self):
+        # Handle deprecated config names passed as extra fields
+        # These are accepted via extra="allow" but not defined in schema
+        extra = self.model_extra or {}
+        deprecated_with_replacement = []
+
+        # Map of old names -> new names
+        deprecated_mappings = {
+            "default_metadata_time_window_hrs": "discover_metrics_from_last_hours",
+            "default_query_timeout_seconds": "query_timeout_seconds_default",
+            "max_query_timeout_seconds": "query_timeout_seconds_hard_max",
+            "default_metadata_timeout_seconds": "metadata_timeout_seconds_default",
+            "max_metadata_timeout_seconds": "metadata_timeout_seconds_hard_max",
+            "metrics_labels_time_window_hrs": "discover_metrics_from_last_hours",
+            "prometheus_ssl_enabled": "verify_ssl",
+        }
+
+        for old_name, new_name in deprecated_mappings.items():
+            if old_name in extra:
+                setattr(self, new_name, extra[old_name])
+                deprecated_with_replacement.append(f"{old_name} -> {new_name}")
+
+        if deprecated_with_replacement:
+            logging.warning(
+                f"Prometheus config uses deprecated names. Please update: "
+                f"{', '.join(deprecated_with_replacement)}"
+            )
+
+        # Check for deprecated config values that no longer have any effect
+        deprecated_no_effect = [
+            name
+            for name in [
+                "metrics_labels_cache_duration_hrs",
+                "fetch_labels_with_labels_api",
+                "fetch_metadata_with_series_api",
+            ]
+            if name in extra
+        ]
+
+        if deprecated_no_effect:
+            logging.warning(
+                f"The following Prometheus config values are deprecated and have no effect: "
+                f"{', '.join(deprecated_no_effect)}"
+            )
+
         # If openshift is enabled, and the user didn't configure auth headers, we will try to load the token from the service account.
         if IS_OPENSHIFT:
-            if self.healthcheck == "-/healthy":
-                self.healthcheck = "api/v1/query?query=up"
-
             if self.headers.get("Authorization"):
                 return self
 
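Reading the validator above: because `model_config` now sets `extra="allow"`, old config keys are still accepted, land in `model_extra`, and are copied onto their new fields with a deprecation warning. A minimal sketch of the resulting behavior (illustrative values; not part of the diff):

    # Illustrative sketch: deprecated keys are remapped when the model validates.
    cfg = PrometheusConfig(
        prometheus_url="http://prometheus:9090/",  # placeholder URL
        prometheus_ssl_enabled=False,              # deprecated name, accepted via extra="allow"
        metrics_labels_cache_duration_hrs=12,      # deprecated and now a no-op
    )
    assert cfg.verify_ssl is False  # copied from prometheus_ssl_enabled by validate_prom_config
    # Two warnings are logged: one listing "prometheus_ssl_enabled -> verify_ssl",
    # one noting metrics_labels_cache_duration_hrs no longer has any effect.
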
@@ -88,8 +188,7 @@ class AMPConfig(PrometheusConfig):
     aws_secret_access_key: Optional[str] = None
     aws_region: str
     aws_service_name: str = "aps"
-    healthcheck: str = "api/v1/query?query=up"
-    prometheus_ssl_enabled: bool = False
+    verify_ssl: bool = False
     assume_role_arn: Optional[str] = None
 
     # Refresh the AWS client (and its STS creds) every N seconds (default: 15 minutes)
@@ -113,7 +212,7 @@ class AMPConfig(PrometheusConfig):
         try:
             base_config = BasePrometheusConfig(
                 url=self.prometheus_url,
-                disable_ssl=not self.prometheus_ssl_enabled,
+                disable_ssl=not self.verify_ssl,
                 additional_labels=self.additional_labels,
             )
             self._aws_client = AWSPrometheusConnect(
@@ -132,12 +231,155 @@ class AMPConfig(PrometheusConfig):
         return self._aws_client
 
 
+class AzurePrometheusConfig(PrometheusConfig):
+    azure_resource: Optional[str] = None
+    azure_metadata_endpoint: Optional[str] = None
+    azure_token_endpoint: Optional[str] = None
+    azure_use_managed_id: bool = False
+    azure_client_id: Optional[str] = None
+    azure_client_secret: Optional[str] = None
+    azure_tenant_id: Optional[str] = None
+    verify_ssl: bool = True
+
+    # Refresh the Azure bearer token every N seconds (default: 15 minutes)
+    refresh_interval_seconds: int = 900
+
+    _prometrix_config: Optional[PrometrixAzureConfig] = None
+    _token_created_at: float = 0.0
+
+    @staticmethod
+    def _load_from_env_or_default(
+        config_value: Optional[str], env_var: str, default: Optional[str] = None
+    ) -> Optional[str]:
+        """Load value from config, environment variable, or use default."""
+        if config_value:
+            return config_value
+        return os.environ.get(env_var, default)
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        # Load from environment variables if not provided in config
+        self.azure_client_id = self._load_from_env_or_default(
+            self.azure_client_id, "AZURE_CLIENT_ID"
+        )
+        self.azure_tenant_id = self._load_from_env_or_default(
+            self.azure_tenant_id, "AZURE_TENANT_ID"
+        )
+        self.azure_client_secret = self._load_from_env_or_default(
+            self.azure_client_secret, "AZURE_CLIENT_SECRET"
+        )
+
+        # Set defaults from environment if not provided
+        self.azure_resource = self._load_from_env_or_default(
+            self.azure_resource,
+            "AZURE_RESOURCE",
+            "https://prometheus.monitor.azure.com",
+        )
+        # from https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/how-to-use-vm-token
+        self.azure_metadata_endpoint = self._load_from_env_or_default(
+            self.azure_metadata_endpoint,
+            "AZURE_METADATA_ENDPOINT",
+            "http://169.254.169.254/metadata/identity/oauth2/token",
+        )
+        self.azure_token_endpoint = self._load_from_env_or_default(
+            self.azure_token_endpoint, "AZURE_TOKEN_ENDPOINT"
+        )
+        if not self.azure_token_endpoint and self.azure_tenant_id:
+            self.azure_token_endpoint = (
+                f"https://login.microsoftonline.com/{self.azure_tenant_id}/oauth2/token"
+            )
+
+        # Check if managed identity should be used
+        if not self.azure_use_managed_id:
+            self.azure_use_managed_id = os.environ.get(
+                "AZURE_USE_MANAGED_ID", "false"
+            ).lower() in ("true", "1")
+
+        # Convert None to empty string for prometrix compatibility (prometrix checks != "")
+        azure_client_id = self.azure_client_id or ""
+        azure_tenant_id = self.azure_tenant_id or ""
+        azure_client_secret = self.azure_client_secret or ""
+        azure_resource = self.azure_resource or ""
+        azure_metadata_endpoint = self.azure_metadata_endpoint or ""
+        azure_token_endpoint = self.azure_token_endpoint or ""
+
+        # Create prometrix Azure config
+        self._prometrix_config = PrometrixAzureConfig(
+            url=self.prometheus_url,
+            azure_resource=azure_resource,
+            azure_metadata_endpoint=azure_metadata_endpoint,
+            azure_token_endpoint=azure_token_endpoint,
+            azure_use_managed_id=self.azure_use_managed_id,
+            azure_client_id=azure_client_id,
+            azure_client_secret=azure_client_secret,
+            azure_tenant_id=azure_tenant_id,
+            disable_ssl=not self.verify_ssl,
+            additional_labels=self.additional_labels,
+        )
+        # Ensure prometrix gets a real bool (not string) for managed identity
+        # fixing internal prometrix config issue
+        object.__setattr__(
+            self._prometrix_config,
+            "azure_use_managed_id",
+            bool(self.azure_use_managed_id),
+        )
+
+        PrometheusAuthorization.azure_authorization(self._prometrix_config)
+
+    @staticmethod
+    def is_azure_config(config: dict[str, Any]) -> bool:
+        """Check if config dict or environment variables indicate Azure Prometheus config."""
+        # Check for explicit Azure fields in config
+        if (
+            "azure_client_id" in config
+            or "azure_tenant_id" in config
+            or "azure_use_managed_id" in config
+        ):
+            return True
+
+        # Check for Azure environment variables
+        if os.environ.get("AZURE_CLIENT_ID") or os.environ.get("AZURE_TENANT_ID"):
+            return True
+
+        return False
+
+    def is_amp(self) -> bool:
+        return False
+
+    def _should_refresh_token(self) -> bool:
+        if not PrometheusAuthorization.bearer_token:
+            return True
+        return (time.time() - self._token_created_at) >= self.refresh_interval_seconds
+
+    def request_new_token(self) -> bool:
+        """Request a new Azure access token using prometrix."""
+        success = PrometheusAuthorization.request_new_token(self._prometrix_config)
+        if success:
+            self._token_created_at = time.time()
+        return success
+
+    def get_authorization_headers(self) -> Dict[str, str]:
+        # Request new token if needed
+        if self._should_refresh_token():
+            if not self.request_new_token():
+                logging.error("Failed to request new Azure access token")
+                return {}
+            self._token_created_at = time.time()
+
+        headers = PrometheusAuthorization.get_authorization_headers(
+            self._prometrix_config
+        )
+        if not headers.get("Authorization"):
+            logging.warning("No authorization header generated for Azure Prometheus")
+        return headers
+
+
 class BasePrometheusTool(Tool):
     toolset: "PrometheusToolset"
 
 
 def do_request(
-    config,  # PrometheusConfig | AMPConfig
+    config,  # PrometheusConfig | AMPConfig | AzurePrometheusConfig
     url: str,
     params: Optional[Dict] = None,
     data: Optional[Dict] = None,
@@ -149,17 +391,20 @@ def do_request(
     """
     Route a request through either:
     - AWSPrometheusConnect (SigV4) when config is AMPConfig
+    - Azure bearer token auth when config is AzurePrometheusConfig
     - plain requests otherwise
 
     method defaults to GET so callers can omit it for reads.
     """
     if verify is None:
-        verify = config.prometheus_ssl_enabled
+        verify = config.verify_ssl
     if headers is None:
         headers = config.headers or {}
 
     if isinstance(config, AMPConfig):
         client = config.get_aws_client()  # cached AWSPrometheusConnect
+        # Note: timeout parameter is not supported by prometrix's signed_request
+        # AWS/AMP requests will not respect the timeout setting
         return client.signed_request(  # type: ignore
             method=method,
             url=url,
@@ -169,7 +414,21 @@ def do_request(
             headers=headers,
         )
 
-    # Non-AMP: plain HTTP
+    if isinstance(config, AzurePrometheusConfig):
+        # Merge Azure authorization headers with provided headers
+        azure_headers = config.get_authorization_headers()
+        headers = {**azure_headers, **headers}
+        return requests.request(
+            method=method,
+            url=url,
+            headers=headers,
+            params=params,
+            data=data,
+            timeout=timeout,
+            verify=verify,
+        )
+
+    # Non-AMP, Non-Azure: plain HTTP
     return requests.request(
         method=method,
         url=url,
@@ -181,99 +440,6 @@ def do_request(
     )
 
 
-def filter_metrics_by_type(metrics: Dict, expected_type: str):
-    return {
-        metric_name: metric_data
-        for metric_name, metric_data in metrics.items()
-        if expected_type in metric_data.get("type", "")
-        or metric_data.get("type", "") == "?"
-    }
-
-
-def filter_metrics_by_name(metrics: Dict, pattern: str) -> Dict:
-    regex = re.compile(pattern)
-    return {
-        metric_name: metric_data
-        for metric_name, metric_data in metrics.items()
-        if regex.search(metric_name)
-    }
-
-
-METRICS_SUFFIXES_TO_STRIP = ["_bucket", "_count", "_sum"]
-
-
-def fetch_metadata(
-    prometheus_url: str,
-    headers: Optional[Dict],
-    config,
-    verify_ssl: bool = True,
-) -> Dict:
-    metadata_url = urljoin(prometheus_url, "api/v1/metadata")
-    metadata_response = do_request(
-        config=config,
-        url=metadata_url,
-        headers=headers,
-        timeout=60,
-        verify=verify_ssl,
-        method="GET",
-    )
-    metadata_response.raise_for_status()
-
-    metadata = metadata_response.json()["data"]
-
-    metrics = {}
-    for metric_name, meta_list in metadata.items():
-        if meta_list:
-            metric_type = meta_list[0].get("type", "unknown")
-            metric_description = meta_list[0].get("help", "unknown")
-            metrics[metric_name] = {
-                "type": metric_type,
-                "description": metric_description,
-                "labels": set(),
-            }
-
-    return metrics
-
-
-def fetch_metadata_with_series_api(
-    prometheus_url: str,
-    metric_name: str,
-    headers: Dict,
-    config,
-    verify_ssl: bool = True,
-) -> Dict:
-    url = urljoin(prometheus_url, "api/v1/series")
-    params: Dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}
-
-    response = do_request(
-        config=config,
-        url=url,
-        headers=headers,
-        params=params,
-        timeout=60,
-        verify=verify_ssl,
-        method="GET",
-    )
-    response.raise_for_status()
-    metrics = response.json()["data"]
-
-    metadata: Dict = {}
-    for metric_data in metrics:
-        metric_name = metric_data.get("__name__")
-        if not metric_name:
-            continue
-
-        metric = metadata.get(metric_name)
-        if not metric:
-            metric = {"description": "?", "type": "?", "labels": set()}
-            metadata[metric_name] = metric
-
-        labels = {k for k in metric_data.keys() if k != "__name__"}
-        metric["labels"].update(labels)
-
-    return metadata
-
-
 def result_has_data(result: Dict) -> bool:
     data = result.get("data", {})
     if len(data.get("result", [])) > 0:
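With the routing in `do_request` above, callers no longer branch on auth themselves: AMP configs go through SigV4 `signed_request`, Azure configs get bearer-token headers merged in, and everything else is a plain `requests.request`. A hedged usage sketch (URLs are placeholders; `api/v1/query` is the standard Prometheus HTTP API endpoint):

    # Illustrative only: exercising the three do_request branches from the diff above.
    plain = PrometheusConfig(prometheus_url="http://prometheus:9090/")  # placeholder
    resp = do_request(
        config=plain,
        url=urljoin(plain.prometheus_url, "api/v1/query"),
        params={"query": "up"},
        timeout=plain.query_timeout_seconds_default,
    )  # plain requests.request; verify defaults to plain.verify_ssl

    azure = AzurePrometheusConfig(
        prometheus_url="https://example.eastus.prometheus.monitor.azure.com/",  # placeholder
        azure_use_managed_id=True,
    )
    resp = do_request(
        config=azure,
        url=urljoin(azure.prometheus_url, "api/v1/query"),
        params={"query": "up"},
    )  # same call path, but config.get_authorization_headers() is merged into headers

    # An AMPConfig instead routes through the cached AWSPrometheusConnect.signed_request,
    # which, per the note in the diff, does not honor the timeout parameter.
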
@@ -284,33 +450,58 @@ def result_has_data(result: Dict) -> bool:
 def adjust_step_for_max_points(
     start_timestamp: str,
     end_timestamp: str,
-    step: float,
+    step: Optional[float] = None,
+    max_points_override: Optional[float] = None,
 ) -> float:
     """
     Adjusts the step parameter to ensure the number of data points doesn't exceed max_points.
-    Max points is controlled by the PROMETHEUS_MAX_GRAPH_POINTS environment variable (default: 300).
 
     Args:
         start_timestamp: RFC3339 formatted start time
         end_timestamp: RFC3339 formatted end time
-        step: The requested step duration in seconds
+        step: The requested step duration in seconds (None for auto-calculation)
+        max_points_override: Optional override for max points (must be <= MAX_GRAPH_POINTS)
 
     Returns:
         Adjusted step value in seconds that ensures points <= max_points
     """
+    # Use override if provided and valid, otherwise use default
+    max_points = MAX_GRAPH_POINTS
+    if max_points_override is not None:
+        if max_points_override > MAX_GRAPH_POINTS:
+            logging.warning(
+                f"max_points override ({max_points_override}) exceeds system limit ({MAX_GRAPH_POINTS}), using {MAX_GRAPH_POINTS}"
+            )
+            max_points = MAX_GRAPH_POINTS
+        elif max_points_override < 1:
+            logging.warning(
+                f"max_points override ({max_points_override}) is invalid, using default {MAX_GRAPH_POINTS}"
+            )
+            max_points = MAX_GRAPH_POINTS
+        else:
+            max_points = max_points_override
+            logging.debug(f"Using max_points override: {max_points}")
 
     start_dt = dateutil.parser.parse(start_timestamp)
     end_dt = dateutil.parser.parse(end_timestamp)
 
     time_range_seconds = (end_dt - start_dt).total_seconds()
 
+    # If no step provided, calculate a reasonable default
+    # Aim for ~60 data points across the time range (1 per minute for hourly, etc)
+    if step is None:
+        step = max(1, time_range_seconds / 60)
+        logging.debug(
+            f"No step provided, defaulting to {step}s for {time_range_seconds}s range"
+        )
+
     current_points = time_range_seconds / step
 
     # If current points exceed max, adjust the step
-    if current_points > MAX_GRAPH_POINTS:
-        adjusted_step = time_range_seconds / MAX_GRAPH_POINTS
+    if current_points > max_points:
+        adjusted_step = time_range_seconds / max_points
         logging.info(
-            f"Adjusting step from {step}s to {adjusted_step}s to limit points from {current_points:.0f} to {MAX_GRAPH_POINTS}"
+            f"Adjusting step from {step}s to {adjusted_step}s to limit points from {current_points:.0f} to {max_points}"
         )
         return adjusted_step
 
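A worked example of the adjustment above, assuming MAX_GRAPH_POINTS is 300 (the real value comes from holmes.common.env_vars, so it may differ):

    # 6-hour range = 21600 seconds; a 10s step would yield 2160 points (> 300),
    # so the step is stretched to 21600 / 300 = 72s.
    step = adjust_step_for_max_points(
        start_timestamp="2024-01-01T00:00:00Z",
        end_timestamp="2024-01-01T06:00:00Z",
        step=10,
    )
    assert step == 72.0

    # With step=None the auto-default aims for ~60 points: 21600 / 60 = 360s,
    # which is already under the cap. (The unchanged-step return path sits in
    # code elided from the hunk above.)
    adjust_step_for_max_points(
        start_timestamp="2024-01-01T00:00:00Z",
        end_timestamp="2024-01-01T06:00:00Z",
    )
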
@@ -324,185 +515,149 @@ def add_prometheus_auth(prometheus_auth_header: Optional[str]) -> Dict[str, Any]
     return results
 
 
-def fetch_metrics_labels_with_series_api(
-    prometheus_url: str,
-    headers: Dict[str, str],
-    cache: Optional[TTLCache],
-    metrics_labels_time_window_hrs: Union[int, None],
-    metric_name: str,
-    config=None,
-    verify_ssl: bool = True,
-) -> dict:
-    """This is a slow query. Takes 5+ seconds to run"""
-    cache_key = f"metrics_labels_series_api:{metric_name}"
-    if cache:
-        cached_result = cache.get(cache_key)
-        if cached_result:
-            return cached_result
-
-    series_url = urljoin(prometheus_url, "api/v1/series")
-    params: dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}
-
-    if metrics_labels_time_window_hrs is not None:
-        params["end"] = int(time.time())
-        params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
-
-    series_response = do_request(
-        config=config,
-        url=series_url,
-        headers=headers,
-        params=params,
-        timeout=60,
-        verify=verify_ssl,
-        method="GET",
-    )
-    series_response.raise_for_status()
-    series = series_response.json()["data"]
-
-    metrics_labels: dict = {}
-    for serie in series:
-        metric_name = serie["__name__"]
-        # Add all labels except __name__
-        labels = {k for k in serie.keys() if k != "__name__"}
-        if metric_name in metrics_labels:
-            metrics_labels[metric_name].update(labels)
-        else:
-            metrics_labels[metric_name] = labels
-    if cache:
-        cache.set(cache_key, metrics_labels)
-
-    return metrics_labels
-
-
-def fetch_metrics_labels_with_labels_api(
-    prometheus_url: str,
-    cache: Optional[TTLCache],
-    metrics_labels_time_window_hrs: Union[int, None],
-    metric_names: List[str],
-    headers: Dict,
-    config=None,
-    verify_ssl: bool = True,
-) -> dict:
-    metrics_labels = {}
-
-    for metric_name in metric_names:
-        cache_key = f"metrics_labels_labels_api:{metric_name}"
-        if cache:
-            cached_result = cache.get(cache_key)
-            if cached_result:
-                metrics_labels[metric_name] = cached_result
-
-        url = urljoin(prometheus_url, "api/v1/labels")
-        params: dict = {
-            "match[]": f'{{__name__="{metric_name}"}}',
-        }
-        if metrics_labels_time_window_hrs is not None:
-            params["end"] = int(time.time())
-            params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
+def create_data_summary_for_large_result(
+    result_data: Dict, query: str, data_size_tokens: int, is_range_query: bool = False
+) -> Dict[str, Any]:
+    """
+    Create a summary for large Prometheus results instead of returning full data.
 
-        response = do_request(
-            config=config,
-            url=url,
-            headers=headers,
-            params=params,
-            timeout=60,
-            verify=verify_ssl,
-            method="GET",
-        )
-        response.raise_for_status()
-        labels = response.json()["data"]
-        filtered_labels = {label for label in labels if label != "__name__"}
-        metrics_labels[metric_name] = filtered_labels
-
-        if cache:
-            cache.set(cache_key, filtered_labels)
-
-    return metrics_labels
-
-
-def fetch_metrics(
-    prometheus_url: str,
-    cache: Optional[TTLCache],
-    metrics_labels_time_window_hrs: Union[int, None],
-    metric_name: str,
-    should_fetch_labels_with_labels_api: bool,
-    should_fetch_metadata_with_series_api: bool,
-    headers: Dict,
-    config=None,
-    verify_ssl: bool = True,
-) -> dict:
-    metrics = None
-    should_fetch_labels = True
-    if should_fetch_metadata_with_series_api:
-        metrics = fetch_metadata_with_series_api(
-            prometheus_url=prometheus_url,
-            metric_name=metric_name,
-            headers=headers,
-            config=config,
-            verify_ssl=verify_ssl,
+    Args:
+        result_data: The Prometheus data result
+        query: The original PromQL query
+        data_size_tokens: Size of the data in tokens
+        is_range_query: Whether this is a range query (vs instant query)
+
+    Returns:
+        Dictionary with summary information and suggestions
+    """
+    if is_range_query:
+        series_list = result_data.get("result", [])
+        num_items = len(series_list)
+
+        # Calculate exact total data points across all series
+        total_points = 0
+        for series in series_list:  # Iterate through ALL series for exact count
+            points = len(series.get("values", []))
+            total_points += points
+
+        # Analyze label keys and their cardinality
+        label_cardinality: Dict[str, set] = {}
+        for series in series_list:
+            metric = series.get("metric", {})
+            for label_key, label_value in metric.items():
+                if label_key not in label_cardinality:
+                    label_cardinality[label_key] = set()
+                label_cardinality[label_key].add(label_value)
+
+        # Convert sets to counts for the summary
+        label_summary = {
+            label: len(values) for label, values in label_cardinality.items()
+        }
+        # Sort by cardinality (highest first) for better insights
+        label_summary = dict(
+            sorted(label_summary.items(), key=lambda x: x[1], reverse=True)
         )
-        should_fetch_labels = False  # series API returns the labels
+
+        return {
+            "message": f"Data too large to return ({data_size_tokens:,} tokens). Query returned {num_items} time series with {total_points:,} total data points.",
+            "series_count": num_items,
+            "total_data_points": total_points,
+            "data_size_tokens": data_size_tokens,
+            "label_cardinality": label_summary,
+            "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results to the top {min(5, num_items)} series. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "pod", "other", "", "")',
+        }
     else:
-        metrics = fetch_metadata(
-            prometheus_url=prometheus_url,
-            headers=headers,
-            config=config,
-            verify_ssl=verify_ssl,
+        # Instant query
+        result_type = result_data.get("resultType", "")
+        result_list = result_data.get("result", [])
+        num_items = len(result_list)
+
+        # Analyze label keys and their cardinality
+        instant_label_cardinality: Dict[str, set] = {}
+        for item in result_list:
+            if isinstance(item, dict):
+                metric = item.get("metric", {})
+                for label_key, label_value in metric.items():
+                    if label_key not in instant_label_cardinality:
+                        instant_label_cardinality[label_key] = set()
+                    instant_label_cardinality[label_key].add(label_value)
+
+        # Convert sets to counts for the summary
+        label_summary = {
+            label: len(values) for label, values in instant_label_cardinality.items()
+        }
+        # Sort by cardinality (highest first) for better insights
+        label_summary = dict(
+            sorted(label_summary.items(), key=lambda x: x[1], reverse=True)
         )
-        metrics = filter_metrics_by_name(metrics, metric_name)
 
-    if should_fetch_labels:
-        metrics_labels = {}
-        if should_fetch_labels_with_labels_api:
-            metrics_labels = fetch_metrics_labels_with_labels_api(
-                prometheus_url=prometheus_url,
-                cache=cache,
-                metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
-                metric_names=list(metrics.keys()),
-                headers=headers,
-                config=config,
-                verify_ssl=verify_ssl,
-            )
-        else:
-            metrics_labels = fetch_metrics_labels_with_series_api(
-                prometheus_url=prometheus_url,
-                cache=cache,
-                metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
-                metric_name=metric_name,
-                headers=headers,
-                config=config,
-                verify_ssl=verify_ssl,
-            )
+        return {
+            "message": f"Data too large to return ({data_size_tokens:,} tokens). Query returned {num_items} results.",
+            "result_count": num_items,
+            "result_type": result_type,
+            "data_size_tokens": data_size_tokens,
+            "label_cardinality": label_summary,
+            "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "instance", "other", "", "")',
+        }
 
-    for metric_name in metrics:
-        if metric_name in metrics_labels:
-            metrics[metric_name]["labels"] = metrics_labels[metric_name]
 
-    return metrics
+class MetricsBasedResponse(BaseModel):
+    status: str
+    error_message: Optional[str] = None
+    data: Optional[str] = None
+    tool_name: str
+    description: str
+    query: str
+    start: Optional[str] = None
+    end: Optional[str] = None
+    step: Optional[float] = None
+    output_type: Optional[str] = None
+    data_summary: Optional[dict[str, Any]] = None
+
+
+def create_structured_tool_result(
+    params: dict, response: MetricsBasedResponse
+) -> StructuredToolResult:
+    status = StructuredToolResultStatus.SUCCESS
+    error = None
+    if response.error_message or response.status.lower() in ("failed", "error"):
+        status = StructuredToolResultStatus.ERROR
+        error = (
+            response.error_message
+            if response.error_message
+            else "Unknown Prometheus error"
+        )
+    elif not response.data:
+        status = StructuredToolResultStatus.NO_DATA
+
+    return StructuredToolResult(
+        status=status,
+        data=response,
+        params=params,
+        error=error,
+    )
 
 
 class ListPrometheusRules(BasePrometheusTool):
     def __init__(self, toolset: "PrometheusToolset"):
         super().__init__(
             name="list_prometheus_rules",
-            description="List all defined prometheus rules. Will show the prometheus rules description, expression and annotations",
+            description="List all defined Prometheus rules (api/v1/rules). Will show the Prometheus rules description, expression and annotations",
             parameters={},
             toolset=toolset,
         )
         self._cache = None
 
-    def _invoke(
-        self, params: dict, user_approved: bool = False
-    ) -> StructuredToolResult:
+    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
         if self.toolset.config.is_amp():
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Tool not supported in AMP",
                 params=params,
             )
@@ -515,7 +670,7 @@ class ListPrometheusRules(BasePrometheusTool):
                 logging.debug("rules returned from cache")
 
                 return StructuredToolResult(
-                    status=ToolResultStatus.SUCCESS,
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=cached_rules,
                     params=params,
                 )
@@ -528,8 +683,8 @@ class ListPrometheusRules(BasePrometheusTool):
                 config=self.toolset.config,
                 url=rules_url,
                 params=params,
-                timeout=180,
-                verify=self.toolset.config.prometheus_ssl_enabled,
+                timeout=40,
+                verify=self.toolset.config.verify_ssl,
                 headers=self.toolset.config.headers,
                 method="GET",
             )
@@ -539,28 +694,35 @@ class ListPrometheusRules(BasePrometheusTool):
             if self._cache:
                 self._cache.set(PROMETHEUS_RULES_CACHE_KEY, data)
             return StructuredToolResult(
-                status=ToolResultStatus.SUCCESS,
+                status=StructuredToolResultStatus.SUCCESS,
                 data=data,
                 params=params,
             )
         except requests.Timeout:
             logging.warning("Timeout while fetching prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Request timed out while fetching rules",
                 params=params,
             )
+        except SSLError as e:
+            logging.warning("SSL error while fetching prometheus rules", exc_info=True)
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error=format_ssl_error_message(self.toolset.config.prometheus_url, e),
+                params=params,
+            )
         except RequestException as e:
             logging.warning("Failed to fetch prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Network error while fetching rules: {str(e)}",
                 params=params,
            )
         except Exception as e:
             logging.warning("Failed to process prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error: {str(e)}",
                 params=params,
             )
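For reference, the status mapping implemented by `create_structured_tool_result` (added in the hunk further above): any `error_message` or a `failed`/`error` status maps to ERROR, with "Unknown Prometheus error" as the fallback text; an empty `data` field maps to NO_DATA; everything else is SUCCESS. A small sketch with made-up field values (the `tool_name` here is hypothetical):

    # Illustrative sketch of the MetricsBasedResponse -> StructuredToolResult mapping.
    ok = MetricsBasedResponse(
        status="success",
        data='{"resultType": "vector", "result": []}',
        tool_name="example_query_tool",  # hypothetical name
        description="up by instance",
        query="up",
    )
    assert create_structured_tool_result({}, ok).status == StructuredToolResultStatus.SUCCESS

    no_data = ok.model_copy(update={"data": None})
    assert create_structured_tool_result({}, no_data).status == StructuredToolResultStatus.NO_DATA

    failed = ok.model_copy(update={"status": "error"})
    res = create_structured_tool_result({}, failed)
    assert res.status == StructuredToolResultStatus.ERROR
    assert res.error == "Unknown Prometheus error"
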
@@ -569,120 +731,553 @@ class ListPrometheusRules(BasePrometheusTool):
569
731
  return f"{toolset_name_for_one_liner(self.toolset.name)}: Fetch Rules"
570
732
 
571
733
 
572
- class ListAvailableMetrics(BasePrometheusTool):
734
+ class GetMetricNames(BasePrometheusTool):
735
+ """Thin wrapper around /api/v1/label/__name__/values - the fastest way to discover metric names"""
736
+
573
737
  def __init__(self, toolset: "PrometheusToolset"):
574
738
  super().__init__(
575
- name="list_available_metrics",
576
- description="List all the available metrics to query from prometheus, including their types (counter, gauge, histogram, summary) and available labels.",
739
+ name="get_metric_names",
740
+ description=(
741
+ "Get list of metric names using /api/v1/label/__name__/values. "
742
+ "FASTEST method for metric discovery when you need to explore available metrics. "
743
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} unique metric names (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use a more specific filter. "
744
+ f"ALWAYS use match[] parameter to filter metrics - without it you'll get random {PROMETHEUS_METADATA_API_LIMIT} metrics which is rarely useful. "
745
+ "Note: Does not return metric metadata (type, description, labels). "
746
+ "By default returns metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
747
+ ),
577
748
  parameters={
578
- "type_filter": ToolParameter(
579
- description="Optional filter to only return a specific metric type. Can be one of counter, gauge, histogram, summary",
749
+ "match": ToolParameter(
750
+ description=(
751
+ "REQUIRED: PromQL selector to filter metrics. Use regex OR (|) to check multiple patterns in one call - much faster than multiple calls! Examples: "
752
+ "'{__name__=~\"node_cpu.*|node_memory.*|node_disk.*\"}' for all node resource metrics, "
753
+ "'{__name__=~\"container_cpu.*|container_memory.*|container_network.*\"}' for all container metrics, "
754
+ "'{__name__=~\"kube_pod.*|kube_deployment.*|kube_service.*\"}' for multiple Kubernetes object metrics, "
755
+ "'{__name__=~\".*cpu.*|.*memory.*|.*disk.*\"}' for all resource metrics, "
756
+ "'{namespace=~\"kube-system|default|monitoring\"}' for metrics from multiple namespaces, "
757
+ "'{job=~\"prometheus|node-exporter|kube-state-metrics\"}' for metrics from multiple jobs."
758
+ ),
759
+ type="string",
760
+ required=True,
761
+ ),
762
+ "start": ToolParameter(
763
+ description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
580
764
  type="string",
581
765
  required=False,
582
766
  ),
583
- "name_filter": ToolParameter(
584
- description="Only the metrics partially or fully matching this name will be returned",
767
+ "end": ToolParameter(
768
+ description="End timestamp (RFC3339 or Unix). Default: now",
585
769
  type="string",
586
- required=True,
770
+ required=False,
587
771
  ),
588
772
  },
589
773
  toolset=toolset,
590
774
  )
591
- self._cache = None
592
775
 
593
- def _invoke(
594
- self, params: dict, user_approved: bool = False
595
- ) -> StructuredToolResult:
776
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
596
777
  if not self.toolset.config or not self.toolset.config.prometheus_url:
597
778
  return StructuredToolResult(
598
- status=ToolResultStatus.ERROR,
779
+ status=StructuredToolResultStatus.ERROR,
599
780
  error="Prometheus is not configured. Prometheus URL is missing",
600
781
  params=params,
601
782
  )
602
- if not self._cache and self.toolset.config.metrics_labels_cache_duration_hrs:
603
- self._cache = TTLCache(
604
- self.toolset.config.metrics_labels_cache_duration_hrs * 3600 # type: ignore
605
- )
606
783
  try:
607
- prometheus_url = self.toolset.config.prometheus_url
608
- metrics_labels_time_window_hrs = (
609
- self.toolset.config.metrics_labels_time_window_hrs
784
+ match_param = params.get("match")
785
+ if not match_param:
786
+ return StructuredToolResult(
787
+ status=StructuredToolResultStatus.ERROR,
788
+ error="Match parameter is required to filter metrics",
789
+ params=params,
790
+ )
791
+
792
+ url = urljoin(
793
+ self.toolset.config.prometheus_url, "api/v1/label/__name__/values"
610
794
  )
795
+ query_params = {
796
+ "limit": str(PROMETHEUS_METADATA_API_LIMIT),
797
+ "match[]": match_param,
798
+ }
799
+
800
+ # Add time parameters - use provided values or defaults
801
+ if params.get("end"):
802
+ query_params["end"] = params["end"]
803
+ else:
804
+ query_params["end"] = str(int(time.time()))
805
+
806
+ if params.get("start"):
807
+ query_params["start"] = params["start"]
808
+ elif self.toolset.config.discover_metrics_from_last_hours:
809
+ # Use default time window
810
+ query_params["start"] = str(
811
+ int(time.time())
812
+ - (self.toolset.config.discover_metrics_from_last_hours * 3600)
813
+ )
814
+
815
+ response = do_request(
816
+ config=self.toolset.config,
817
+ url=url,
818
+ params=query_params,
819
+ timeout=self.toolset.config.metadata_timeout_seconds_default,
820
+ verify=self.toolset.config.verify_ssl,
821
+ headers=self.toolset.config.headers,
822
+ method="GET",
823
+ )
824
+ response.raise_for_status()
825
+ data = response.json()
826
+
827
+ # Check if results were truncated
828
+ if (
829
+ "data" in data
830
+ and isinstance(data["data"], list)
831
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
832
+ ):
833
+ data["_truncated"] = True
834
+ data["_message"] = (
835
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use a more specific match filter to see additional metrics."
836
+ )
837
+
838
+ return StructuredToolResult(
839
+ status=StructuredToolResultStatus.SUCCESS,
+ data=data,
+ params=params,
+ )
+ except Exception as e:
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error=str(e),
+ params=params,
+ )
+
+ def get_parameterized_one_liner(self, params) -> str:
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Metric Names"
+
+
+ class GetLabelValues(BasePrometheusTool):
+ """Get values for a specific label across all metrics"""

- name_filter = params.get("name_filter")
- if not name_filter:
+ def __init__(self, toolset: "PrometheusToolset"):
+ super().__init__(
+ name="get_label_values",
+ description=(
+ "Get all values for a specific label using /api/v1/label/{label}/values. "
+ "Use this to discover pods, namespaces, jobs, instances, etc. "
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} unique values (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use match[] to filter. "
+ "Supports optional match[] parameter to filter. "
+ "By default returns values from metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
+ ),
+ parameters={
+ "label": ToolParameter(
+ description="Label name to get values for (e.g., 'pod', 'namespace', 'job', 'instance')",
+ type="string",
+ required=True,
+ ),
+ "match": ToolParameter(
+ description=(
+ "Optional PromQL selector to filter (e.g., '{__name__=~\"kube.*\"}', "
+ "'{namespace=\"default\"}')."
+ ),
+ type="string",
+ required=False,
+ ),
+ "start": ToolParameter(
+ description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
+ type="string",
+ required=False,
+ ),
+ "end": ToolParameter(
+ description="End timestamp (RFC3339 or Unix). Default: now",
+ type="string",
+ required=False,
+ ),
+ },
+ toolset=toolset,
+ )
+
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
+ if not self.toolset.config or not self.toolset.config.prometheus_url:
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error="Prometheus is not configured. Prometheus URL is missing",
+ params=params,
+ )
+ try:
+ label = params.get("label")
+ if not label:
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
- error="Error: cannot run tool 'list_available_metrics'. The param 'name_filter' is required but is missing.",
+ status=StructuredToolResultStatus.ERROR,
+ error="Label parameter is required",
  params=params,
  )

- metrics = fetch_metrics(
- prometheus_url=prometheus_url,
- cache=self._cache,
- metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
- metric_name=name_filter,
- should_fetch_labels_with_labels_api=self.toolset.config.fetch_labels_with_labels_api,
- should_fetch_metadata_with_series_api=self.toolset.config.fetch_metadata_with_series_api,
+ url = urljoin(
+ self.toolset.config.prometheus_url, f"api/v1/label/{label}/values"
+ )
+ query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
+ if params.get("match"):
+ query_params["match[]"] = params["match"]
+
+ # Add time parameters - use provided values or defaults
+ if params.get("end"):
+ query_params["end"] = params["end"]
+ else:
+ query_params["end"] = str(int(time.time()))
+
+ if params.get("start"):
+ query_params["start"] = params["start"]
+ elif self.toolset.config.discover_metrics_from_last_hours:
+ # Use default time window
+ query_params["start"] = str(
+ int(time.time())
+ - (self.toolset.config.discover_metrics_from_last_hours * 3600)
+ )
+
+ response = do_request(
+ config=self.toolset.config,
+ url=url,
+ params=query_params,
+ timeout=self.toolset.config.metadata_timeout_seconds_default,
+ verify=self.toolset.config.verify_ssl,
  headers=self.toolset.config.headers,
+ method="GET",
+ )
+ response.raise_for_status()
+ data = response.json()
+
+ # Check if results were truncated
+ if (
+ "data" in data
+ and isinstance(data["data"], list)
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+ ):
+ data["_truncated"] = True
+ data["_message"] = (
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use match[] parameter to filter label '{label}' values."
+ )
+
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.SUCCESS,
+ data=data,
+ params=params,
+ )
+ except Exception as e:
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error=str(e),
+ params=params,
+ )
+
+ def get_parameterized_one_liner(self, params) -> str:
+ label = params.get("label", "")
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get {label} Values"
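As context for the new tool: a minimal sketch of the /api/v1/label/{label}/values call that get_label_values wraps, assuming a Prometheus reachable at localhost:9090 and plain requests rather than the toolset's do_request helper (the URL and the 1000 limit are illustrative stand-ins):

import time
import requests

PROMETHEUS_URL = "http://localhost:9090"  # assumed local Prometheus
LIMIT = 1000  # stand-in for PROMETHEUS_METADATA_API_LIMIT

resp = requests.get(
    f"{PROMETHEUS_URL}/api/v1/label/pod/values",
    params={
        "limit": str(LIMIT),
        "match[]": '{namespace="default"}',    # optional selector filter
        "start": str(int(time.time()) - 3600),  # last hour, as the tool defaults to
        "end": str(int(time.time())),
    },
    timeout=10,
)
resp.raise_for_status()
print(resp.json()["data"][:5])  # first few pod names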
+
+
+ class GetAllLabels(BasePrometheusTool):
+ """Get all label names that exist in Prometheus"""
+
+ def __init__(self, toolset: "PrometheusToolset"):
+ super().__init__(
+ name="get_all_labels",
+ description=(
+ "Get list of all label names using /api/v1/labels. "
+ "Use this to discover what labels are available across all metrics. "
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} label names (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use match[] to filter. "
+ "Supports optional match[] parameter to filter. "
+ "By default returns labels from metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
+ ),
+ parameters={
+ "match": ToolParameter(
+ description=(
+ "Optional PromQL selector to filter (e.g., '{__name__=~\"kube.*\"}', "
+ "'{job=\"prometheus\"}')."
+ ),
+ type="string",
+ required=False,
+ ),
+ "start": ToolParameter(
+ description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
+ type="string",
+ required=False,
+ ),
+ "end": ToolParameter(
+ description="End timestamp (RFC3339 or Unix). Default: now",
+ type="string",
+ required=False,
+ ),
+ },
+ toolset=toolset,
+ )
+
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
+ if not self.toolset.config or not self.toolset.config.prometheus_url:
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error="Prometheus is not configured. Prometheus URL is missing",
+ params=params,
+ )
+ try:
+ url = urljoin(self.toolset.config.prometheus_url, "api/v1/labels")
+ query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
+ if params.get("match"):
+ query_params["match[]"] = params["match"]
+
+ # Add time parameters - use provided values or defaults
+ if params.get("end"):
+ query_params["end"] = params["end"]
+ else:
+ query_params["end"] = str(int(time.time()))
+
+ if params.get("start"):
+ query_params["start"] = params["start"]
+ elif self.toolset.config.discover_metrics_from_last_hours:
+ # Use default time window
+ query_params["start"] = str(
+ int(time.time())
+ - (self.toolset.config.discover_metrics_from_last_hours * 3600)
+ )
+
+ response = do_request(
  config=self.toolset.config,
- verify_ssl=self.toolset.config.prometheus_ssl_enabled,
+ url=url,
+ params=query_params,
+ timeout=self.toolset.config.metadata_timeout_seconds_default,
+ verify=self.toolset.config.verify_ssl,
+ headers=self.toolset.config.headers,
+ method="GET",
  )
+ response.raise_for_status()
+ data = response.json()
+
+ # Check if results were truncated
+ if (
+ "data" in data
+ and isinstance(data["data"], list)
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+ ):
+ data["_truncated"] = True
+ data["_message"] = (
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use match[] parameter to filter labels."
+ )
+
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.SUCCESS,
+ data=data,
+ params=params,
+ )
+ except Exception as e:
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error=str(e),
+ params=params,
+ )
+
+ def get_parameterized_one_liner(self, params) -> str:
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get All Labels"
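The same truncation check recurs inline in each discovery tool above. As a sketch, the pattern could be factored into a small helper like this (the helper name is hypothetical, not part of the package):

def flag_truncation(data: dict, limit: int, hint: str) -> dict:
    """Mark a metadata response whose 'data' list hit the limit.

    When exactly `limit` items come back, more may exist server-side,
    so the result is annotated rather than silently passed through.
    """
    items = data.get("data")
    if isinstance(items, list) and len(items) == limit:
        data["_truncated"] = True
        data["_message"] = f"Results truncated at limit={limit}. {hint}"
    return data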

- type_filter = params.get("type_filter")
- if type_filter:
- metrics = filter_metrics_by_type(metrics, type_filter)

- output = ["Metric | Description | Type | Labels"]
- output.append("-" * 100)
+ class GetSeries(BasePrometheusTool):
+ """Get time series matching a selector"""

- for metric, info in sorted(metrics.items()):
- labels_str = (
- ", ".join(sorted(info["labels"])) if info["labels"] else "none"
+ def __init__(self, toolset: "PrometheusToolset"):
+ super().__init__(
+ name="get_series",
+ description=(
+ "Get time series using /api/v1/series. "
+ "Returns label sets for all time series matching the selector. "
+ "SLOWER than other discovery methods - use only when you need full label sets. "
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} series (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more series exist - use more specific selector. "
+ "Requires match[] parameter with PromQL selector. "
+ "By default returns series active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
+ ),
+ parameters={
+ "match": ToolParameter(
+ description=(
+ "PromQL selector to match series (e.g., 'up', 'node_cpu_seconds_total', "
+ "'{__name__=~\"node.*\"}', '{job=\"prometheus\"}', "
+ '\'{__name__="up",job="prometheus"}\').'
+ ),
+ type="string",
+ required=True,
+ ),
+ "start": ToolParameter(
+ description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
+ type="string",
+ required=False,
+ ),
+ "end": ToolParameter(
+ description="End timestamp (RFC3339 or Unix). Default: now",
+ type="string",
+ required=False,
+ ),
+ },
+ toolset=toolset,
+ )
+
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
+ if not self.toolset.config or not self.toolset.config.prometheus_url:
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error="Prometheus is not configured. Prometheus URL is missing",
+ params=params,
+ )
+ try:
+ match = params.get("match")
+ if not match:
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error="Match parameter is required",
+ params=params,
  )
- output.append(
- f"{metric} | {info['description']} | {info['type']} | {labels_str}"
+
+ url = urljoin(self.toolset.config.prometheus_url, "api/v1/series")
+ query_params = {
+ "match[]": match,
+ "limit": str(PROMETHEUS_METADATA_API_LIMIT),
+ }
+
+ # Add time parameters - use provided values or defaults
+ if params.get("end"):
+ query_params["end"] = params["end"]
+ else:
+ query_params["end"] = str(int(time.time()))
+
+ if params.get("start"):
+ query_params["start"] = params["start"]
+ elif self.toolset.config.discover_metrics_from_last_hours:
+ # Use default time window
+ query_params["start"] = str(
+ int(time.time())
+ - (self.toolset.config.discover_metrics_from_last_hours * 3600)
  )

- table_output = "\n".join(output)
+ response = do_request(
+ config=self.toolset.config,
+ url=url,
+ params=query_params,
+ timeout=self.toolset.config.metadata_timeout_seconds_default,
+ verify=self.toolset.config.verify_ssl,
+ headers=self.toolset.config.headers,
+ method="GET",
+ )
+ response.raise_for_status()
+ data = response.json()
+
+ # Check if results were truncated
+ if (
+ "data" in data
+ and isinstance(data["data"], list)
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+ ):
+ data["_truncated"] = True
+ data["_message"] = (
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use a more specific match selector to see additional series."
+ )
+
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.SUCCESS,
+ data=data,
+ params=params,
+ )
+ except Exception as e:
  return StructuredToolResult(
- status=ToolResultStatus.SUCCESS,
- data=table_output,
+ status=StructuredToolResultStatus.ERROR,
+ error=str(e),
  params=params,
  )

- except requests.Timeout:
- logging.warn("Timeout while fetching prometheus metrics", exc_info=True)
+ def get_parameterized_one_liner(self, params) -> str:
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Series"
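For comparison with the cheaper label endpoints, a minimal sketch of the /api/v1/series call that get_series wraps, again with plain requests against an assumed local server (URL and limit illustrative):

import requests

resp = requests.get(
    "http://localhost:9090/api/v1/series",  # assumed endpoint for illustration
    params={"match[]": '{__name__=~"node.*"}', "limit": "1000"},
    timeout=10,
)
resp.raise_for_status()
for series in resp.json()["data"][:3]:
    print(series)  # full label set per series, e.g. {'__name__': 'node_load1', 'instance': ...}

This is the endpoint the tool description flags as SLOWER: the server must materialize every matching label set, so a tight match[] selector matters more here than elsewhere.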
+
+
+ class GetMetricMetadata(BasePrometheusTool):
+ """Get metadata (type, description, unit) for metrics"""
+
+ def __init__(self, toolset: "PrometheusToolset"):
+ super().__init__(
+ name="get_metric_metadata",
+ description=(
+ "Get metric metadata using /api/v1/metadata. "
+ "Returns type, help text, and unit for metrics. "
+ "Use after discovering metric names to get their descriptions. "
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} metrics (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - filter by specific metric name. "
+ "Supports optional metric name filter."
+ ),
+ parameters={
+ "metric": ToolParameter(
+ description=(
+ "Optional metric name to filter (e.g., 'up', 'node_cpu_seconds_total'). "
+ "If not provided, returns metadata for all metrics."
+ ),
+ type="string",
+ required=False,
+ ),
+ },
+ toolset=toolset,
+ )
+
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
+ if not self.toolset.config or not self.toolset.config.prometheus_url:
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
- error="Request timed out while fetching metrics",
+ status=StructuredToolResultStatus.ERROR,
+ error="Prometheus is not configured. Prometheus URL is missing",
  params=params,
  )
- except RequestException as e:
- logging.warn("Failed to fetch prometheus metrics", exc_info=True)
+ try:
+ url = urljoin(self.toolset.config.prometheus_url, "api/v1/metadata")
+ query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
+
+ if params.get("metric"):
+ query_params["metric"] = params["metric"]
+
+ response = do_request(
+ config=self.toolset.config,
+ url=url,
+ params=query_params,
+ timeout=self.toolset.config.metadata_timeout_seconds_default,
+ verify=self.toolset.config.verify_ssl,
+ headers=self.toolset.config.headers,
+ method="GET",
+ )
+ response.raise_for_status()
+ data = response.json()
+
+ # Check if results were truncated (metadata endpoint returns a dict, not a list)
+ if (
+ "data" in data
+ and isinstance(data["data"], dict)
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+ ):
+ data["_truncated"] = True
+ data["_message"] = (
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use metric parameter to filter by specific metric name."
+ )
+
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
- error=f"Network error while fetching metrics: {str(e)}",
+ status=StructuredToolResultStatus.SUCCESS,
+ data=data,
  params=params,
  )
  except Exception as e:
- logging.warn("Failed to process prometheus metrics", exc_info=True)
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
- error=f"Unexpected error: {str(e)}",
+ status=StructuredToolResultStatus.ERROR,
+ error=str(e),
  params=params,
  )

  def get_parameterized_one_liner(self, params) -> str:
- name_filter = params.get("name_filter", "")
- return f"{toolset_name_for_one_liner(self.toolset.name)}: Search Metrics ({name_filter})"
+ metric = params.get("metric", "all")
+ return (
+ f"{toolset_name_for_one_liner(self.toolset.name)}: Get Metadata ({metric})"
+ )
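Taken together, the five discovery tools that replace list_available_metrics mirror a typical manual workflow against the Prometheus HTTP API. A rough sketch of that chain with plain requests against an assumed local server (the package itself routes these through do_request with auth headers and configured timeouts):

import requests

base = "http://localhost:9090/api/v1"  # illustrative target

# 1. discover metric names (what get_metric_names wraps)
names = requests.get(f"{base}/label/__name__/values", timeout=10).json()["data"]

# 2. fetch type and help text for one of them (what get_metric_metadata wraps)
meta = requests.get(f"{base}/metadata", params={"metric": names[0]}, timeout=10).json()

# 3. only then run an actual PromQL query against it
result = requests.get(f"{base}/query", params={"query": names[0]}, timeout=10).json()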


  class ExecuteInstantQuery(BasePrometheusTool):
  def __init__(self, toolset: "PrometheusToolset"):
  super().__init__(
  name="execute_prometheus_instant_query",
- description="Execute an instant PromQL query",
+ description=(
+ f"Execute an instant PromQL query (single point in time). "
+ f"Default timeout is {DEFAULT_QUERY_TIMEOUT_SECONDS} seconds "
+ f"but can be increased up to {MAX_QUERY_TIMEOUT_SECONDS} seconds for complex/slow queries."
+ ),
  parameters={
  "query": ToolParameter(
  description="The PromQL query",
@@ -694,16 +1289,23 @@ class ExecuteInstantQuery(BasePrometheusTool):
  type="string",
  required=True,
  ),
+ "timeout": ToolParameter(
+ description=(
+ f"Query timeout in seconds. Default: {DEFAULT_QUERY_TIMEOUT_SECONDS}. "
+ f"Maximum: {MAX_QUERY_TIMEOUT_SECONDS}. "
+ f"Increase for complex queries that may take longer."
+ ),
+ type="number",
+ required=False,
+ ),
  },
  toolset=toolset,
  )

- def _invoke(
- self, params: dict, user_approved: bool = False
- ) -> StructuredToolResult:
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
  if not self.toolset.config or not self.toolset.config.prometheus_url:
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error="Prometheus is not configured. Prometheus URL is missing",
  params=params,
  )
@@ -715,13 +1317,25 @@ class ExecuteInstantQuery(BasePrometheusTool):

  payload = {"query": query}

+ # Get timeout parameter and enforce limits
+ default_timeout = self.toolset.config.query_timeout_seconds_default
+ max_timeout = self.toolset.config.query_timeout_seconds_hard_max
+ timeout = params.get("timeout", default_timeout)
+ if timeout > max_timeout:
+ timeout = max_timeout
+ logging.warning(
+ f"Timeout requested ({params.get('timeout')}) exceeds maximum ({max_timeout}s), using {max_timeout}s"
+ )
+ elif timeout < 1:
+ timeout = default_timeout # Min 1 second, but use default if invalid
+
  response = do_request(
  config=self.toolset.config,
  url=url,
  headers=self.toolset.config.headers,
  data=payload,
- timeout=60,
- verify=self.toolset.config.prometheus_ssl_enabled,
+ timeout=timeout,
+ verify=self.toolset.config.verify_ssl,
  method="POST",
  )
@@ -734,24 +1348,68 @@ class ExecuteInstantQuery(BasePrometheusTool):
  error_message = (
  "The prometheus query returned no result. Is the query correct?"
  )
- response_data = {
- "status": status,
- "error_message": error_message,
- "random_key": generate_random_key(),
- "tool_name": self.name,
- "description": description,
- "query": query,
- }
-
+ response_data = MetricsBasedResponse(
+ status=status,
+ error_message=error_message,
+ tool_name=self.name,
+ description=description,
+ query=query,
+ )
+ structured_tool_result: StructuredToolResult
+ # Check if data should be included based on size
  if self.toolset.config.tool_calls_return_data:
- response_data["data"] = data.get("data")
+ result_data = data.get("data", {})
+ response_data.data = result_data

- data_str = json.dumps(response_data, indent=2)
- return StructuredToolResult(
- status=ToolResultStatus.SUCCESS,
- data=data_str,
- params=params,
+ structured_tool_result = create_structured_tool_result(
+ params=params, response=response_data
+ )
+ tool_call_id = context.tool_call_id
+ tool_name = context.tool_name
+ token_count = count_tool_response_tokens(
+ llm=context.llm,
+ structured_tool_result=structured_tool_result,
+ tool_call_id=tool_call_id,
+ tool_name=tool_name,
+ )
+
+ token_limit = context.max_token_count
+ if self.toolset.config.query_response_size_limit_pct:
+ custom_token_limit = get_pct_token_count(
+ percent_of_total_context_window=self.toolset.config.query_response_size_limit_pct,
+ llm=context.llm,
+ )
+ if custom_token_limit < token_limit:
+ token_limit = custom_token_limit
+
+ # Provide summary if data is too large
+ if token_count > token_limit:
+ response_data.data = None
+ response_data.data_summary = (
+ create_data_summary_for_large_result(
+ result_data,
+ query,
+ token_count,
+ is_range_query=False,
+ )
+ )
+ logging.info(
+ f"Prometheus instant query returned large dataset: "
+ f"{response_data.data_summary.get('result_count', 0)} results, "
+ f"{token_count:,} tokens (limit: {token_limit:,}). "
+ f"Returning summary instead of full data."
+ )
+ # Also add token info to the summary for debugging
+ response_data.data_summary["_debug_info"] = (
+ f"Data size: {token_count:,} tokens exceeded limit of {token_limit:,} tokens"
+ )
+ else:
+ response_data.data = result_data
+
+ structured_tool_result = create_structured_tool_result(
+ params=params, response=response_data
  )
+ return structured_tool_result

  # Handle known Prometheus error status codes
  error_msg = "Unknown error occurred"
@@ -764,29 +1422,36 @@ class ExecuteInstantQuery(BasePrometheusTool):
  except json.JSONDecodeError:
  pass
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
  params=params,
  )

  # For other status codes, just return the status code and content
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
  params=params,
  )

+ except SSLError as e:
+ logging.warning("SSL error while executing Prometheus query", exc_info=True)
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error=format_ssl_error_message(self.toolset.config.prometheus_url, e),
+ params=params,
+ )
  except RequestException as e:
  logging.info("Failed to connect to Prometheus", exc_info=True)
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Connection error to Prometheus: {str(e)}",
  params=params,
  )
  except Exception as e:
  logging.info("Failed to connect to Prometheus", exc_info=True)
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Unexpected error executing query: {str(e)}",
  params=params,
  )
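Both query tools clamp the caller-supplied timeout the same way. A self-contained restatement of that rule, assuming the defaults shown in the example config further down (20s default, 180s hard maximum); the helper name is illustrative, not the package's:

from typing import Optional

def clamp_timeout(requested: Optional[float], default: float, hard_max: float) -> float:
    """Cap requested timeouts at the hard maximum; fall back to the
    default for missing or non-positive values."""
    timeout = requested if requested is not None else default
    if timeout > hard_max:
        return hard_max
    if timeout < 1:
        return default
    return timeout

assert clamp_timeout(None, 20, 180) == 20   # no request -> default
assert clamp_timeout(600, 20, 180) == 180   # over the cap -> hard max
assert clamp_timeout(0, 20, 180) == 20      # invalid -> default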
@@ -800,7 +1465,12 @@ class ExecuteRangeQuery(BasePrometheusTool):
  def __init__(self, toolset: "PrometheusToolset"):
  super().__init__(
  name="execute_prometheus_range_query",
- description="Generates a graph and Execute a PromQL range query",
+ description=(
+ f"Generates a graph and executes a PromQL range query. "
+ f"Default timeout is {DEFAULT_QUERY_TIMEOUT_SECONDS} seconds "
+ f"but can be increased up to {MAX_QUERY_TIMEOUT_SECONDS} seconds for complex/slow queries. "
+ f"Default time range is last 1 hour."
+ ),
  parameters={
  "query": ToolParameter(
  description="The PromQL query",
@@ -827,23 +1497,40 @@ class ExecuteRangeQuery(BasePrometheusTool):
  "step": ToolParameter(
  description="Query resolution step width in duration format or float number of seconds",
  type="number",
- required=True,
+ required=False,
  ),
  "output_type": ToolParameter(
  description="Specifies how to interpret the Prometheus result. Use 'Plain' for raw values, 'Bytes' to format byte values, 'Percentage' to scale 0–1 values into 0–100%, or 'CPUUsage' to convert values to cores (e.g., 500 becomes 500m, 2000 becomes 2).",
  type="string",
  required=True,
  ),
+ "timeout": ToolParameter(
+ description=(
+ f"Query timeout in seconds. Default: {DEFAULT_QUERY_TIMEOUT_SECONDS}. "
+ f"Maximum: {MAX_QUERY_TIMEOUT_SECONDS}. "
+ f"Increase for complex queries that may take longer."
+ ),
+ type="number",
+ required=False,
+ ),
+ "max_points": ToolParameter(
+ description=(
+ f"Maximum number of data points to return. Default: {int(MAX_GRAPH_POINTS)}. "
+ f"Can be reduced to get fewer data points (e.g., 50 for simpler graphs). "
+ f"Cannot exceed system limit of {int(MAX_GRAPH_POINTS)}. "
+ f"If your query would return more points than this limit, the step will be automatically adjusted."
+ ),
+ type="number",
+ required=False,
+ ),
  },
  toolset=toolset,
  )

- def _invoke(
- self, params: dict, user_approved: bool = False
- ) -> StructuredToolResult:
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
  if not self.toolset.config or not self.toolset.config.prometheus_url:
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error="Prometheus is not configured. Prometheus URL is missing",
  params=params,
  )
@@ -857,12 +1544,17 @@ class ExecuteRangeQuery(BasePrometheusTool):
  end_timestamp=params.get("end"),
  default_time_span_seconds=DEFAULT_GRAPH_TIME_SPAN_SECONDS,
  )
- step = params.get("step", "")
+ step = parse_duration_to_seconds(params.get("step"))
+ max_points = params.get(
+ "max_points"
+ ) # Get the optional max_points parameter

+ # adjust_step_for_max_points handles None case and converts to float
  step = adjust_step_for_max_points(
  start_timestamp=start,
  end_timestamp=end,
- step=float(step) if step else MAX_GRAPH_POINTS,
+ step=step,
+ max_points_override=max_points,
  )

  description = params.get("description", "")
@@ -874,13 +1566,25 @@ class ExecuteRangeQuery(BasePrometheusTool):
  "step": step,
  }

+ # Get timeout parameter and enforce limits
+ default_timeout = self.toolset.config.query_timeout_seconds_default
+ max_timeout = self.toolset.config.query_timeout_seconds_hard_max
+ timeout = params.get("timeout", default_timeout)
+ if timeout > max_timeout:
+ timeout = max_timeout
+ logging.warning(
+ f"Timeout requested ({params.get('timeout')}) exceeds maximum ({max_timeout}s), using {max_timeout}s"
+ )
+ elif timeout < 1:
+ timeout = default_timeout # Min 1 second, but use default if invalid
+
  response = do_request(
  config=self.toolset.config,
  url=url,
  headers=self.toolset.config.headers,
  data=payload,
- timeout=120,
- verify=self.toolset.config.prometheus_ssl_enabled,
+ timeout=timeout,
+ verify=self.toolset.config.verify_ssl,
  method="POST",
  )
@@ -893,29 +1597,73 @@ class ExecuteRangeQuery(BasePrometheusTool):
  error_message = (
  "The prometheus query returned no result. Is the query correct?"
  )
- response_data = {
- "status": status,
- "error_message": error_message,
- "random_key": generate_random_key(),
- "tool_name": self.name,
- "description": description,
- "query": query,
- "start": start,
- "end": end,
- "step": step,
- "output_type": output_type,
- }
+ response_data = MetricsBasedResponse(
+ status=status,
+ error_message=error_message,
+ tool_name=self.name,
+ description=description,
+ query=query,
+ start=start,
+ end=end,
+ step=step,
+ output_type=output_type,
+ )
+
+ structured_tool_result: StructuredToolResult

+ # Check if data should be included based on size
  if self.toolset.config.tool_calls_return_data:
- response_data["data"] = data.get("data")
- data_str = json.dumps(response_data, indent=2)
+ result_data = data.get("data", {})
+ response_data.data = result_data
+ structured_tool_result = create_structured_tool_result(
+ params=params, response=response_data
+ )

- return StructuredToolResult(
- status=ToolResultStatus.SUCCESS,
- data=data_str,
- params=params,
+ tool_call_id = context.tool_call_id
+ tool_name = context.tool_name
+ token_count = count_tool_response_tokens(
+ llm=context.llm,
+ structured_tool_result=structured_tool_result,
+ tool_call_id=tool_call_id,
+ tool_name=tool_name,
+ )
+
+ token_limit = context.max_token_count
+ if self.toolset.config.query_response_size_limit_pct:
+ custom_token_limit = get_pct_token_count(
+ percent_of_total_context_window=self.toolset.config.query_response_size_limit_pct,
+ llm=context.llm,
+ )
+ if custom_token_limit < token_limit:
+ token_limit = custom_token_limit
+
+ # Provide summary if data is too large
+ if token_count > token_limit:
+ response_data.data = None
+ response_data.data_summary = (
+ create_data_summary_for_large_result(
+ result_data, query, token_count, is_range_query=True
+ )
+ )
+ logging.info(
+ f"Prometheus range query returned large dataset: "
+ f"{response_data.data_summary.get('series_count', 0)} series, "
+ f"{token_count:,} tokens (limit: {token_limit:,}). "
+ f"Returning summary instead of full data."
+ )
+ # Also add token info to the summary for debugging
+ response_data.data_summary["_debug_info"] = (
+ f"Data size: {token_count:,} tokens exceeded limit of {token_limit:,} tokens"
+ )
+ else:
+ response_data.data = result_data
+
+ structured_tool_result = create_structured_tool_result(
+ params=params, response=response_data
  )

+ return structured_tool_result
+
  error_msg = "Unknown error occurred"
  if response.status_code in [400, 429]:
  try:
@@ -926,28 +1674,37 @@ class ExecuteRangeQuery(BasePrometheusTool):
  except json.JSONDecodeError:
  pass
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
  params=params,
  )

  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
  params=params,
  )

+ except SSLError as e:
+ logging.warning(
+ "SSL error while executing Prometheus range query", exc_info=True
+ )
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error=format_ssl_error_message(self.toolset.config.prometheus_url, e),
+ params=params,
+ )
  except RequestException as e:
  logging.info("Failed to connect to Prometheus", exc_info=True)
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Connection error to Prometheus: {str(e)}",
  params=params,
  )
  except Exception as e:
  logging.info("Failed to connect to Prometheus", exc_info=True)
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Unexpected error executing query: {str(e)}",
  params=params,
  )
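The size guard added to both query tools reduces to one rule: the effective token budget is the per-tool limit from the invoke context, optionally tightened by query_response_size_limit_pct of the model's context window, and results over budget are replaced by a summary. A minimal restatement (names illustrative):

from typing import Optional

def fit_or_summarize(payload_tokens: int, context_limit: int,
                     pct_limit: Optional[int]) -> str:
    """Return 'summary' when the tool response would blow the budget,
    otherwise 'full data'."""
    budget = context_limit
    if pct_limit is not None:
        budget = min(budget, pct_limit)  # the stricter limit wins
    return "summary" if payload_tokens > budget else "full data"

assert fit_or_summarize(5_000, 8_000, None) == "full data"
assert fit_or_summarize(5_000, 8_000, 2_000) == "summary"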
@@ -958,7 +1715,7 @@ class ExecuteRangeQuery(BasePrometheusTool):


  class PrometheusToolset(Toolset):
- config: Optional[Union[PrometheusConfig, AMPConfig]] = None
+ config: Optional[Union[PrometheusConfig, AMPConfig, AzurePrometheusConfig]] = None

  def __init__(self):
  super().__init__(
@@ -969,7 +1726,11 @@ class PrometheusToolset(Toolset):
  prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
  tools=[
  ListPrometheusRules(toolset=self),
- ListAvailableMetrics(toolset=self),
+ GetMetricNames(toolset=self),
+ GetLabelValues(toolset=self),
+ GetAllLabels(toolset=self),
+ GetSeries(toolset=self),
+ GetMetricMetadata(toolset=self),
  ExecuteInstantQuery(toolset=self),
  ExecuteRangeQuery(toolset=self),
  ],
@@ -987,16 +1748,36 @@ class PrometheusToolset(Toolset):

  def determine_prometheus_class(
  self, config: dict[str, Any]
- ) -> Type[Union[PrometheusConfig, AMPConfig]]:
+ ) -> Type[Union[PrometheusConfig, AMPConfig, AzurePrometheusConfig]]:
  has_aws_fields = "aws_region" in config
- return AMPConfig if has_aws_fields else PrometheusConfig
+ if has_aws_fields:
+ return AMPConfig
+
+ # Check for Azure config using static method
+ is_azure = AzurePrometheusConfig.is_azure_config(config)
+ if is_azure:
+ logging.info("Detected Azure Managed Prometheus configuration")
+ return AzurePrometheusConfig if is_azure else PrometheusConfig
+
+ def _disable_azure_incompatible_tools(self):
+ """
+ Azure Managed Prometheus does not support some APIs.
+ Remove unsupported tools.
+ """
+ incompatible = {
+ "get_label_values",
+ "get_metric_metadata",
+ "list_prometheus_rules",
+ }
+ self.tools = [t for t in self.tools if t.name not in incompatible]
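In effect, config-class selection is a three-way decision followed by tool pruning on Azure. A runnable sketch of that order (class names as strings for illustration; looks_like_azure is a stand-in, since AzurePrometheusConfig.is_azure_config's real criteria live in the package):

def looks_like_azure(config: dict) -> bool:
    # Stand-in for AzurePrometheusConfig.is_azure_config; here, any
    # azure_* key marks the config as Azure Managed Prometheus.
    return any(key.startswith("azure_") for key in config)

def pick_config_class(config: dict) -> str:
    """AWS fields win, then Azure detection, otherwise vanilla Prometheus."""
    if "aws_region" in config:
        return "AMPConfig"
    if looks_like_azure(config):
        return "AzurePrometheusConfig"
    return "PrometheusConfig"

assert pick_config_class({"aws_region": "us-east-1"}) == "AMPConfig"
assert pick_config_class({"prometheus_url": "http://prom:9090"}) == "PrometheusConfig"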

  def prerequisites_callable(self, config: dict[str, Any]) -> Tuple[bool, str]:
  try:
  if config:
  config_cls = self.determine_prometheus_class(config)
  self.config = config_cls(**config) # type: ignore
-
+ if isinstance(self.config, AzurePrometheusConfig):
+ self._disable_azure_incompatible_tools()
  self._reload_llm_instructions()
  return self._is_healthy()
  except Exception:
@@ -1041,14 +1822,14 @@ class PrometheusToolset(Toolset):
  f"Toolset {self.name} failed to initialize because prometheus is not configured correctly",
  )

- url = urljoin(self.config.prometheus_url, self.config.healthcheck)
+ url = urljoin(self.config.prometheus_url, "api/v1/query?query=up")
  try:
  response = do_request(
  config=self.config,
  url=url,
  headers=self.config.headers,
  timeout=10,
- verify=self.config.prometheus_ssl_enabled,
+ verify=self.config.verify_ssl,
  method="GET",
  )
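The health check now issues a trivial instant query ('up') instead of hitting a configurable healthcheck path, which also works on managed backends that lack the classic /-/healthy endpoint. A minimal sketch with plain requests (the package routes this through do_request with headers and SSL settings):

import requests

def prometheus_is_healthy(base_url: str) -> bool:
    """Probe a Prometheus-compatible endpoint with a cheap instant query."""
    resp = requests.get(
        f"{base_url.rstrip('/')}/api/v1/query",
        params={"query": "up"},
        timeout=10,
    )
    return resp.status_code == 200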
@@ -1060,13 +1841,8 @@ class PrometheusToolset(Toolset):
  f"Failed to connect to Prometheus at {url}: HTTP {response.status_code}",
  )

- except RequestException:
- return (
- False,
- f"Failed to initialize using url={url}",
- )
  except Exception as e:
- logging.exception("Failed to initialize Prometheus")
+ logging.debug("Failed to initialize Prometheus", exc_info=True)
  return (
  False,
  f"Failed to initialize using url={url}. Unexpected error: {str(e)}",
@@ -1074,6 +1850,11 @@ class PrometheusToolset(Toolset):

  def get_example_config(self):
  example_config = PrometheusConfig(
- prometheus_url="http://robusta-kube-prometheus-st-prometheus:9090"
+ prometheus_url="http://prometheus-server.monitoring.svc.cluster.local:9090",
+ headers={"Authorization": "Basic <base64_encoded_credentials>"},
+ discover_metrics_from_last_hours=1,
+ query_timeout_seconds_default=20,
+ query_timeout_seconds_hard_max=180,
+ verify_ssl=True,
  )
  return example_config.model_dump()
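As a usage note, get_example_config returns a plain dict via Pydantic's model_dump, so the richer example now surfaces the new tuning knobs directly. A sketch (the module path is an assumption based on the file layout, not confirmed by this diff):

from holmes.plugins.toolsets.prometheus.prometheus import PrometheusToolset  # assumed path

toolset = PrometheusToolset()
print(toolset.get_example_config())
# Expected keys, per the example above: prometheus_url, headers,
# discover_metrics_from_last_hours, query_timeout_seconds_default,
# query_timeout_seconds_hard_max, verify_ssl, ...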