PyPI - holmesgpt - Versions diffs - 0.16.2a0__py3-none-any.whl → 0.18.4__py3-none-any.whl - Mend

holmesgpt 0.16.2a0-py3-none-any.whl → 0.18.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (162) hide show

holmes/__init__.py +3 -5
holmes/clients/robusta_client.py +4 -3
holmes/common/env_vars.py +18 -2
holmes/common/openshift.py +1 -1
holmes/config.py +11 -6
holmes/core/conversations.py +30 -13
holmes/core/investigation.py +21 -25
holmes/core/investigation_structured_output.py +3 -3
holmes/core/issue.py +1 -1
holmes/core/llm.py +50 -31
holmes/core/models.py +19 -17
holmes/core/openai_formatting.py +1 -1
holmes/core/prompt.py +47 -2
holmes/core/runbooks.py +1 -0
holmes/core/safeguards.py +4 -2
holmes/core/supabase_dal.py +4 -2
holmes/core/tool_calling_llm.py +102 -141
holmes/core/tools.py +19 -28
holmes/core/tools_utils/token_counting.py +9 -2
holmes/core/tools_utils/tool_context_window_limiter.py +13 -30
holmes/core/tools_utils/tool_executor.py +0 -18
holmes/core/tools_utils/toolset_utils.py +1 -0
holmes/core/toolset_manager.py +37 -2
holmes/core/tracing.py +13 -2
holmes/core/transformers/__init__.py +1 -1
holmes/core/transformers/base.py +1 -0
holmes/core/transformers/llm_summarize.py +3 -2
holmes/core/transformers/registry.py +2 -1
holmes/core/transformers/transformer.py +1 -0
holmes/core/truncation/compaction.py +37 -2
holmes/core/truncation/input_context_window_limiter.py +3 -2
holmes/interactive.py +52 -8
holmes/main.py +17 -37
holmes/plugins/interfaces.py +2 -1
holmes/plugins/prompts/__init__.py +2 -1
holmes/plugins/prompts/_fetch_logs.jinja2 +5 -5
holmes/plugins/prompts/_runbook_instructions.jinja2 +2 -1
holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
holmes/plugins/prompts/conversation_history_compaction.jinja2 +2 -1
holmes/plugins/prompts/generic_ask.jinja2 +0 -2
holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -2
holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -2
holmes/plugins/prompts/generic_investigation.jinja2 +0 -2
holmes/plugins/prompts/investigation_procedure.jinja2 +2 -1
holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -2
holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -2
holmes/plugins/runbooks/__init__.py +32 -3
holmes/plugins/sources/github/__init__.py +4 -2
holmes/plugins/sources/prometheus/models.py +1 -0
holmes/plugins/toolsets/__init__.py +30 -26
holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +13 -12
holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -12
holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +7 -7
holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -7
holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -5
holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -7
holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +6 -8
holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +3 -3
holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +3 -3
holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +3 -3
holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +3 -3
holmes/plugins/toolsets/azure_sql/utils.py +0 -32
holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
holmes/plugins/toolsets/bash/bash_toolset.py +2 -3
holmes/plugins/toolsets/bash/common/bash.py +19 -9
holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
holmes/plugins/toolsets/bash/common/stringify.py +1 -1
holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
holmes/plugins/toolsets/bash/parse_command.py +12 -13
holmes/plugins/toolsets/connectivity_check.py +124 -0
holmes/plugins/toolsets/coralogix/api.py +132 -119
holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
holmes/plugins/toolsets/coralogix/utils.py +15 -79
holmes/plugins/toolsets/datadog/datadog_api.py +36 -3
holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +34 -1
holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
holmes/plugins/toolsets/datadog/toolset_datadog_general.py +71 -28
holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +224 -375
holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +67 -36
holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +360 -343
holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
holmes/plugins/toolsets/git.py +7 -8
holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
holmes/plugins/toolsets/grafana/common.py +2 -30
holmes/plugins/toolsets/grafana/grafana_tempo_api.py +2 -1
holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +18 -2
holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +92 -18
holmes/plugins/toolsets/grafana/loki_api.py +4 -0
holmes/plugins/toolsets/grafana/toolset_grafana.py +109 -25
holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +22 -0
holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +201 -33
holmes/plugins/toolsets/grafana/trace_parser.py +3 -2
holmes/plugins/toolsets/internet/internet.py +10 -10
holmes/plugins/toolsets/internet/notion.py +5 -6
holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
holmes/plugins/toolsets/investigator/model.py +3 -1
holmes/plugins/toolsets/json_filter_mixin.py +134 -0
holmes/plugins/toolsets/kafka.py +12 -7
holmes/plugins/toolsets/kubernetes.yaml +260 -30
holmes/plugins/toolsets/kubernetes_logs.py +3 -3
holmes/plugins/toolsets/logging_utils/logging_api.py +16 -6
holmes/plugins/toolsets/mcp/toolset_mcp.py +88 -60
holmes/plugins/toolsets/newrelic/new_relic_api.py +41 -1
holmes/plugins/toolsets/newrelic/newrelic.jinja2 +24 -0
holmes/plugins/toolsets/newrelic/newrelic.py +212 -55
holmes/plugins/toolsets/prometheus/prometheus.py +358 -102
holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +11 -3
holmes/plugins/toolsets/rabbitmq/api.py +23 -4
holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +5 -5
holmes/plugins/toolsets/robusta/robusta.py +5 -5
holmes/plugins/toolsets/runbook/runbook_fetcher.py +25 -6
holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +1 -1
holmes/plugins/toolsets/utils.py +1 -1
holmes/utils/config_utils.py +1 -1
holmes/utils/connection_utils.py +31 -0
holmes/utils/console/result.py +10 -0
holmes/utils/file_utils.py +2 -1
holmes/utils/global_instructions.py +10 -26
holmes/utils/holmes_status.py +4 -3
holmes/utils/log.py +15 -0
holmes/utils/markdown_utils.py +2 -3
holmes/utils/memory_limit.py +58 -0
holmes/utils/sentry_helper.py +23 -0
holmes/utils/stream.py +12 -5
holmes/utils/tags.py +4 -3
holmes/version.py +3 -1
{holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +12 -10
holmesgpt-0.18.4.dist-info/RECORD +258 -0
holmes/plugins/toolsets/aws.yaml +0 -80
holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -114
holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -736
holmes/plugins/toolsets/grafana/grafana_api.py +0 -64
holmes/plugins/toolsets/opensearch/__init__.py +0 -0
holmes/plugins/toolsets/opensearch/opensearch.py +0 -250
holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -215
holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
holmes/utils/keygen_utils.py +0 -6
holmesgpt-0.16.2a0.dist-info/RECORD +0 -258
holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_ppl_query_docs.jinja2 +0 -0
holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_query_assist.py +2 -2
holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_query_assist_instructions.jinja2 +0 -0
{holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/LICENSE +0 -0
{holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
{holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0

holmes/plugins/toolsets/prometheus/prometheus.py CHANGED Viewed

@@ -2,28 +2,39 @@ import json
 import logging
 import os
 import time
-import dateutil.parser
 from typing import Any, Dict, Optional, Tuple, Type, Union
 from urllib.parse import urljoin
+import dateutil.parser
 import requests  # type: ignore
-from pydantic import BaseModel, field_validator, Field, model_validator
-from requests import RequestException
+from prometrix.auth import PrometheusAuthorization
 from prometrix.connect.aws_connect import AWSPrometheusConnect
+from prometrix.models.prometheus_config import (
+    AzurePrometheusConfig as PrometrixAzureConfig,
+)
 from prometrix.models.prometheus_config import PrometheusConfig as BasePrometheusConfig
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from requests import RequestException
+from requests.exceptions import SSLError  # type: ignore
+from holmes.common.env_vars import IS_OPENSHIFT, MAX_GRAPH_POINTS
+from holmes.common.openshift import load_openshift_token
 from holmes.core.tools import (
     CallablePrerequisite,
     StructuredToolResult,
+    StructuredToolResultStatus,
     Tool,
     ToolInvokeContext,
     ToolParameter,
-    StructuredToolResultStatus,
     Toolset,
     ToolsetTag,
 )
 from holmes.core.tools_utils.token_counting import count_tool_response_tokens
 from holmes.core.tools_utils.tool_context_window_limiter import get_pct_token_count
 from holmes.plugins.toolsets.consts import STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION
+from holmes.plugins.toolsets.logging_utils.logging_api import (
+    DEFAULT_GRAPH_TIME_SPAN_SECONDS,
+)
 from holmes.plugins.toolsets.prometheus.utils import parse_duration_to_seconds
 from holmes.plugins.toolsets.service_discovery import PrometheusDiscovery
 from holmes.plugins.toolsets.utils import (
@@ -33,12 +44,6 @@ from holmes.plugins.toolsets.utils import (
     toolset_name_for_one_liner,
 )
 from holmes.utils.cache import TTLCache
-from holmes.common.env_vars import IS_OPENSHIFT, MAX_GRAPH_POINTS
-from holmes.common.openshift import load_openshift_token
-from holmes.plugins.toolsets.logging_utils.logging_api import (
-    DEFAULT_GRAPH_TIME_SPAN_SECONDS,
-)
-from holmes.utils.keygen_utils import generate_random_key
 PROMETHEUS_RULES_CACHE_KEY = "cached_prometheus_rules"
 PROMETHEUS_METADATA_API_LIMIT = 100  # Default limit for Prometheus metadata APIs (series, labels, metadata) to prevent overwhelming responses
@@ -52,46 +57,57 @@ MAX_METADATA_TIMEOUT_SECONDS = 60
 DEFAULT_METADATA_TIME_WINDOW_HRS = 1
+def format_ssl_error_message(prometheus_url: str, error: SSLError) -> str:
+    """Format a clear SSL error message with remediation steps."""
+    return (
+        f"SSL certificate verification failed when connecting to Prometheus at {prometheus_url}. "
+        f"Error: {str(error)}. "
+        f"To disable SSL verification, set 'verify_ssl: false' in your configuration. "
+        f"For Helm deployments, add this to your values.yaml:\n"
+        f"  toolsets:\n"
+        f"    prometheus/metrics:\n"
+        f"      config:\n"
+        f"        verify_ssl: false"
+    )
 class PrometheusConfig(BaseModel):
+    """Prometheus toolset configuration.
+    Deprecated config names (still accepted but not in schema):
+    - default_metadata_time_window_hrs -> discover_metrics_from_last_hours
+    - default_query_timeout_seconds -> query_timeout_seconds_default
+    - max_query_timeout_seconds -> query_timeout_seconds_hard_max
+    - default_metadata_timeout_seconds -> metadata_timeout_seconds_default
+    - max_metadata_timeout_seconds -> metadata_timeout_seconds_hard_max
+    - metrics_labels_time_window_hrs -> discover_metrics_from_last_hours
+    - prometheus_ssl_enabled -> verify_ssl
+    - metrics_labels_cache_duration_hrs (no longer used)
+    - fetch_labels_with_labels_api (no longer used)
+    - fetch_metadata_with_series_api (no longer used)
+    """
+    model_config = ConfigDict(extra="allow")
     # URL is optional because it can be set with an env var
-    prometheus_url: Optional[str]
-    healthcheck: str = "-/healthy"
+    prometheus_url: Optional[str] = None
-    # New config for default time window for metadata APIs
-    default_metadata_time_window_hrs: int = DEFAULT_METADATA_TIME_WINDOW_HRS  # Default: only show metrics active in the last hour
+    # Discovery API time window - only return metrics with data in the last N hours
+    discover_metrics_from_last_hours: int = DEFAULT_METADATA_TIME_WINDOW_HRS
     # Query timeout configuration
-    default_query_timeout_seconds: int = (
-        DEFAULT_QUERY_TIMEOUT_SECONDS  # Default timeout for PromQL queries
-    )
-    max_query_timeout_seconds: int = (
-        MAX_QUERY_TIMEOUT_SECONDS  # Maximum allowed timeout for PromQL queries
-    )
+    query_timeout_seconds_default: int = DEFAULT_QUERY_TIMEOUT_SECONDS
+    query_timeout_seconds_hard_max: int = MAX_QUERY_TIMEOUT_SECONDS
     # Metadata API timeout configuration
-    default_metadata_timeout_seconds: int = (
-        DEFAULT_METADATA_TIMEOUT_SECONDS  # Default timeout for metadata/discovery APIs
-    )
-    max_metadata_timeout_seconds: int = (
-        MAX_METADATA_TIMEOUT_SECONDS  # Maximum allowed timeout for metadata APIs
-    )
-    # DEPRECATED: These config values are deprecated and will be removed in a future version
-    # Using None as default so we can detect if user explicitly set them
-    metrics_labels_time_window_hrs: Optional[int] = (
-        None  # DEPRECATED - use default_metadata_time_window_hrs instead
-    )
-    metrics_labels_cache_duration_hrs: Optional[int] = (
-        None  # DEPRECATED - no longer used
-    )
-    fetch_labels_with_labels_api: Optional[bool] = None  # DEPRECATED - no longer used
-    fetch_metadata_with_series_api: Optional[bool] = None  # DEPRECATED - no longer used
+    metadata_timeout_seconds_default: int = DEFAULT_METADATA_TIMEOUT_SECONDS
+    metadata_timeout_seconds_hard_max: int = MAX_METADATA_TIMEOUT_SECONDS
     tool_calls_return_data: bool = True
     headers: Dict = Field(default_factory=dict)
     rules_cache_duration_seconds: Optional[int] = 1800  # 30 minutes
     additional_labels: Optional[Dict[str, str]] = None
-    prometheus_ssl_enabled: bool = True
+    verify_ssl: bool = True
     # Custom limit to the max number of tokens that a query result can take to proactively
     #   prevent token limit issues. Expressed in % of the model's context window.
@@ -107,31 +123,52 @@ class PrometheusConfig(BaseModel):
     @model_validator(mode="after")
     def validate_prom_config(self):
-        # Check for deprecated config values and print warnings
-        deprecated_configs = []
-        if self.metrics_labels_time_window_hrs is not None:  # Check if explicitly set
-            deprecated_configs.append(
-                "metrics_labels_time_window_hrs (use default_metadata_time_window_hrs instead)"
+        # Handle deprecated config names passed as extra fields
+        # These are accepted via extra="allow" but not defined in schema
+        extra = self.model_extra or {}
+        deprecated_with_replacement = []
+        # Map of old names -> new names
+        deprecated_mappings = {
+            "default_metadata_time_window_hrs": "discover_metrics_from_last_hours",
+            "default_query_timeout_seconds": "query_timeout_seconds_default",
+            "max_query_timeout_seconds": "query_timeout_seconds_hard_max",
+            "default_metadata_timeout_seconds": "metadata_timeout_seconds_default",
+            "max_metadata_timeout_seconds": "metadata_timeout_seconds_hard_max",
+            "metrics_labels_time_window_hrs": "discover_metrics_from_last_hours",
+            "prometheus_ssl_enabled": "verify_ssl",
+        }
+        for old_name, new_name in deprecated_mappings.items():
+            if old_name in extra:
+                setattr(self, new_name, extra[old_name])
+                deprecated_with_replacement.append(f"{old_name} -> {new_name}")
+        if deprecated_with_replacement:
+            logging.warning(
+                f"Prometheus config uses deprecated names. Please update: "
+                f"{', '.join(deprecated_with_replacement)}"
             )
-        if (
-            self.metrics_labels_cache_duration_hrs is not None
-        ):  # Check if explicitly set
-            deprecated_configs.append("metrics_labels_cache_duration_hrs")
-        if self.fetch_labels_with_labels_api is not None:  # Check if explicitly set
-            deprecated_configs.append("fetch_labels_with_labels_api")
-        if self.fetch_metadata_with_series_api is not None:  # Check if explicitly set
-            deprecated_configs.append("fetch_metadata_with_series_api")
-        if deprecated_configs:
+        # Check for deprecated config values that no longer have any effect
+        deprecated_no_effect = [
+            name
+            for name in [
+                "metrics_labels_cache_duration_hrs",
+                "fetch_labels_with_labels_api",
+                "fetch_metadata_with_series_api",
+            ]
+            if name in extra
+        ]
+        if deprecated_no_effect:
             logging.warning(
-                f"WARNING: The following Prometheus config values are deprecated and will be removed in a future version: "
-                f"{', '.join(deprecated_configs)}. These configs no longer affect behavior."
+                f"The following Prometheus config values are deprecated and have no effect: "
+                f"{', '.join(deprecated_no_effect)}"
             )
         # If openshift is enabled, and the user didn't configure auth headers, we will try to load the token from the service account.
         if IS_OPENSHIFT:
-            if self.healthcheck == "-/healthy":
-                self.healthcheck = "api/v1/query?query=up"
             if self.headers.get("Authorization"):
                 return self
@@ -151,8 +188,7 @@ class AMPConfig(PrometheusConfig):
     aws_secret_access_key: Optional[str] = None
     aws_region: str
     aws_service_name: str = "aps"
-    healthcheck: str = "api/v1/query?query=up"
-    prometheus_ssl_enabled: bool = False
+    verify_ssl: bool = False
     assume_role_arn: Optional[str] = None
     # Refresh the AWS client (and its STS creds) every N seconds (default: 15 minutes)
@@ -176,7 +212,7 @@ class AMPConfig(PrometheusConfig):
             try:
                 base_config = BasePrometheusConfig(
                     url=self.prometheus_url,
-                    disable_ssl=not self.prometheus_ssl_enabled,
+                    disable_ssl=not self.verify_ssl,
                     additional_labels=self.additional_labels,
                 )
                 self._aws_client = AWSPrometheusConnect(
@@ -195,12 +231,155 @@ class AMPConfig(PrometheusConfig):
         return self._aws_client
+class AzurePrometheusConfig(PrometheusConfig):
+    azure_resource: Optional[str] = None
+    azure_metadata_endpoint: Optional[str] = None
+    azure_token_endpoint: Optional[str] = None
+    azure_use_managed_id: bool = False
+    azure_client_id: Optional[str] = None
+    azure_client_secret: Optional[str] = None
+    azure_tenant_id: Optional[str] = None
+    verify_ssl: bool = True
+    # Refresh the Azure bearer token every N seconds (default: 15 minutes)
+    refresh_interval_seconds: int = 900
+    _prometrix_config: Optional[PrometrixAzureConfig] = None
+    _token_created_at: float = 0.0
+    @staticmethod
+    def _load_from_env_or_default(
+        config_value: Optional[str], env_var: str, default: Optional[str] = None
+    ) -> Optional[str]:
+        """Load value from config, environment variable, or use default."""
+        if config_value:
+            return config_value
+        return os.environ.get(env_var, default)
+    def __init__(self, **data):
+        super().__init__(**data)
+        # Load from environment variables if not provided in config
+        self.azure_client_id = self._load_from_env_or_default(
+            self.azure_client_id, "AZURE_CLIENT_ID"
+        )
+        self.azure_tenant_id = self._load_from_env_or_default(
+            self.azure_tenant_id, "AZURE_TENANT_ID"
+        )
+        self.azure_client_secret = self._load_from_env_or_default(
+            self.azure_client_secret, "AZURE_CLIENT_SECRET"
+        )
+        # Set defaults from environment if not provided
+        self.azure_resource = self._load_from_env_or_default(
+            self.azure_resource,
+            "AZURE_RESOURCE",
+            "https://prometheus.monitor.azure.com",
+        )
+        # from https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/how-to-use-vm-token
+        self.azure_metadata_endpoint = self._load_from_env_or_default(
+            self.azure_metadata_endpoint,
+            "AZURE_METADATA_ENDPOINT",
+            "http://169.254.169.254/metadata/identity/oauth2/token",
+        )
+        self.azure_token_endpoint = self._load_from_env_or_default(
+            self.azure_token_endpoint, "AZURE_TOKEN_ENDPOINT"
+        )
+        if not self.azure_token_endpoint and self.azure_tenant_id:
+            self.azure_token_endpoint = (
+                f"https://login.microsoftonline.com/{self.azure_tenant_id}/oauth2/token"
+            )
+        # Check if managed identity should be used
+        if not self.azure_use_managed_id:
+            self.azure_use_managed_id = os.environ.get(
+                "AZURE_USE_MANAGED_ID", "false"
+            ).lower() in ("true", "1")
+        # Convert None to empty string for prometrix compatibility (prometrix checks != "")
+        azure_client_id = self.azure_client_id or ""
+        azure_tenant_id = self.azure_tenant_id or ""
+        azure_client_secret = self.azure_client_secret or ""
+        azure_resource = self.azure_resource or ""
+        azure_metadata_endpoint = self.azure_metadata_endpoint or ""
+        azure_token_endpoint = self.azure_token_endpoint or ""
+        # Create prometrix Azure config
+        self._prometrix_config = PrometrixAzureConfig(
+            url=self.prometheus_url,
+            azure_resource=azure_resource,
+            azure_metadata_endpoint=azure_metadata_endpoint,
+            azure_token_endpoint=azure_token_endpoint,
+            azure_use_managed_id=self.azure_use_managed_id,
+            azure_client_id=azure_client_id,
+            azure_client_secret=azure_client_secret,
+            azure_tenant_id=azure_tenant_id,
+            disable_ssl=not self.verify_ssl,
+            additional_labels=self.additional_labels,
+        )
+        # Ensure promtrix gets a real bool (not string) for managed identity
+        # fixing internal prometrix config issue
+        object.__setattr__(
+            self._prometrix_config,
+            "azure_use_managed_id",
+            bool(self.azure_use_managed_id),
+        )
+        PrometheusAuthorization.azure_authorization(self._prometrix_config)
+    @staticmethod
+    def is_azure_config(config: dict[str, Any]) -> bool:
+        """Check if config dict or environment variables indicate Azure Prometheus config."""
+        # Check for explicit Azure fields in config
+        if (
+            "azure_client_id" in config
+            or "azure_tenant_id" in config
+            or "azure_use_managed_id" in config
+        ):
+            return True
+        # Check for Azure environment variables
+        if os.environ.get("AZURE_CLIENT_ID") or os.environ.get("AZURE_TENANT_ID"):
+            return True
+        return False
+    def is_amp(self) -> bool:
+        return False
+    def _should_refresh_token(self) -> bool:
+        if not PrometheusAuthorization.bearer_token:
+            return True
+        return (time.time() - self._token_created_at) >= self.refresh_interval_seconds
+    def request_new_token(self) -> bool:
+        """Request a new Azure access token using prometrix."""
+        success = PrometheusAuthorization.request_new_token(self._prometrix_config)
+        if success:
+            self._token_created_at = time.time()
+        return success
+    def get_authorization_headers(self) -> Dict[str, str]:
+        # Request new token if needed
+        if self._should_refresh_token():
+            if not self.request_new_token():
+                logging.error("Failed to request new Azure access token")
+                return {}
+            self._token_created_at = time.time()
+        headers = PrometheusAuthorization.get_authorization_headers(
+            self._prometrix_config
+        )
+        if not headers.get("Authorization"):
+            logging.warning("No authorization header generated for Azure Prometheus")
+        return headers
 class BasePrometheusTool(Tool):
     toolset: "PrometheusToolset"
 def do_request(
-    config,  # PrometheusConfig | AMPConfig
+    config,  # PrometheusConfig | AMPConfig | AzurePrometheusConfig
     url: str,
     params: Optional[Dict] = None,
     data: Optional[Dict] = None,
@@ -212,12 +391,13 @@ def do_request(
     """
     Route a request through either:
       - AWSPrometheusConnect (SigV4) when config is AMPConfig
+      - Azure bearer token auth when config is AzurePrometheusConfig
       - plain requests otherwise
     method defaults to GET so callers can omit it for reads.
     """
     if verify is None:
-        verify = config.prometheus_ssl_enabled
+        verify = config.verify_ssl
     if headers is None:
         headers = config.headers or {}
@@ -234,7 +414,21 @@ def do_request(
             headers=headers,
         )
-    # Non-AMP: plain HTTP
+    if isinstance(config, AzurePrometheusConfig):
+        # Merge Azure authorization headers with provided headers
+        azure_headers = config.get_authorization_headers()
+        headers = {**azure_headers, **headers}
+        return requests.request(
+            method=method,
+            url=url,
+            headers=headers,
+            params=params,
+            data=data,
+            timeout=timeout,
+            verify=verify,
+        )
+    # Non-AMP, Non-Azure: plain HTTP
     return requests.request(
         method=method,
         url=url,
@@ -411,7 +605,6 @@ class MetricsBasedResponse(BaseModel):
     status: str
     error_message: Optional[str] = None
     data: Optional[str] = None
-    random_key: str
     tool_name: str
     description: str
     query: str
@@ -426,15 +619,22 @@ def create_structured_tool_result(
     params: dict, response: MetricsBasedResponse
 ) -> StructuredToolResult:
     status = StructuredToolResultStatus.SUCCESS
+    error = None
     if response.error_message or response.status.lower() in ("failed", "error"):
         status = StructuredToolResultStatus.ERROR
+        error = (
+            response.error_message
+            if response.error_message
+            else "Unknown Prometheus error"
+        )
     elif not response.data:
         status = StructuredToolResultStatus.NO_DATA
     return StructuredToolResult(
         status=status,
-        data=response.model_dump_json(indent=2),
+        data=response,
         params=params,
+        error=error,
     )
@@ -484,7 +684,7 @@ class ListPrometheusRules(BasePrometheusTool):
                 url=rules_url,
                 params=params,
                 timeout=40,
-                verify=self.toolset.config.prometheus_ssl_enabled,
+                verify=self.toolset.config.verify_ssl,
                 headers=self.toolset.config.headers,
                 method="GET",
             )
@@ -505,6 +705,13 @@ class ListPrometheusRules(BasePrometheusTool):
                 error="Request timed out while fetching rules",
                 params=params,
             )
+        except SSLError as e:
+            logging.warning("SSL error while fetching prometheus rules", exc_info=True)
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error=format_ssl_error_message(self.toolset.config.prometheus_url, e),
+                params=params,
+            )
         except RequestException as e:
             logging.warning("Failed to fetch prometheus rules", exc_info=True)
             return StructuredToolResult(
@@ -598,19 +805,19 @@ class GetMetricNames(BasePrometheusTool):
             if params.get("start"):
                 query_params["start"] = params["start"]
-            elif self.toolset.config.default_metadata_time_window_hrs:
+            elif self.toolset.config.discover_metrics_from_last_hours:
                 # Use default time window
                 query_params["start"] = str(
                     int(time.time())
-                    - (self.toolset.config.default_metadata_time_window_hrs * 3600)
+                    - (self.toolset.config.discover_metrics_from_last_hours * 3600)
                 )
             response = do_request(
                 config=self.toolset.config,
                 url=url,
                 params=query_params,
-                timeout=self.toolset.config.default_metadata_timeout_seconds,
-                verify=self.toolset.config.prometheus_ssl_enabled,
+                timeout=self.toolset.config.metadata_timeout_seconds_default,
+                verify=self.toolset.config.verify_ssl,
                 headers=self.toolset.config.headers,
                 method="GET",
             )
@@ -716,19 +923,19 @@ class GetLabelValues(BasePrometheusTool):
             if params.get("start"):
                 query_params["start"] = params["start"]
-            elif self.toolset.config.default_metadata_time_window_hrs:
+            elif self.toolset.config.discover_metrics_from_last_hours:
                 # Use default time window
                 query_params["start"] = str(
                     int(time.time())
-                    - (self.toolset.config.default_metadata_time_window_hrs * 3600)
+                    - (self.toolset.config.discover_metrics_from_last_hours * 3600)
                 )
             response = do_request(
                 config=self.toolset.config,
                 url=url,
                 params=query_params,
-                timeout=self.toolset.config.default_metadata_timeout_seconds,
-                verify=self.toolset.config.prometheus_ssl_enabled,
+                timeout=self.toolset.config.metadata_timeout_seconds_default,
+                verify=self.toolset.config.verify_ssl,
                 headers=self.toolset.config.headers,
                 method="GET",
             )
@@ -820,19 +1027,19 @@ class GetAllLabels(BasePrometheusTool):
             if params.get("start"):
                 query_params["start"] = params["start"]
-            elif self.toolset.config.default_metadata_time_window_hrs:
+            elif self.toolset.config.discover_metrics_from_last_hours:
                 # Use default time window
                 query_params["start"] = str(
                     int(time.time())
-                    - (self.toolset.config.default_metadata_time_window_hrs * 3600)
+                    - (self.toolset.config.discover_metrics_from_last_hours * 3600)
                 )
             response = do_request(
                 config=self.toolset.config,
                 url=url,
                 params=query_params,
-                timeout=self.toolset.config.default_metadata_timeout_seconds,
-                verify=self.toolset.config.prometheus_ssl_enabled,
+                timeout=self.toolset.config.metadata_timeout_seconds_default,
+                verify=self.toolset.config.verify_ssl,
                 headers=self.toolset.config.headers,
                 method="GET",
             )
@@ -934,19 +1141,19 @@ class GetSeries(BasePrometheusTool):
             if params.get("start"):
                 query_params["start"] = params["start"]
-            elif self.toolset.config.default_metadata_time_window_hrs:
+            elif self.toolset.config.discover_metrics_from_last_hours:
                 # Use default time window
                 query_params["start"] = str(
                     int(time.time())
-                    - (self.toolset.config.default_metadata_time_window_hrs * 3600)
+                    - (self.toolset.config.discover_metrics_from_last_hours * 3600)
                 )
             response = do_request(
                 config=self.toolset.config,
                 url=url,
                 params=query_params,
-                timeout=self.toolset.config.default_metadata_timeout_seconds,
-                verify=self.toolset.config.prometheus_ssl_enabled,
+                timeout=self.toolset.config.metadata_timeout_seconds_default,
+                verify=self.toolset.config.verify_ssl,
                 headers=self.toolset.config.headers,
                 method="GET",
             )
@@ -1024,8 +1231,8 @@ class GetMetricMetadata(BasePrometheusTool):
                 config=self.toolset.config,
                 url=url,
                 params=query_params,
-                timeout=self.toolset.config.default_metadata_timeout_seconds,
-                verify=self.toolset.config.prometheus_ssl_enabled,
+                timeout=self.toolset.config.metadata_timeout_seconds_default,
+                verify=self.toolset.config.verify_ssl,
                 headers=self.toolset.config.headers,
                 method="GET",
             )
@@ -1111,8 +1318,8 @@ class ExecuteInstantQuery(BasePrometheusTool):
             payload = {"query": query}
             # Get timeout parameter and enforce limits
-            default_timeout = self.toolset.config.default_query_timeout_seconds
-            max_timeout = self.toolset.config.max_query_timeout_seconds
+            default_timeout = self.toolset.config.query_timeout_seconds_default
+            max_timeout = self.toolset.config.query_timeout_seconds_hard_max
             timeout = params.get("timeout", default_timeout)
             if timeout > max_timeout:
                 timeout = max_timeout
@@ -1128,7 +1335,7 @@ class ExecuteInstantQuery(BasePrometheusTool):
                 headers=self.toolset.config.headers,
                 data=payload,
                 timeout=timeout,
-                verify=self.toolset.config.prometheus_ssl_enabled,
+                verify=self.toolset.config.verify_ssl,
                 method="POST",
             )
@@ -1144,7 +1351,6 @@ class ExecuteInstantQuery(BasePrometheusTool):
                 response_data = MetricsBasedResponse(
                     status=status,
                     error_message=error_message,
-                    random_key=generate_random_key(),
                     tool_name=self.name,
                     description=description,
                     query=query,
@@ -1158,8 +1364,13 @@ class ExecuteInstantQuery(BasePrometheusTool):
                     structured_tool_result = create_structured_tool_result(
                         params=params, response=response_data
                     )
+                    tool_call_id = context.tool_call_id
+                    tool_name = context.tool_name
                     token_count = count_tool_response_tokens(
-                        llm=context.llm, structured_tool_result=structured_tool_result
+                        llm=context.llm,
+                        structured_tool_result=structured_tool_result,
+                        tool_call_id=tool_call_id,
+                        tool_name=tool_name,
                     )
                     token_limit = context.max_token_count
@@ -1223,6 +1434,13 @@ class ExecuteInstantQuery(BasePrometheusTool):
                 params=params,
             )
+        except SSLError as e:
+            logging.warning("SSL error while executing Prometheus query", exc_info=True)
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error=format_ssl_error_message(self.toolset.config.prometheus_url, e),
+                params=params,
+            )
         except RequestException as e:
             logging.info("Failed to connect to Prometheus", exc_info=True)
             return StructuredToolResult(
@@ -1349,8 +1567,8 @@ class ExecuteRangeQuery(BasePrometheusTool):
             }
             # Get timeout parameter and enforce limits
-            default_timeout = self.toolset.config.default_query_timeout_seconds
-            max_timeout = self.toolset.config.max_query_timeout_seconds
+            default_timeout = self.toolset.config.query_timeout_seconds_default
+            max_timeout = self.toolset.config.query_timeout_seconds_hard_max
             timeout = params.get("timeout", default_timeout)
             if timeout > max_timeout:
                 timeout = max_timeout
@@ -1366,7 +1584,7 @@ class ExecuteRangeQuery(BasePrometheusTool):
                 headers=self.toolset.config.headers,
                 data=payload,
                 timeout=timeout,
-                verify=self.toolset.config.prometheus_ssl_enabled,
+                verify=self.toolset.config.verify_ssl,
                 method="POST",
             )
@@ -1382,7 +1600,6 @@ class ExecuteRangeQuery(BasePrometheusTool):
                 response_data = MetricsBasedResponse(
                     status=status,
                     error_message=error_message,
-                    random_key=generate_random_key(),
                     tool_name=self.name,
                     description=description,
                     query=query,
@@ -1402,8 +1619,13 @@ class ExecuteRangeQuery(BasePrometheusTool):
                         params=params, response=response_data
                     )
+                    tool_call_id = context.tool_call_id
+                    tool_name = context.tool_name
                     token_count = count_tool_response_tokens(
-                        llm=context.llm, structured_tool_result=structured_tool_result
+                        llm=context.llm,
+                        structured_tool_result=structured_tool_result,
+                        tool_call_id=tool_call_id,
+                        tool_name=tool_name,
                     )
                     token_limit = context.max_token_count
@@ -1463,6 +1685,15 @@ class ExecuteRangeQuery(BasePrometheusTool):
                 params=params,
             )
+        except SSLError as e:
+            logging.warning(
+                "SSL error while executing Prometheus range query", exc_info=True
+            )
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error=format_ssl_error_message(self.toolset.config.prometheus_url, e),
+                params=params,
+            )
         except RequestException as e:
             logging.info("Failed to connect to Prometheus", exc_info=True)
             return StructuredToolResult(
@@ -1484,7 +1715,7 @@ class ExecuteRangeQuery(BasePrometheusTool):
 class PrometheusToolset(Toolset):
-    config: Optional[Union[PrometheusConfig, AMPConfig]] = None
+    config: Optional[Union[PrometheusConfig, AMPConfig, AzurePrometheusConfig]] = None
     def __init__(self):
         super().__init__(
@@ -1517,16 +1748,36 @@ class PrometheusToolset(Toolset):
     def determine_prometheus_class(
         self, config: dict[str, Any]
-    ) -> Type[Union[PrometheusConfig, AMPConfig]]:
+    ) -> Type[Union[PrometheusConfig, AMPConfig, AzurePrometheusConfig]]:
         has_aws_fields = "aws_region" in config
-        return AMPConfig if has_aws_fields else PrometheusConfig
+        if has_aws_fields:
+            return AMPConfig
+        # Check for Azure config using static method
+        is_azure = AzurePrometheusConfig.is_azure_config(config)
+        if is_azure:
+            logging.info("Detected Azure Managed Prometheus configuration")
+        return AzurePrometheusConfig if is_azure else PrometheusConfig
+    def _disable_azure_incompatible_tools(self):
+        """
+        Azure Managed Prometheus does not support some APIs.
+        Remove unsupported tools.
+        """
+        incompatible = {
+            "get_label_values",
+            "get_metric_metadata",
+            "list_prometheus_rules",
+        }
+        self.tools = [t for t in self.tools if t.name not in incompatible]
     def prerequisites_callable(self, config: dict[str, Any]) -> Tuple[bool, str]:
         try:
             if config:
                 config_cls = self.determine_prometheus_class(config)
                 self.config = config_cls(**config)  # type: ignore
+                if isinstance(self.config, AzurePrometheusConfig):
+                    self._disable_azure_incompatible_tools()
                 self._reload_llm_instructions()
                 return self._is_healthy()
         except Exception:
@@ -1571,14 +1822,14 @@ class PrometheusToolset(Toolset):
                 f"Toolset {self.name} failed to initialize because prometheus is not configured correctly",
             )
-        url = urljoin(self.config.prometheus_url, self.config.healthcheck)
+        url = urljoin(self.config.prometheus_url, "api/v1/query?query=up")
         try:
             response = do_request(
                 config=self.config,
                 url=url,
                 headers=self.config.headers,
                 timeout=10,
-                verify=self.config.prometheus_ssl_enabled,
+                verify=self.config.verify_ssl,
                 method="GET",
             )
@@ -1599,6 +1850,11 @@ class PrometheusToolset(Toolset):
     def get_example_config(self):
         example_config = PrometheusConfig(
-            prometheus_url="http://robusta-kube-prometheus-st-prometheus:9090"
+            prometheus_url="http://prometheus-server.monitoring.svc.cluster.local:9090",
+            headers={"Authorization": "Basic <base64_encoded_credentials>"},
+            discover_metrics_from_last_hours=1,
+            query_timeout_seconds_default=20,
+            query_timeout_seconds_hard_max=180,
+            verify_ssl=True,
         )
         return example_config.model_dump()

holmesgpt 0.16.2a0__py3-none-any.whl → 0.18.4__py3-none-any.whl

holmesgpt 0.16.2a0__py3-none-any.whl → 0.18.4__py3-none-any.whl