holmesgpt 0.13.2__py3-none-any.whl → 0.16.2a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +17 -4
  3. holmes/common/env_vars.py +40 -1
  4. holmes/config.py +114 -144
  5. holmes/core/conversations.py +53 -14
  6. holmes/core/feedback.py +191 -0
  7. holmes/core/investigation.py +18 -22
  8. holmes/core/llm.py +489 -88
  9. holmes/core/models.py +103 -1
  10. holmes/core/openai_formatting.py +13 -0
  11. holmes/core/prompt.py +1 -1
  12. holmes/core/safeguards.py +4 -4
  13. holmes/core/supabase_dal.py +293 -100
  14. holmes/core/tool_calling_llm.py +423 -323
  15. holmes/core/tools.py +311 -33
  16. holmes/core/tools_utils/token_counting.py +14 -0
  17. holmes/core/tools_utils/tool_context_window_limiter.py +57 -0
  18. holmes/core/tools_utils/tool_executor.py +13 -8
  19. holmes/core/toolset_manager.py +155 -4
  20. holmes/core/tracing.py +6 -1
  21. holmes/core/transformers/__init__.py +23 -0
  22. holmes/core/transformers/base.py +62 -0
  23. holmes/core/transformers/llm_summarize.py +174 -0
  24. holmes/core/transformers/registry.py +122 -0
  25. holmes/core/transformers/transformer.py +31 -0
  26. holmes/core/truncation/compaction.py +59 -0
  27. holmes/core/truncation/dal_truncation_utils.py +23 -0
  28. holmes/core/truncation/input_context_window_limiter.py +218 -0
  29. holmes/interactive.py +177 -24
  30. holmes/main.py +7 -4
  31. holmes/plugins/prompts/_fetch_logs.jinja2 +26 -1
  32. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  33. holmes/plugins/prompts/_runbook_instructions.jinja2 +23 -12
  34. holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
  35. holmes/plugins/prompts/generic_ask.jinja2 +2 -4
  36. holmes/plugins/prompts/generic_ask_conversation.jinja2 +2 -1
  37. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +2 -1
  38. holmes/plugins/prompts/generic_investigation.jinja2 +2 -1
  39. holmes/plugins/prompts/investigation_procedure.jinja2 +48 -0
  40. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -1
  41. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +2 -1
  42. holmes/plugins/runbooks/__init__.py +117 -18
  43. holmes/plugins/runbooks/catalog.json +2 -0
  44. holmes/plugins/toolsets/__init__.py +21 -8
  45. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  46. holmes/plugins/toolsets/aks.yaml +64 -0
  47. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +26 -36
  48. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +0 -1
  49. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +10 -7
  50. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +9 -6
  51. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +8 -6
  52. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +8 -6
  53. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +9 -6
  54. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +9 -7
  55. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +9 -6
  56. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +9 -6
  57. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +9 -6
  58. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +9 -6
  59. holmes/plugins/toolsets/bash/bash_toolset.py +10 -13
  60. holmes/plugins/toolsets/bash/common/bash.py +7 -7
  61. holmes/plugins/toolsets/cilium.yaml +284 -0
  62. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
  63. holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
  64. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
  65. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +349 -216
  66. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
  67. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +101 -44
  68. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +13 -16
  69. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +25 -31
  70. holmes/plugins/toolsets/git.py +51 -46
  71. holmes/plugins/toolsets/grafana/common.py +15 -3
  72. holmes/plugins/toolsets/grafana/grafana_api.py +46 -24
  73. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +454 -0
  74. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +9 -0
  75. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +117 -0
  76. holmes/plugins/toolsets/grafana/toolset_grafana.py +211 -91
  77. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +27 -0
  78. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  79. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +653 -293
  80. holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
  81. holmes/plugins/toolsets/internet/internet.py +6 -7
  82. holmes/plugins/toolsets/internet/notion.py +5 -6
  83. holmes/plugins/toolsets/investigator/core_investigation.py +42 -34
  84. holmes/plugins/toolsets/kafka.py +25 -36
  85. holmes/plugins/toolsets/kubernetes.yaml +58 -84
  86. holmes/plugins/toolsets/kubernetes_logs.py +6 -6
  87. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  88. holmes/plugins/toolsets/logging_utils/logging_api.py +80 -4
  89. holmes/plugins/toolsets/mcp/toolset_mcp.py +181 -55
  90. holmes/plugins/toolsets/newrelic/__init__.py +0 -0
  91. holmes/plugins/toolsets/newrelic/new_relic_api.py +125 -0
  92. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +41 -0
  93. holmes/plugins/toolsets/newrelic/newrelic.py +163 -0
  94. holmes/plugins/toolsets/opensearch/opensearch.py +10 -17
  95. holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
  96. holmes/plugins/toolsets/opensearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  97. holmes/plugins/toolsets/opensearch/opensearch_query_assist.py +78 -0
  98. holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 +223 -0
  99. holmes/plugins/toolsets/opensearch/opensearch_traces.py +13 -16
  100. holmes/plugins/toolsets/openshift.yaml +283 -0
  101. holmes/plugins/toolsets/prometheus/prometheus.py +915 -390
  102. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +43 -2
  103. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  104. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +9 -10
  105. holmes/plugins/toolsets/robusta/robusta.py +236 -65
  106. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  107. holmes/plugins/toolsets/runbook/runbook_fetcher.py +137 -26
  108. holmes/plugins/toolsets/service_discovery.py +1 -1
  109. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  110. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  111. holmes/plugins/toolsets/utils.py +88 -0
  112. holmes/utils/config_utils.py +91 -0
  113. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  114. holmes/utils/env.py +7 -0
  115. holmes/utils/global_instructions.py +75 -10
  116. holmes/utils/holmes_status.py +2 -1
  117. holmes/utils/holmes_sync_toolsets.py +0 -2
  118. holmes/utils/krr_utils.py +188 -0
  119. holmes/utils/sentry_helper.py +41 -0
  120. holmes/utils/stream.py +61 -7
  121. holmes/version.py +34 -14
  122. holmesgpt-0.16.2a0.dist-info/LICENSE +178 -0
  123. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/METADATA +29 -27
  124. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/RECORD +126 -102
  125. holmes/core/performance_timing.py +0 -72
  126. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  127. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  128. holmes/plugins/toolsets/newrelic.py +0 -231
  129. holmes/plugins/toolsets/servicenow/install.md +0 -37
  130. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  131. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  132. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  133. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/WHEEL +0 -0
  134. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/entry_points.txt +0 -0
holmes/core/llm.py CHANGED
@@ -1,32 +1,92 @@
 import json
 import logging
+import os
 from abc import abstractmethod
-from typing import Any, Dict, List, Optional, Type, Union
+from math import floor
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, Union
 
-from litellm.types.utils import ModelResponse
+import litellm
+from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
+from litellm.types.utils import ModelResponse, TextCompletionResponse
 import sentry_sdk
+from pydantic import BaseModel, ConfigDict, SecretStr
+from typing_extensions import Self
+
+from holmes.clients.robusta_client import (
+    RobustaModel,
+    RobustaModelsResponse,
+    fetch_robusta_models,
+)
 
-from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
-from pydantic import BaseModel
-import litellm
-import os
 from holmes.common.env_vars import (
+    FALLBACK_CONTEXT_WINDOW_SIZE,
+    LOAD_ALL_ROBUSTA_MODELS,
     REASONING_EFFORT,
+    ROBUSTA_AI,
+    ROBUSTA_API_ENDPOINT,
     THINKING,
+    EXTRA_HEADERS,
+    TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_PCT,
+    TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_TOKENS,
 )
+from holmes.core.supabase_dal import SupabaseDal
+from holmes.utils.env import environ_get_safe_int, replace_env_vars_values
+from holmes.utils.file_utils import load_yaml_file
 
+if TYPE_CHECKING:
+    from holmes.config import Config
 
-def environ_get_safe_int(env_var, default="0"):
-    try:
-        return max(int(os.environ.get(env_var, default)), 0)
-    except ValueError:
-        return int(default)
+MODEL_LIST_FILE_LOCATION = os.environ.get(
+    "MODEL_LIST_FILE_LOCATION", "/etc/holmes/config/model_list.yaml"
+)
 
 
 OVERRIDE_MAX_OUTPUT_TOKEN = environ_get_safe_int("OVERRIDE_MAX_OUTPUT_TOKEN")
 OVERRIDE_MAX_CONTENT_SIZE = environ_get_safe_int("OVERRIDE_MAX_CONTENT_SIZE")
 
 
+def get_context_window_compaction_threshold_pct() -> int:
+    """Get the compaction threshold percentage at runtime to support test overrides."""
+    return environ_get_safe_int("CONTEXT_WINDOW_COMPACTION_THRESHOLD_PCT", default="95")
+
+
+ROBUSTA_AI_MODEL_NAME = "Robusta"
+
+
+class TokenCountMetadata(BaseModel):
+    total_tokens: int
+    tools_tokens: int
+    system_tokens: int
+    user_tokens: int
+    tools_to_call_tokens: int
+    assistant_tokens: int
+    other_tokens: int
+
+
+class ModelEntry(BaseModel):
+    """ModelEntry represents a single LLM model configuration."""
+
+    model: str
+    # TODO: the name field seems to be redundant, can we remove it?
+    name: Optional[str] = None
+    api_key: Optional[SecretStr] = None
+    base_url: Optional[str] = None
+    is_robusta_model: Optional[bool] = None
+    custom_args: Optional[Dict[str, Any]] = None
+
+    # LLM configurations used services like Azure OpenAI Service
+    api_base: Optional[str] = None
+    api_version: Optional[str] = None
+
+    model_config = ConfigDict(
+        extra="allow",
+    )
+
+    @classmethod
+    def load_from_dict(cls, data: dict) -> Self:
+        return cls.model_validate(data)
+
+
 class LLM:
     @abstractmethod
     def __init__(self):
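
The ModelEntry schema added above is a plain Pydantic model with extra="allow", so a configuration entry can be validated straight from a dict. A minimal sketch of that, with every field value invented for illustration (none of these values ship with the package):

    from holmes.core.llm import ModelEntry

    entry = ModelEntry.load_from_dict(
        {
            "model": "azure/my-gpt4o-deployment",  # hypothetical Azure deployment name
            "name": "azure-gpt-4o",
            "api_base": "https://example.openai.azure.com",  # hypothetical endpoint
            "api_version": "2024-02-15-preview",
            "custom_args": {"max_context_size": 128000},
        }
    )
    print(entry.model, entry.api_version)
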
@@ -40,8 +100,23 @@ class LLM:
     def get_maximum_output_token(self) -> int:
         pass
 
+    def get_max_token_count_for_single_tool(self) -> int:
+        if (
+            0 < TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_PCT
+            and TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_PCT <= 100
+        ):
+            context_window_size = self.get_context_window_size()
+            calculated_max_tokens = int(
+                context_window_size * TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_PCT // 100
+            )
+            return min(calculated_max_tokens, TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_TOKENS)
+        else:
+            return TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_TOKENS
+
     @abstractmethod
-    def count_tokens_for_message(self, messages: list[dict]) -> int:
+    def count_tokens(
+        self, messages: list[dict], tools: Optional[list[dict[str, Any]]] = None
+    ) -> TokenCountMetadata:
         pass
 
     @abstractmethod
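
get_max_token_count_for_single_tool() above caps the tokens a single tool result may occupy at TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_PCT percent of the model's context window, bounded by the absolute TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_TOKENS limit. A standalone restatement of that formula with made-up values for both constants (the real values come from holmes/common/env_vars.py):

    def max_tokens_for_single_tool(context_window: int, pct: int = 10, cap: int = 16000) -> int:
        # Mirrors the logic above: a percentage of the window, clamped by an absolute cap.
        if 0 < pct <= 100:
            return min(int(context_window * pct // 100), cap)
        return cap

    assert max_tokens_for_single_tool(200_000) == 16_000  # 20,000 clamped to the cap
    assert max_tokens_for_single_tool(100_000) == 10_000
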
@@ -61,31 +136,55 @@ class DefaultLLM(LLM):
 class DefaultLLM(LLM):
     model: str
     api_key: Optional[str]
-    base_url: Optional[str]
+    api_base: Optional[str]
+    api_version: Optional[str]
     args: Dict
+    is_robusta_model: bool
 
     def __init__(
         self,
         model: str,
         api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+        api_version: Optional[str] = None,
         args: Optional[Dict] = None,
-        tracer=None,
+        tracer: Optional[Any] = None,
+        name: Optional[str] = None,
+        is_robusta_model: bool = False,
     ):
         self.model = model
         self.api_key = api_key
+        self.api_base = api_base
+        self.api_version = api_version
         self.args = args or {}
         self.tracer = tracer
+        self.name = name
+        self.is_robusta_model = is_robusta_model
+        self.update_custom_args()
+        self.check_llm(
+            self.model, self.api_key, self.api_base, self.api_version, self.args
+        )
 
-        if not self.args:
-            self.check_llm(self.model, self.api_key)
+    def update_custom_args(self):
+        self.max_context_size = self.args.get("custom_args", {}).get("max_context_size")
+        self.args.pop("custom_args", None)
 
-    def check_llm(self, model: str, api_key: Optional[str]):
+    def check_llm(
+        self,
+        model: str,
+        api_key: Optional[str],
+        api_base: Optional[str],
+        api_version: Optional[str],
+        args: Optional[dict] = None,
+    ):
+        if self.is_robusta_model:
+            # The model is assumed correctly configured if it is a robusta model
+            # For robusta models, this code would fail because Holmes has no knowledge of the API keys
+            # to azure or bedrock as all completion API calls go through robusta's LLM proxy
+            return
+        args = args or {}
         logging.debug(f"Checking LiteLLM model {model}")
-        # TODO: this WAS a hack to get around the fact that we can't pass in an api key to litellm.validate_environment
-        # so without this hack it always complains that the environment variable for the api key is missing
-        # to fix that, we always set an api key in the standard format that litellm expects (which is ${PROVIDER}_API_KEY)
-        # TODO: we can now handle this better - see https://github.com/BerriAI/litellm/issues/4375#issuecomment-2223684750
-        lookup = litellm.get_llm_provider(self.model)
+        lookup = litellm.get_llm_provider(model)
         if not lookup:
             raise Exception(f"Unknown provider for model {model}")
         provider = lookup[1]
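
The constructor now pulls max_context_size out of args["custom_args"] via update_custom_args() and validates credentials in check_llm(), which is skipped entirely for Robusta-hosted models. A minimal sketch of how that plumbing behaves (the model name and context size are made up; is_robusta_model=True is used here precisely because it avoids any credential check):

    from holmes.core.llm import DefaultLLM

    llm = DefaultLLM(
        model="anthropic/claude-sonnet",  # hypothetical underlying model
        args={"custom_args": {"max_context_size": 200_000}},
        is_robusta_model=True,
    )
    # update_custom_args() popped "custom_args" and stored max_context_size,
    # so get_context_window_size() (further down in this diff) returns it
    # before ever consulting litellm.model_cost.
    print(llm.get_context_window_size())  # 200000
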
@@ -119,85 +218,151 @@ class DefaultLLM(LLM):
                 "environment variable for proper functionality. For more information, refer to the documentation: "
                 "https://docs.litellm.ai/docs/providers/watsonx#usage---models-in-deployment-spaces"
             )
-        elif provider == "bedrock" and (
-            os.environ.get("AWS_PROFILE") or os.environ.get("AWS_BEARER_TOKEN_BEDROCK")
-        ):
-            model_requirements = {"keys_in_environment": True, "missing_keys": []}
+        elif provider == "bedrock":
+            if os.environ.get("AWS_PROFILE") or os.environ.get(
+                "AWS_BEARER_TOKEN_BEDROCK"
+            ):
+                model_requirements = {"keys_in_environment": True, "missing_keys": []}
+            elif args.get("aws_access_key_id") and args.get("aws_secret_access_key"):
+                return  # break fast.
+            else:
+                model_requirements = litellm.validate_environment(
+                    model=model, api_key=api_key, api_base=api_base
+                )
         else:
-            #
-            api_key_env_var = f"{provider.upper()}_API_KEY"
-            if api_key:
-                os.environ[api_key_env_var] = api_key
-            model_requirements = litellm.validate_environment(model=model)
+            model_requirements = litellm.validate_environment(
+                model=model, api_key=api_key, api_base=api_base
+            )
+            # validate_environment does not accept api_version, and as a special case for Azure OpenAI Service,
+            # when all the other AZURE environments are set expect AZURE_API_VERSION, validate_environment complains
+            # the missing of it even after the api_version is set.
+            # TODO: There's an open PR in litellm to accept api_version in validate_environment, we can leverage this
+            # change if accepted to ignore the following check.
+            # https://github.com/BerriAI/litellm/pull/13808
+            if (
+                provider == "azure"
+                and ["AZURE_API_VERSION"] == model_requirements["missing_keys"]
+                and api_version is not None
+            ):
+                model_requirements["missing_keys"] = []
+                model_requirements["keys_in_environment"] = True
 
         if not model_requirements["keys_in_environment"]:
             raise Exception(
                 f"model {model} requires the following environment variables: {model_requirements['missing_keys']}"
             )
 
-    def _strip_model_prefix(self) -> str:
+    def _get_model_name_variants_for_lookup(self) -> list[str]:
         """
-        Helper function to strip 'openai/' prefix from model name if it exists.
-        model cost is taken from here which does not have the openai prefix
-        https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json
+        Generate model name variants to try when looking up in litellm.model_cost.
+        Returns a list of names to try in order: exact, lowercase, without prefix, etc.
         """
-        model_name = self.model
-        prefixes = ["openai/", "bedrock/", "vertex_ai/", "anthropic/"]
+        names_to_try = [self.model, self.model.lower()]
 
-        for prefix in prefixes:
-            if model_name.startswith(prefix):
-                return model_name[len(prefix) :]
+        # If there's a prefix, also try without it
+        if "/" in self.model:
+            base_model = self.model.split("/", 1)[1]
+            names_to_try.extend([base_model, base_model.lower()])
 
-        return model_name
-
-    # this unfortunately does not seem to work for azure if the deployment name is not a well-known model name
-    # if not litellm.supports_function_calling(model=model):
-    #     raise Exception(f"model {model} does not support function calling. You must use HolmesGPT with a model that supports function calling.")
+        # Remove duplicates while preserving order (dict.fromkeys maintains insertion order in Python 3.7+)
+        return list(dict.fromkeys(names_to_try))
 
     def get_context_window_size(self) -> int:
+        if self.max_context_size:
+            return self.max_context_size
+
         if OVERRIDE_MAX_CONTENT_SIZE:
             logging.debug(
                 f"Using override OVERRIDE_MAX_CONTENT_SIZE {OVERRIDE_MAX_CONTENT_SIZE}"
             )
             return OVERRIDE_MAX_CONTENT_SIZE
 
-        model_name = os.environ.get("MODEL_TYPE", self._strip_model_prefix())
-        try:
-            return litellm.model_cost[model_name]["max_input_tokens"]
-        except Exception:
-            logging.warning(
-                f"Couldn't find model's name {model_name} in litellm's model list, fallback to 128k tokens for max_input_tokens"
-            )
-            return 128000
+        # Try each name variant
+        for name in self._get_model_name_variants_for_lookup():
+            try:
+                return litellm.model_cost[name]["max_input_tokens"]
+            except Exception:
+                continue
+
+        # Log which lookups we tried
+        logging.warning(
+            f"Couldn't find model {self.model} in litellm's model list (tried: {', '.join(self._get_model_name_variants_for_lookup())}), "
+            f"using default {FALLBACK_CONTEXT_WINDOW_SIZE} tokens for max_input_tokens. "
+            f"To override, set OVERRIDE_MAX_CONTENT_SIZE environment variable to the correct value for your model."
+        )
+        return FALLBACK_CONTEXT_WINDOW_SIZE
 
     @sentry_sdk.trace
-    def count_tokens_for_message(self, messages: list[dict]) -> int:
-        total_token_count = 0
+    def count_tokens(
+        self, messages: list[dict], tools: Optional[list[dict[str, Any]]] = None
+    ) -> TokenCountMetadata:
+        # TODO: Add a recount:bool flag to save time. When the flag is false, reuse 'message["token_count"]' for individual messages.
+        # It's only necessary to recount message tokens at the beginning of a session because the LLM model may have changed.
+        # Changing the model requires recounting tokens because the tokenizer may be different
+        total_tokens = 0
+        tools_tokens = 0
+        system_tokens = 0
+        assistant_tokens = 0
+        user_tokens = 0
+        other_tokens = 0
+        tools_to_call_tokens = 0
         for message in messages:
-            if "token_count" in message and message["token_count"]:
-                total_token_count += message["token_count"]
+            # count message tokens individually because it gives us fine grain information about each tool call/message etc.
+            # However be aware that the sum of individual message tokens is not equal to the overall messages token
+            token_count = litellm.token_counter(  # type: ignore
+                model=self.model, messages=[message]
+            )
+            message["token_count"] = token_count
+            role = message.get("role")
+            if role == "system":
+                system_tokens += token_count
+            elif role == "user":
+                user_tokens += token_count
+            elif role == "tool":
+                tools_tokens += token_count
+            elif role == "assistant":
+                assistant_tokens += token_count
             else:
-                # message can be counted by this method only if message contains a "content" key
-                if "content" in message:
-                    if isinstance(message["content"], str):
-                        message_to_count = [
-                            {"type": "text", "text": message["content"]}
-                        ]
-                    elif isinstance(message["content"], list):
-                        message_to_count = [
-                            {"type": "text", "text": json.dumps(message["content"])}
-                        ]
-                    elif isinstance(message["content"], dict):
-                        if "type" not in message["content"]:
-                            message_to_count = [
-                                {"type": "text", "text": json.dumps(message["content"])}
-                            ]
-                    token_count = litellm.token_counter(
-                        model=self.model, messages=message_to_count
-                    )
-                    message["token_count"] = token_count
-                    total_token_count += token_count
-        return total_token_count
+                # although this should not be needed,
+                # it is defensive code so that all tokens are accounted for
+                # and can potentially make debugging easier
+                other_tokens += token_count
+
+        messages_token_count_without_tools = litellm.token_counter(  # type: ignore
+            model=self.model, messages=messages
+        )
+
+        total_tokens = litellm.token_counter(  # type: ignore
+            model=self.model,
+            messages=messages,
+            tools=tools,  # type: ignore
+        )
+        tools_to_call_tokens = max(0, total_tokens - messages_token_count_without_tools)
+
+        return TokenCountMetadata(
+            total_tokens=total_tokens,
+            system_tokens=system_tokens,
+            user_tokens=user_tokens,
+            tools_tokens=tools_tokens,
+            tools_to_call_tokens=tools_to_call_tokens,
+            other_tokens=other_tokens,
+            assistant_tokens=assistant_tokens,
+        )
+
+    def get_litellm_corrected_name_for_robusta_ai(self) -> str:
+        if self.is_robusta_model:
+            # For robusta models, self.model is the underlying provider/model used by Robusta AI
+            # To avoid litellm modifying the API URL according to the provider, the provider name
+            # is replaced with 'openai/' just before doing a completion() call
+            # Cf. https://docs.litellm.ai/docs/providers/openai_compatible
+            split_model_name = self.model.split("/")
+            return (
+                split_model_name[0]
+                if len(split_model_name) == 1
+                else f"openai/{split_model_name[1]}"
+            )
+        else:
+            return self.model
 
     def completion(
         self,
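
count_tokens() replaces count_tokens_for_message() and returns a per-role breakdown instead of a single integer; the cost of the tool definitions themselves is derived by counting the messages with and without tools and taking the difference. A hedged usage sketch (the tool schema and prompts are invented for illustration, and an OPENAI_API_KEY is assumed to be available so check_llm() passes):

    import os
    from holmes.core.llm import DefaultLLM

    llm = DefaultLLM(model="gpt-4o", api_key=os.environ.get("OPENAI_API_KEY"))
    tools = [
        {
            "type": "function",
            "function": {
                "name": "kubectl_describe",  # hypothetical tool, for illustration only
                "description": "Describe a Kubernetes resource",
                "parameters": {
                    "type": "object",
                    "properties": {"name": {"type": "string"}},
                },
            },
        }
    ]
    usage = llm.count_tokens(
        messages=[
            {"role": "system", "content": "You are a helpful SRE assistant."},
            {"role": "user", "content": "Why is my pod crash looping?"},
        ],
        tools=tools,
    )
    print(usage.total_tokens, usage.system_tokens, usage.user_tokens, usage.tools_to_call_tokens)
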
@@ -219,6 +384,9 @@ class DefaultLLM(LLM):
         if THINKING:
             self.args.setdefault("thinking", json.loads(THINKING))
 
+        if EXTRA_HEADERS:
+            self.args.setdefault("extra_headers", json.loads(EXTRA_HEADERS))
+
         if self.args.get("thinking", None):
             litellm.modify_params = True
 
@@ -234,9 +402,13 @@ class DefaultLLM(LLM):
 
         # Get the litellm module to use (wrapped or unwrapped)
         litellm_to_use = self.tracer.wrap_llm(litellm) if self.tracer else litellm
+
+        litellm_model_name = self.get_litellm_corrected_name_for_robusta_ai()
         result = litellm_to_use.completion(
-            model=self.model,
+            model=litellm_model_name,
             api_key=self.api_key,
+            base_url=self.api_base,
+            api_version=self.api_version,
             messages=messages,
             response_format=response_format,
             drop_params=drop_params,
@@ -254,20 +426,33 @@ class DefaultLLM(LLM):
             raise Exception(f"Unexpected type returned by the LLM {type(result)}")
 
     def get_maximum_output_token(self) -> int:
+        max_output_tokens = floor(min(64000, self.get_context_window_size() / 5))
+
         if OVERRIDE_MAX_OUTPUT_TOKEN:
             logging.debug(
                 f"Using OVERRIDE_MAX_OUTPUT_TOKEN {OVERRIDE_MAX_OUTPUT_TOKEN}"
             )
             return OVERRIDE_MAX_OUTPUT_TOKEN
 
-        model_name = os.environ.get("MODEL_TYPE", self._strip_model_prefix())
-        try:
-            return litellm.model_cost[model_name]["max_output_tokens"]
-        except Exception:
-            logging.warning(
-                f"Couldn't find model's name {model_name} in litellm's model list, fallback to 4096 tokens for max_output_tokens"
-            )
-            return 4096
+        # Try each name variant
+        for name in self._get_model_name_variants_for_lookup():
+            try:
+                litellm_max_output_tokens = litellm.model_cost[name][
+                    "max_output_tokens"
+                ]
+                if litellm_max_output_tokens < max_output_tokens:
+                    max_output_tokens = litellm_max_output_tokens
+                return max_output_tokens
+            except Exception:
+                continue
+
+        # Log which lookups we tried
+        logging.warning(
+            f"Couldn't find model {self.model} in litellm's model list (tried: {', '.join(self._get_model_name_variants_for_lookup())}), "
+            f"using {max_output_tokens} tokens for max_output_tokens. "
+            f"To override, set OVERRIDE_MAX_OUTPUT_TOKEN environment variable to the correct value for your model."
+        )
+        return max_output_tokens
 
     def _add_cache_control_to_last_message(
         self, messages: List[Dict[str, Any]]
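
Worked through with illustrative numbers: a model with a 200,000-token context window gets floor(min(64000, 200000 / 5)) = 40,000 output tokens; a 1,000,000-token window is clamped to 64,000; and if litellm.model_cost reports a smaller max_output_tokens for the model, that smaller value wins. OVERRIDE_MAX_OUTPUT_TOKEN still short-circuits all of this.

    from math import floor

    # Same formula as above, evaluated for a few hypothetical context sizes.
    for ctx in (128_000, 200_000, 1_000_000):
        print(ctx, floor(min(64_000, ctx / 5)))  # 25600, 40000, 64000
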
@@ -276,6 +461,12 @@ class DefaultLLM(LLM):
         Add cache_control to the last non-user message for Anthropic prompt caching.
         Removes any existing cache_control from previous messages to avoid accumulation.
         """
+        # Skip cache_control for VertexAI/Gemini models as they don't support it with tools
+        if self.model and (
+            "vertex" in self.model.lower() or "gemini" in self.model.lower()
+        ):
+            return
+
         # First, remove any existing cache_control from all messages
         for msg in messages:
             content = msg.get("content")
@@ -305,7 +496,7 @@ class DefaultLLM(LLM):
         if content is None:
             return
 
-        if isinstance(content, str):
+        if isinstance(content, str) and content:
             # Convert string to structured format with cache_control
             target_msg["content"] = [
                 {
@@ -325,3 +516,213 @@ class DefaultLLM(LLM):
             logging.debug(
                 f"Added cache_control to {target_msg.get('role')} message (structured content)"
             )
+
+
+class LLMModelRegistry:
+    def __init__(self, config: "Config", dal: SupabaseDal) -> None:
+        self.config = config
+        self._llms: dict[str, ModelEntry] = {}
+        self._default_robusta_model = None
+        self.dal = dal
+
+        self._init_models()
+
+    @property
+    def default_robusta_model(self) -> Optional[str]:
+        return self._default_robusta_model
+
+    def _init_models(self):
+        self._llms = self._parse_models_file(MODEL_LIST_FILE_LOCATION)
+
+        if self._should_load_robusta_ai():
+            self.configure_robusta_ai_model()
+
+        if self._should_load_config_model():
+            self._llms[self.config.model] = self._create_model_entry(
+                model=self.config.model,
+                model_name=self.config.model,
+                base_url=self.config.api_base,
+                is_robusta_model=False,
+                api_key=self.config.api_key,
+                api_version=self.config.api_version,
+            )
+
+    def _should_load_config_model(self) -> bool:
+        if self.config.model is not None:
+            return True
+
+        # backward compatibility - in the past config.model was set by default to gpt-4o.
+        # so we need to check if the user has set an OPENAI_API_KEY to load the config model.
+        has_openai_key = os.environ.get("OPENAI_API_KEY")
+        if has_openai_key:
+            self.config.model = "gpt-4.1"
+            return True
+
+        return False
+
+    def configure_robusta_ai_model(self) -> None:
+        try:
+            if not self.config.cluster_name or not LOAD_ALL_ROBUSTA_MODELS:
+                self._load_default_robusta_config()
+                return
+
+            if not self.dal.account_id or not self.dal.enabled:
+                self._load_default_robusta_config()
+                return
+
+            account_id, token = self.dal.get_ai_credentials()
+
+            robusta_models: RobustaModelsResponse | None = fetch_robusta_models(
+                account_id, token
+            )
+            if not robusta_models or not robusta_models.models:
+                self._load_default_robusta_config()
+                return
+
+            default_model = None
+            for model_name, model_data in robusta_models.models.items():
+                logging.info(f"Loading Robusta AI model: {model_name}")
+                self._llms[model_name] = self._create_robusta_model_entry(
+                    model_name=model_name, model_data=model_data
+                )
+                if model_data.is_default:
+                    default_model = model_name
+
+            if default_model:
+                logging.info(f"Setting default Robusta AI model to: {default_model}")
+                self._default_robusta_model: str = default_model  # type: ignore
+
+        except Exception:
+            logging.exception("Failed to get all robusta models")
+            # fallback to default behavior
+            self._load_default_robusta_config()
+
+    def _load_default_robusta_config(self):
+        if self._should_load_robusta_ai():
+            logging.info("Loading default Robusta AI model")
+            self._llms[ROBUSTA_AI_MODEL_NAME] = ModelEntry(
+                name=ROBUSTA_AI_MODEL_NAME,
+                model="gpt-4o",  # TODO: tech debt, this isn't really
+                base_url=ROBUSTA_API_ENDPOINT,
+                is_robusta_model=True,
+            )
+            self._default_robusta_model = ROBUSTA_AI_MODEL_NAME
+
+    def _should_load_robusta_ai(self) -> bool:
+        if not self.config.should_try_robusta_ai:
+            return False
+
+        # ROBUSTA_AI were set in the env vars, so we can use it directly
+        if ROBUSTA_AI is not None:
+            return ROBUSTA_AI
+
+        # MODEL is set in the env vars, e.g. the user is using a custom model
+        # so we don't need to load the robusta AI model and keep the behavior backward compatible
+        if "MODEL" in os.environ:
+            return False
+
+        # if the user has provided a model list, we don't need to load the robusta AI model
+        if self._llms:
+            return False
+
+        return True
+
+    def get_model_params(self, model_key: Optional[str] = None) -> ModelEntry:
+        if not self._llms:
+            raise Exception("No llm models were loaded")
+
+        if model_key:
+            model_params = self._llms.get(model_key)
+            if model_params is not None:
+                logging.info(f"Using selected model: {model_key}")
+                return model_params.copy()
+
+            logging.error(f"Couldn't find model: {model_key} in model list")
+
+        if self._default_robusta_model:
+            model_params = self._llms.get(self._default_robusta_model)
+            if model_params is not None:
+                logging.info(
+                    f"Using default Robusta AI model: {self._default_robusta_model}"
+                )
+                return model_params.copy()
+
+            logging.error(
+                f"Couldn't find default Robusta AI model: {self._default_robusta_model} in model list"
+            )
+
+        model_key, first_model_params = next(iter(self._llms.items()))
+        logging.debug(f"Using first available model: {model_key}")
+        return first_model_params.copy()
+
+    def get_llm(self, name: str) -> LLM:  # TODO: fix logic
+        return self._llms[name]  # type: ignore
+
+    @property
+    def models(self) -> dict[str, ModelEntry]:
+        return self._llms
+
+    def _parse_models_file(self, path: str) -> dict[str, ModelEntry]:
+        models = load_yaml_file(path, raise_error=False, warn_not_found=False)
+        for _, params in models.items():
+            params = replace_env_vars_values(params)
+
+        llms = {}
+        for model_name, params in models.items():
+            llms[model_name] = ModelEntry.model_validate(params)
+
+        return llms
+
+    def _create_robusta_model_entry(
+        self, model_name: str, model_data: RobustaModel
+    ) -> ModelEntry:
+        entry = self._create_model_entry(
+            model=model_data.model,
+            model_name=model_name,
+            base_url=f"{ROBUSTA_API_ENDPOINT}/llm/{model_name}",
+            is_robusta_model=True,
+        )
+        entry.custom_args = model_data.holmes_args or {}  # type: ignore[assignment]
+        return entry
+
+    def _create_model_entry(
+        self,
+        model: str,
+        model_name: str,
+        base_url: Optional[str] = None,
+        is_robusta_model: Optional[bool] = None,
+        api_key: Optional[SecretStr] = None,
+        api_base: Optional[str] = None,
+        api_version: Optional[str] = None,
+    ) -> ModelEntry:
+        return ModelEntry(
+            name=model_name,
+            model=model,
+            base_url=base_url,
+            is_robusta_model=is_robusta_model,
+            api_key=api_key,
+            api_base=api_base,
+            api_version=api_version,
+        )
+
+
+def get_llm_usage(
+    llm_response: Union[ModelResponse, CustomStreamWrapper, TextCompletionResponse],
+) -> dict:
+    usage: dict = {}
+    if (
+        (
+            isinstance(llm_response, ModelResponse)
+            or isinstance(llm_response, TextCompletionResponse)
+        )
+        and hasattr(llm_response, "usage")
+        and llm_response.usage
+    ):  # type: ignore
+        usage["prompt_tokens"] = llm_response.usage.prompt_tokens  # type: ignore
+        usage["completion_tokens"] = llm_response.usage.completion_tokens  # type: ignore
+        usage["total_tokens"] = llm_response.usage.total_tokens  # type: ignore
+    elif isinstance(llm_response, CustomStreamWrapper):
+        complete_response = litellm.stream_chunk_builder(chunks=llm_response)  # type: ignore
+        if complete_response:
+            return get_llm_usage(complete_response)
+    return usage
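
_parse_models_file() reads MODEL_LIST_FILE_LOCATION (default /etc/holmes/config/model_list.yaml) as a mapping of model names to ModelEntry fields, expanding environment-variable references with replace_env_vars_values() before validation. A hedged sketch of such a configuration, shown here as the dict that loading the YAML file would produce and validated the same way the registry does; every name, URL and key below is a placeholder, not a shipped default:

    from holmes.core.llm import ModelEntry

    raw_models = {
        "azure-prod": {
            "model": "azure/my-gpt4o-deployment",
            "api_base": "https://example.openai.azure.com",
            "api_version": "2024-02-15-preview",
            "api_key": "placeholder-key",  # real files substitute this from the environment
        },
        "bedrock-sonnet": {
            "model": "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0",
        },
    }
    models = {name: ModelEntry.model_validate(params) for name, params in raw_models.items()}
    print(sorted(models))  # ['azure-prod', 'bedrock-sonnet']

get_llm_usage() at the bottom of the file extracts prompt_tokens, completion_tokens and total_tokens from a ModelResponse or TextCompletionResponse, and for a streaming CustomStreamWrapper it first reassembles the chunks with litellm.stream_chunk_builder() and recurses on the result.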