holmesgpt 0.13.1__py3-none-any.whl → 0.13.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. holmes/__init__.py +1 -1
  2. holmes/common/env_vars.py +7 -0
  3. holmes/config.py +3 -1
  4. holmes/core/conversations.py +0 -11
  5. holmes/core/investigation.py +0 -6
  6. holmes/core/llm.py +60 -1
  7. holmes/core/prompt.py +0 -2
  8. holmes/core/supabase_dal.py +2 -2
  9. holmes/core/todo_tasks_formatter.py +51 -0
  10. holmes/core/tool_calling_llm.py +166 -91
  11. holmes/core/tools.py +20 -4
  12. holmes/interactive.py +63 -2
  13. holmes/main.py +0 -1
  14. holmes/plugins/prompts/_general_instructions.jinja2 +3 -1
  15. holmes/plugins/prompts/investigation_procedure.jinja2 +3 -13
  16. holmes/plugins/toolsets/__init__.py +5 -1
  17. holmes/plugins/toolsets/argocd.yaml +1 -1
  18. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +18 -6
  19. holmes/plugins/toolsets/aws.yaml +9 -5
  20. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +3 -1
  21. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +3 -1
  22. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -1
  23. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -1
  24. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +3 -1
  25. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +3 -1
  26. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +3 -1
  27. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +3 -1
  28. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +3 -1
  29. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +3 -1
  30. holmes/plugins/toolsets/bash/bash_toolset.py +31 -20
  31. holmes/plugins/toolsets/confluence.yaml +1 -1
  32. holmes/plugins/toolsets/coralogix/api.py +3 -1
  33. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +4 -4
  34. holmes/plugins/toolsets/coralogix/utils.py +41 -14
  35. holmes/plugins/toolsets/datadog/datadog_api.py +45 -2
  36. holmes/plugins/toolsets/datadog/datadog_general_instructions.jinja2 +208 -0
  37. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +43 -0
  38. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +12 -9
  39. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +722 -0
  40. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +17 -6
  41. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +15 -7
  42. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +6 -2
  43. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +9 -3
  44. holmes/plugins/toolsets/docker.yaml +1 -1
  45. holmes/plugins/toolsets/git.py +15 -5
  46. holmes/plugins/toolsets/grafana/toolset_grafana.py +25 -4
  47. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +4 -4
  48. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +5 -3
  49. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +299 -32
  50. holmes/plugins/toolsets/helm.yaml +1 -1
  51. holmes/plugins/toolsets/internet/internet.py +4 -2
  52. holmes/plugins/toolsets/internet/notion.py +4 -2
  53. holmes/plugins/toolsets/investigator/core_investigation.py +5 -17
  54. holmes/plugins/toolsets/investigator/investigator_instructions.jinja2 +1 -5
  55. holmes/plugins/toolsets/kafka.py +19 -7
  56. holmes/plugins/toolsets/kubernetes.yaml +5 -5
  57. holmes/plugins/toolsets/kubernetes_logs.py +4 -4
  58. holmes/plugins/toolsets/kubernetes_logs.yaml +1 -1
  59. holmes/plugins/toolsets/logging_utils/logging_api.py +15 -2
  60. holmes/plugins/toolsets/mcp/toolset_mcp.py +3 -1
  61. holmes/plugins/toolsets/newrelic.py +8 -4
  62. holmes/plugins/toolsets/opensearch/opensearch.py +13 -5
  63. holmes/plugins/toolsets/opensearch/opensearch_logs.py +4 -4
  64. holmes/plugins/toolsets/opensearch/opensearch_traces.py +9 -6
  65. holmes/plugins/toolsets/prometheus/prometheus.py +193 -82
  66. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +7 -3
  67. holmes/plugins/toolsets/robusta/robusta.py +10 -4
  68. holmes/plugins/toolsets/runbook/runbook_fetcher.py +4 -2
  69. holmes/plugins/toolsets/servicenow/servicenow.py +9 -3
  70. holmes/plugins/toolsets/slab.yaml +1 -1
  71. {holmesgpt-0.13.1.dist-info → holmesgpt-0.13.2.dist-info}/METADATA +3 -2
  72. {holmesgpt-0.13.1.dist-info → holmesgpt-0.13.2.dist-info}/RECORD +75 -72
  73. holmes/core/todo_manager.py +0 -88
  74. {holmesgpt-0.13.1.dist-info → holmesgpt-0.13.2.dist-info}/LICENSE.txt +0 -0
  75. {holmesgpt-0.13.1.dist-info → holmesgpt-0.13.2.dist-info}/WHEEL +0 -0
  76. {holmesgpt-0.13.1.dist-info → holmesgpt-0.13.2.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/coralogix/utils.py
@@ -20,9 +20,10 @@ class CoralogixQueryResult(BaseModel):
 
 
 class CoralogixLabelsConfig(BaseModel):
-    pod: str = "kubernetes.pod_name"
-    namespace: str = "kubernetes.namespace_name"
-    log_message: str = "log"
+    pod: str = "resource.attributes.k8s.pod.name"
+    namespace: str = "resource.attributes.k8s.namespace.name"
+    log_message: str = "logRecord.body"
+    timestamp: str = "logRecord.attributes.time"
 
 
 class CoralogixLogsMethodology(str, Enum):
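The new defaults resolve pod, namespace, message, and timestamp from OTel-style nested paths instead of flat keys. As a minimal sketch (assuming direct construction of the pydantic model; how the toolset loads this config is outside this hunk), an account still shipping the old flat schema could presumably override the fields:

```python
from holmes.plugins.toolsets.coralogix.utils import CoralogixLabelsConfig

# New defaults: OTel-style nested paths.
otel_labels = CoralogixLabelsConfig()

# Hypothetical override reproducing the pre-0.13.2 flat schema
# (the "time" key matches what the previous code read for timestamps).
legacy_labels = CoralogixLabelsConfig(
    pod="kubernetes.pod_name",
    namespace="kubernetes.namespace_name",
    log_message="log",
    timestamp="time",
)
```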
@@ -78,24 +79,43 @@ def normalize_datetime(date_str: Optional[str]) -> str:
     return date_str
 
 
+def extract_field(data_obj: dict[str, Any], field: str):
+    """returns a nested field from a dict
+    e.g. extract_field({"parent": {"child": "value"}}, "parent.child") => value
+    """
+    current_object: Any = data_obj
+    fields = field.split(".")
+
+    for field in fields:
+        if not current_object:
+            return None
+        if isinstance(current_object, dict):
+            current_object = current_object.get(field)
+        else:
+            return None
+
+    return current_object
+
+
 def flatten_structured_log_entries(
     log_entries: List[Dict[str, Any]],
+    labels_config: CoralogixLabelsConfig,
 ) -> List[FlattenedLog]:
     flattened_logs = []
     for log_entry in log_entries:
         try:
-            user_data = json.loads(log_entry.get("userData", "{}"))
-            timestamp = normalize_datetime(user_data.get("time"))
-            log_message = user_data.get("log", "")
-            if log_message:
+            userData = json.loads(log_entry.get("userData", "{}"))
+            log_message = extract_field(userData, labels_config.log_message)
+            timestamp = extract_field(userData, labels_config.timestamp)
+            if not log_message or not timestamp:
+                log_message = json.dumps(userData)
+            else:
                 flattened_logs.append(
                     FlattenedLog(timestamp=timestamp, log_message=log_message)
                 )  # Store as tuple for sorting
 
         except json.JSONDecodeError:
-            logging.error(
-                f"Failed to decode userData JSON: {log_entry.get('userData')}"
-            )
+            logging.error(f"Failed to decode userData JSON: {json.dumps(log_entry)}")
     return flattened_logs
 
 
@@ -107,14 +127,16 @@ def stringify_flattened_logs(log_entries: List[FlattenedLog]) -> str:
     return "\n".join(formatted_logs) if formatted_logs else "No logs found."
 
 
-def parse_json_objects(json_objects: List[Dict[str, Any]]) -> List[FlattenedLog]:
+def parse_json_objects(
+    json_objects: List[Dict[str, Any]], labels_config: CoralogixLabelsConfig
+) -> List[FlattenedLog]:
     """Extracts timestamp and log values from parsed JSON objects, sorted in ascending order (oldest first)."""
     logs: List[FlattenedLog] = []
 
     for data in json_objects:
         if isinstance(data, dict) and "result" in data and "results" in data["result"]:
             logs += flatten_structured_log_entries(
-                log_entries=data["result"]["results"]
+                log_entries=data["result"]["results"], labels_config=labels_config
             )
         elif isinstance(data, dict) and data.get("warning"):
             logging.info(
@@ -128,13 +150,18 @@ def parse_json_objects(json_objects: List[Dict[str, Any]]) -> List[FlattenedLog]:
     return logs
 
 
-def parse_logs(raw_logs: str) -> List[FlattenedLog]:
+def parse_logs(
+    raw_logs: str,
+    labels_config: CoralogixLabelsConfig,
+) -> List[FlattenedLog]:
     """Processes the HTTP response and extracts only log outputs."""
     try:
         json_objects = parse_json_lines(raw_logs)
         if not json_objects:
             raise Exception("No valid JSON objects found.")
-        return parse_json_objects(json_objects)
+        return parse_json_objects(
+            json_objects=json_objects, labels_config=labels_config
+        )
     except Exception as e:
         logging.error(
             f"Unexpected error in format_logs for a coralogix API response: {str(e)}"
holmes/plugins/toolsets/datadog/datadog_api.py
@@ -1,5 +1,6 @@
+import json
 import logging
-from typing import Any, Optional, Dict
+from typing import Any, Optional, Dict, Union
 import requests  # type: ignore
 from pydantic import AnyUrl, BaseModel
 from requests.structures import CaseInsensitiveDict  # type: ignore
@@ -145,6 +146,19 @@ def execute_paginated_datadog_http_request(
     return data, cursor
 
 
+def sanitize_headers(headers: Union[dict, CaseInsensitiveDict]) -> dict:
+    try:
+        return {
+            k: v
+            if ("key" not in k.lower() and "key" not in v.lower())
+            else "[REDACTED]"
+            for k, v in headers.items()
+        }
+    except (AttributeError, TypeError):
+        # Return empty dict for mock objects or other non-dict types
+        return {}
+
+
 def execute_datadog_http_request(
     url: str,
     headers: dict,
@@ -152,6 +166,16 @@
     timeout: int,
     method: str = "POST",
 ) -> Any:
+    # Log the request details
+    logging.info("Datadog API Request:")
+    logging.info(f" Method: {method}")
+    logging.info(f" URL: {url}")
+    logging.info(f" Headers: {json.dumps(sanitize_headers(headers), indent=2)}")
+    logging.info(
+        f" {'Params' if method == 'GET' else 'Payload'}: {json.dumps(payload_or_params, indent=2)}"
+    )
+    logging.info(f" Timeout: {timeout}s")
+
     if method == "GET":
         response = requests.get(
             url, headers=headers, params=payload_or_params, timeout=timeout
@@ -161,10 +185,29 @@
             url, headers=headers, json=payload_or_params, timeout=timeout
         )
 
+    # Log the response details
+    logging.info("Datadog API Response:")
+    logging.info(f" Status Code: {response.status_code}")
+    logging.info(f" Response Headers: {dict(sanitize_headers(response.headers))}")
+
     if response.status_code == 200:
-        return response.json()
+        response_data = response.json()
+        # Log response size but not full content (could be large)
+        if isinstance(response_data, dict):
+            logging.info(f" Response Keys: {list(response_data.keys())}")
+            if "data" in response_data:
+                data_len = (
+                    len(response_data["data"])
+                    if isinstance(response_data["data"], list)
+                    else 1
+                )
+                logging.info(f" Data Items Count: {data_len}")
+        else:
+            logging.info(f" Response Type: {type(response_data).__name__}")
+        return response_data
 
     else:
+        logging.error(f" Error Response Body: {response.text}")
         raise DataDogRequestError(
             payload=payload_or_params,
             status_code=response.status_code,
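`sanitize_headers` keeps the new request/response logging above from leaking credentials: any header whose name or value contains "key" is replaced with "[REDACTED]". A quick sketch of the behavior (the header values are placeholders; DD-API-KEY and DD-APPLICATION-KEY are the standard Datadog auth headers):

```python
from holmes.plugins.toolsets.datadog.datadog_api import sanitize_headers

headers = {
    "DD-API-KEY": "abc123",              # name contains "key" -> redacted
    "DD-APPLICATION-KEY": "def456",      # name contains "key" -> redacted
    "Content-Type": "application/json",  # left as-is
}

print(sanitize_headers(headers))
# {'DD-API-KEY': '[REDACTED]', 'DD-APPLICATION-KEY': '[REDACTED]', 'Content-Type': 'application/json'}
```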
holmes/plugins/toolsets/datadog/datadog_general_instructions.jinja2 (new file)
@@ -0,0 +1,208 @@
+## Datadog General API Tools Usage Guide
+
+### When to Use This Toolset
+
+**PROACTIVELY use the Datadog general toolset when investigating issues to gather comprehensive observability data.**
+
+**Use Datadog for Historical Context When Needed, or check live data when needed:**
+- **When checking current status**: Use current time ranges for real-time monitoring
+- **When investigating past issues**: If asked about problems from yesterday, last week, etc.
+- **When finding root causes**: Look at events/monitors from BEFORE an issue started
+- **When Kubernetes data is missing**: Pods may have been deleted, events expired, etc.
+
+This toolset provides access to critical Datadog resources that can help identify root causes, or health status:
+- **Monitors**: Check alert history, thresholds, and monitor states
+- **Incidents**: Review recent incidents and their timelines
+- **Dashboards**: Access pre-configured dashboards for system overview
+- **SLOs**: Verify service level objectives and error budgets
+- **Events**: Correlate deployments, configuration changes, and system events
+- **Synthetics**: Check endpoint availability and performance
+- **Security**: Review security signals and alerts
+- **Hosts**: Get infrastructure-level information
+
+### When Historical Data is Important
+
+**Kubernetes limitations that Datadog can address:**
+- Kubernetes events expire after 1 hour by default
+- Deleted pods/deployments leave no trace in the cluster
+- Previous configuration values are not retained
+- Past node issues may be resolved without evidence
+
+**Datadog preserves this context when you need it:**
+- Events from before an incident started
+- Monitor triggers on now-deleted resources
+- Past incidents and their resolutions
+- Deployment and configuration change history
+
+### Investigation Workflow
+
+**1. Determine the appropriate time range based on the request:**
+```
+- For current status: Use recent time windows (last hour, last few minutes)
+- For investigating alerts: Query from before the alert started to understand triggers
+- For past issues: Use the specific timeframe when the issue occurred
+- For root cause analysis: Look at events/changes before the problem began
+```
+
+**2. Check relevant monitors and incidents:**
+```
+- Use `datadog_api_get` with `/api/v1/monitor` to list monitors
+- Use `datadog_api_post_search` with `/api/v2/incidents/search` to find recent incidents
+- Check monitor states to understand alert patterns
+```
+
+**3. Correlate with events when investigating issues:**
+```
+- Query `/api/v1/events` with appropriate time range
+- For root cause: Look for events BEFORE the issue started
+- Events often reveal deployments, config changes, or infrastructure updates
+- Especially useful when Kubernetes resources have been deleted/replaced
+```
+
+**4. Check service health and dependencies:**
+```
+- Use `/api/v2/services` to list services and their states
+- Query `/api/v2/services/{service}/dependencies` to understand service relationships
+- This helps identify cascade failures
+```
+
+**5. Review SLOs for service degradation over time:**
+```
+- Query `/api/v1/slo` to check service level objectives
+- Use `/api/v1/slo/{id}/history` to see historical compliance
+- Identify when degradation started (may be before alerts fired)
+- Check if issues are violating SLO targets
+```
+
+### Common Investigation Patterns
+
+**For Kubernetes Pod/Deployment Issues:**
+1. **When pods are missing/deleted**: Query Datadog for historical data about those pods
+2. **For recurring issues**: Check monitor history for patterns
+3. **For deployment problems**: Look for deployment events around issue time
+4. **When Kubernetes events expired**: Use Datadog events for the same timeframe
+
+**For Application Issues:**
+1. **Adjust time range based on issue**: Current for live issues, historical for past problems
+2. Review monitors: `datadog_api_get` with `/api/v1/monitor` filtering by service
+3. Search incidents: `datadog_api_post_search` with `/api/v2/incidents/search`
+4. For degradation: Check SLO history to identify when it started
+
+**For Infrastructure Issues:**
+1. List hosts: `datadog_api_get` with `/api/v1/hosts` to see host status
+2. Check host details: `datadog_api_get` with `/api/v1/hosts/{hostname}`
+3. Review events: Look for infrastructure changes or maintenance
+4. Check monitors: Find infrastructure-related alerts
+
+**For Performance Issues:**
+1. Review synthetics: `datadog_api_get` with `/api/v1/synthetics/tests` for endpoint monitoring
+2. Check SLO history: Track performance degradation over time
+3. Review dashboards: `datadog_api_get` with `/api/v1/dashboard` for performance dashboards
+4. Correlate with events: Find changes that might impact performance
+
+**For Security Issues:**
+1. Search security signals: `datadog_api_post_search` with `/api/v2/security_monitoring/signals/search`
+2. Review security rules: `datadog_api_get` with `/api/v2/security_monitoring/rules`
+3. Check recent incidents: Look for security-related incidents
+
+### Time Parameters
+
+**Choose time ranges based on the investigation context:**
+- Use query parameters for time ranges:
+- `from`: Start time (Unix timestamp or ISO 8601)
+- `to`: End time (Unix timestamp or ISO 8601)
+- Example: `{"from": "2024-01-01T00:00:00Z", "to": "2024-01-02T00:00:00Z"}`
+- For relative times: `{"from": "-1h"}` for last hour
+- **For root cause analysis**: Query from before the issue started (e.g., if alert fired 2 hours ago, query from "-4h")
+- **For current status**: Use recent time windows (e.g., "-15m" or "-1h")
+- **For historical issues**: Use the specific timeframe when the issue occurred
+
+### Query Examples
+
+**List all monitors with their current state:**
+```
+Tool: datadog_api_get
+Endpoint: /api/v1/monitor
+Query params: {"group_states": "all", "monitor_tags": "env:production"}
+```
+
+**Search for recent incidents:**
+```
+Tool: datadog_api_post_search
+Endpoint: /api/v2/incidents/search
+Body: {
+  "filter": {
+    "created": {
+      "from": "-24h"
+    }
+  },
+  "sort": "-created",
+  "page": {"limit": 10}
+}
+```
+
+**Get events for a specific service:**
+```
+Tool: datadog_api_get
+Endpoint: /api/v1/events
+Query params: {"start": "-3600", "end": "now", "tags": "service:my-service"}
+```
+
+**Check SLO compliance:**
+```
+Tool: datadog_api_get
+Endpoint: /api/v1/slo/{slo_id}/history
+Query params: {"from_ts": 1234567890, "to_ts": 1234567900}
+```
+
+### Best Practices
+
+1. **Always correlate multiple data sources:**
+   - Don't rely on a single metric or log
+   - Cross-reference monitors, events, and incidents
+   - Look for patterns across different data types
+
+2. **Use time windows effectively:**
+   - Start with a broader time range to see patterns
+   - Narrow down once you identify the issue timeframe
+   - Compare with historical data when available
+
+3. **Follow the dependency chain:**
+   - Check upstream services when investigating issues
+   - Use service dependency maps to understand impact
+   - Look for cascade failures
+
+4. **Prioritize based on severity:**
+   - Check critical monitors and P1 incidents first
+   - Review SLO violations for business impact
+   - Focus on customer-facing services
+
+5. **Document findings:**
+   - Note correlations between events and issues
+   - Identify patterns in monitor triggers
+   - Track incident timelines for post-mortems
+
+### Resource Discovery
+
+Use `list_datadog_api_resources` to discover available endpoints:
+- Filter by category: monitors, dashboards, slos, incidents, etc.
+- This helps identify which resources are available for investigation
+- Example: `list_datadog_api_resources` with `{"category": "monitors"}`
+
+### Integration with Other Toolsets
+
+This toolset complements other Datadog toolsets:
+- Use with `datadog/metrics` for detailed metric analysis
+- Combine with `datadog/logs` for log correlation
+- Use alongside `datadog/traces` for distributed tracing
+- Integrate with Kubernetes toolsets for container-level issues
+
+### IMPORTANT: Proactive Usage
+
+**Don't wait for the user to explicitly ask for Datadog data. When investigating any issue:**
+1. Check if there are relevant monitors or incidents
+2. Look for recent events that might be related
+3. Verify service health and SLO compliance
+4. Review any security signals if applicable
+
+This proactive approach often reveals root causes that wouldn't be found through logs or metrics alone.
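The Query Examples in this new instructions file are phrased in terms of the toolset's `datadog_api_get` / `datadog_api_post_search` tools. For reference, the first example corresponds roughly to the following raw call against Datadog's public monitors API; a sketch only, in which the US1 host, the `DD_API_KEY` / `DD_APP_KEY` environment variable names, and the printed fields are assumptions rather than part of the toolset:

```python
import os
import requests

resp = requests.get(
    "https://api.datadoghq.com/api/v1/monitor",  # assumes the US1 Datadog site
    headers={
        "DD-API-KEY": os.environ["DD_API_KEY"],
        "DD-APPLICATION-KEY": os.environ["DD_APP_KEY"],
    },
    params={"group_states": "all", "monitor_tags": "env:production"},
    timeout=30,
)
resp.raise_for_status()
for monitor in resp.json():
    print(monitor["id"], monitor.get("overall_state"), monitor["name"])
```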
holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 (new file)
@@ -0,0 +1,43 @@
+## Datadog Logs Tools Usage Guide
+
+Before running logs queries:
+
+** You are often (but not always) running in a kubernetes environment. So users might ask you questions about kubernetes workloads without explicitly stating their type.
+** When getting ambiguous questions, use kubectl_find_resource to find the resource you are being asked about!
+** Find the involved resource name and kind
+** If you can't figure out what is the type of the resource, ask the user for more information and don't guess
+
+
+### General guideline
+- This toolset is used to read pod logs.
+- Assume the pod should have logs. If logs not found, try to adjust the query
+
+### CRITICAL: Pod Name Resolution Workflow
+
+**When user provides an exact pod name** (e.g., `my-workload-5f9d8b7c4d-x2km9`):
+- FIRST query Datadog directly with that pod name using appropriate tags
+- Do NOT try to verify if the pod exists in Kubernetes first
+- This allows querying historical pods that have been deleted/replaced
+
+**When user provides a generic workload name** (e.g., "my-workload", "nginx", "telemetry-processor"):
+- First use `kubectl_find_resource` to find actual pod names
+- Example: `kubectl_find_resource` with "my-workload" → finds pods like "my-workload-8f8cdfxyz-c7zdr"
+- Then use those specific pod names in Datadog queries
+- Alternative: Use deployment-level tags when appropriate
+
+**Why this matters:**
+- Pod names in Datadog are the actual Kubernetes pod names (with random suffixes)
+- Historical pods that no longer exist in the cluster can still have logs in Datadog
+- Deployment/service names alone are NOT pod names (they need the suffix)
+
+### Time Parameters
+- Use RFC3339 format: `2023-03-01T10:30:00Z`
+- Or relative seconds: `-3600` for 1 hour ago
+- Defaults to 1 hour window if not specified
+
+### Common Investigation Patterns
+
+**For Pod/Container Metrics (MOST COMMON):**
+1. User asks: "Show logs for my-workload"
+2. Use `kubectl_find_resource` → find pod "my-workload-abc123-xyz"
+3. Query Datadog for pod "my-workload-abc123-xyz" logs
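Step 3 of the pattern above maps onto Datadog's Logs Search API once the pod name is resolved. A rough sketch with plain `requests` for context (the `pod_name` tag value, the time range, and the `DD_API_KEY` / `DD_APP_KEY` environment variables are illustrative assumptions; inside HolmesGPT the `datadog/logs` toolset performs the log retrieval for you):

```python
import os
import requests

resp = requests.post(
    "https://api.datadoghq.com/api/v2/logs/events/search",  # assumes the US1 site
    headers={
        "DD-API-KEY": os.environ["DD_API_KEY"],
        "DD-APPLICATION-KEY": os.environ["DD_APP_KEY"],
        "Content-Type": "application/json",
    },
    json={
        "filter": {
            "query": "pod_name:my-workload-abc123-xyz",  # resolved pod name, not the workload name
            "from": "now-1h",
            "to": "now",
        },
        "sort": "timestamp",
        "page": {"limit": 50},
    },
    timeout=30,
)
resp.raise_for_status()
for event in resp.json().get("data", []):
    attrs = event["attributes"]
    print(attrs["timestamp"], attrs.get("message", ""))
```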
holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2
@@ -32,19 +32,22 @@ When investigating metrics-related issues:
 - IMPORTANT: This toolset DOES NOT support promql queries.
 
 ### CRITICAL: Pod Name Resolution Workflow
-When users ask for metrics about a deployment, service, or workload (e.g., "my-workload", "nginx-deployment"):
 
-**ALWAYS follow this two-step process:**
-1. **First**: Use `kubectl_find_resource` to find the actual pod names
-   - Example: `kubectl_find_resource` with "my-workload" finds pods like "my-workload-8f8cdfxyz-c7zdr"
-2. **Then**: Use those specific pod names in Datadog queries
-   - Correct: `container.cpu.usage{pod_name:my-workload-8f8cdfxyz-c7zdr}`
-   - WRONG: `container.cpu.usage{pod_name:my-workload}` This will return no data!
+**When user provides an exact pod name** (e.g., `my-workload-5f9d8b7c4d-x2km9`):
+- Query Datadog directly with that pod name using appropriate metrics and tags
+- Do NOT try to verify if the pod exists in Kubernetes first
+- This allows querying historical pods that have been deleted/replaced
+
+**When user provides a generic workload name** (e.g., "my-workload", "nginx", "telemetry-processor"):
+- First use `kubectl_find_resource` to find actual pod names
+- Example: `kubectl_find_resource` with "my-workload" → finds pods like "my-workload-8f8cdfxyz-c7zdr"
+- Then use those specific pod names in Datadog queries
+- Alternative: Use deployment-level tags when appropriate
 
 **Why this matters:**
 - Pod names in Datadog are the actual Kubernetes pod names (with random suffixes)
-- Deployment/service names are NOT pod names
-- Using deployment names as pod_name filters will always return empty results
+- Historical pods that no longer exist in the cluster can still have metrics in Datadog
+- Deployment/service names alone are NOT pod names (they need the suffix)
 
 ### Time Parameters
 - Use RFC3339 format: `2023-03-01T10:30:00Z`