holmesgpt 0.16.2a0__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. holmes/__init__.py +3 -5
  2. holmes/clients/robusta_client.py +4 -3
  3. holmes/common/env_vars.py +18 -2
  4. holmes/common/openshift.py +1 -1
  5. holmes/config.py +11 -6
  6. holmes/core/conversations.py +30 -13
  7. holmes/core/investigation.py +21 -25
  8. holmes/core/investigation_structured_output.py +3 -3
  9. holmes/core/issue.py +1 -1
  10. holmes/core/llm.py +50 -31
  11. holmes/core/models.py +19 -17
  12. holmes/core/openai_formatting.py +1 -1
  13. holmes/core/prompt.py +47 -2
  14. holmes/core/runbooks.py +1 -0
  15. holmes/core/safeguards.py +4 -2
  16. holmes/core/supabase_dal.py +4 -2
  17. holmes/core/tool_calling_llm.py +102 -141
  18. holmes/core/tools.py +19 -28
  19. holmes/core/tools_utils/token_counting.py +9 -2
  20. holmes/core/tools_utils/tool_context_window_limiter.py +13 -30
  21. holmes/core/tools_utils/tool_executor.py +0 -18
  22. holmes/core/tools_utils/toolset_utils.py +1 -0
  23. holmes/core/toolset_manager.py +37 -2
  24. holmes/core/tracing.py +13 -2
  25. holmes/core/transformers/__init__.py +1 -1
  26. holmes/core/transformers/base.py +1 -0
  27. holmes/core/transformers/llm_summarize.py +3 -2
  28. holmes/core/transformers/registry.py +2 -1
  29. holmes/core/transformers/transformer.py +1 -0
  30. holmes/core/truncation/compaction.py +37 -2
  31. holmes/core/truncation/input_context_window_limiter.py +3 -2
  32. holmes/interactive.py +52 -8
  33. holmes/main.py +17 -37
  34. holmes/plugins/interfaces.py +2 -1
  35. holmes/plugins/prompts/__init__.py +2 -1
  36. holmes/plugins/prompts/_fetch_logs.jinja2 +5 -5
  37. holmes/plugins/prompts/_runbook_instructions.jinja2 +2 -1
  38. holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
  39. holmes/plugins/prompts/conversation_history_compaction.jinja2 +2 -1
  40. holmes/plugins/prompts/generic_ask.jinja2 +0 -2
  41. holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -2
  42. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -2
  43. holmes/plugins/prompts/generic_investigation.jinja2 +0 -2
  44. holmes/plugins/prompts/investigation_procedure.jinja2 +2 -1
  45. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -2
  46. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -2
  47. holmes/plugins/runbooks/__init__.py +32 -3
  48. holmes/plugins/sources/github/__init__.py +4 -2
  49. holmes/plugins/sources/prometheus/models.py +1 -0
  50. holmes/plugins/toolsets/__init__.py +30 -26
  51. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +13 -12
  52. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
  53. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  54. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
  55. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
  56. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
  57. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -12
  58. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +7 -7
  59. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -7
  60. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -5
  61. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
  62. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -7
  63. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +6 -8
  64. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +3 -3
  65. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +3 -3
  66. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +3 -3
  67. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +3 -3
  68. holmes/plugins/toolsets/azure_sql/utils.py +0 -32
  69. holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
  70. holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
  71. holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
  72. holmes/plugins/toolsets/bash/bash_toolset.py +2 -3
  73. holmes/plugins/toolsets/bash/common/bash.py +19 -9
  74. holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
  75. holmes/plugins/toolsets/bash/common/stringify.py +1 -1
  76. holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
  77. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
  78. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
  79. holmes/plugins/toolsets/bash/parse_command.py +12 -13
  80. holmes/plugins/toolsets/connectivity_check.py +124 -0
  81. holmes/plugins/toolsets/coralogix/api.py +132 -119
  82. holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
  83. holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
  84. holmes/plugins/toolsets/coralogix/utils.py +15 -79
  85. holmes/plugins/toolsets/datadog/datadog_api.py +36 -3
  86. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +34 -1
  87. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
  88. holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
  89. holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
  90. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
  91. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +71 -28
  92. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +224 -375
  93. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +67 -36
  94. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +360 -343
  95. holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
  96. holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
  97. holmes/plugins/toolsets/git.py +7 -8
  98. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
  99. holmes/plugins/toolsets/grafana/common.py +2 -30
  100. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +2 -1
  101. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +18 -2
  102. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +92 -18
  103. holmes/plugins/toolsets/grafana/loki_api.py +4 -0
  104. holmes/plugins/toolsets/grafana/toolset_grafana.py +109 -25
  105. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +22 -0
  106. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +201 -33
  107. holmes/plugins/toolsets/grafana/trace_parser.py +3 -2
  108. holmes/plugins/toolsets/internet/internet.py +10 -10
  109. holmes/plugins/toolsets/internet/notion.py +5 -6
  110. holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
  111. holmes/plugins/toolsets/investigator/model.py +3 -1
  112. holmes/plugins/toolsets/json_filter_mixin.py +134 -0
  113. holmes/plugins/toolsets/kafka.py +12 -7
  114. holmes/plugins/toolsets/kubernetes.yaml +260 -30
  115. holmes/plugins/toolsets/kubernetes_logs.py +3 -3
  116. holmes/plugins/toolsets/logging_utils/logging_api.py +16 -6
  117. holmes/plugins/toolsets/mcp/toolset_mcp.py +88 -60
  118. holmes/plugins/toolsets/newrelic/new_relic_api.py +41 -1
  119. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +24 -0
  120. holmes/plugins/toolsets/newrelic/newrelic.py +212 -55
  121. holmes/plugins/toolsets/prometheus/prometheus.py +358 -102
  122. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +11 -3
  123. holmes/plugins/toolsets/rabbitmq/api.py +23 -4
  124. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +5 -5
  125. holmes/plugins/toolsets/robusta/robusta.py +5 -5
  126. holmes/plugins/toolsets/runbook/runbook_fetcher.py +25 -6
  127. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +1 -1
  128. holmes/plugins/toolsets/utils.py +1 -1
  129. holmes/utils/config_utils.py +1 -1
  130. holmes/utils/connection_utils.py +31 -0
  131. holmes/utils/console/result.py +10 -0
  132. holmes/utils/file_utils.py +2 -1
  133. holmes/utils/global_instructions.py +10 -26
  134. holmes/utils/holmes_status.py +4 -3
  135. holmes/utils/log.py +15 -0
  136. holmes/utils/markdown_utils.py +2 -3
  137. holmes/utils/memory_limit.py +58 -0
  138. holmes/utils/sentry_helper.py +23 -0
  139. holmes/utils/stream.py +12 -5
  140. holmes/utils/tags.py +4 -3
  141. holmes/version.py +3 -1
  142. {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +12 -10
  143. holmesgpt-0.18.4.dist-info/RECORD +258 -0
  144. holmes/plugins/toolsets/aws.yaml +0 -80
  145. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -114
  146. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
  147. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -736
  148. holmes/plugins/toolsets/grafana/grafana_api.py +0 -64
  149. holmes/plugins/toolsets/opensearch/__init__.py +0 -0
  150. holmes/plugins/toolsets/opensearch/opensearch.py +0 -250
  151. holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
  152. holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -215
  153. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
  154. holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
  155. holmes/utils/keygen_utils.py +0 -6
  156. holmesgpt-0.16.2a0.dist-info/RECORD +0 -258
  157. holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_ppl_query_docs.jinja2 +0 -0
  158. holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_query_assist.py +2 -2
  159. holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_query_assist_instructions.jinja2 +0 -0
  160. {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/LICENSE +0 -0
  161. {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
  162. {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/datadog/datadog_api.py
@@ -1,16 +1,17 @@
 import json
 import logging
 import re
+import threading
 from datetime import datetime, timedelta, timezone
-from typing import Any, Optional, Dict, Union, Tuple
+from typing import Any, Dict, Optional, Tuple, Union
 from urllib.parse import urlparse, urlunparse
+
 import requests  # type: ignore
 from pydantic import AnyUrl, BaseModel
 from requests.structures import CaseInsensitiveDict  # type: ignore
 from tenacity import retry, retry_if_exception, stop_after_attempt, wait_incrementing
 from tenacity.wait import wait_base
 
-
 START_RETRY_DELAY = (
     5.0  # Initial fallback delay if datadog does not return a reset_time
 )
@@ -22,6 +23,9 @@ RATE_LIMIT_REMAINING_SECONDS_HEADER = "X-RateLimit-Reset"
 # Cache for OpenAPI spec
 _openapi_spec_cache: Dict[str, Any] = {}
 
+# Global lock for Datadog API requests to prevent concurrent calls
+_datadog_request_lock = threading.Lock()
+
 # Relative time pattern (m = minutes, mo = months)
 RELATIVE_TIME_PATTERN = re.compile(r"^-?(\d+)([hdwsy]|min|m|mo)$|^now$", re.IGNORECASE)
 
@@ -237,6 +241,35 @@ def execute_datadog_http_request(
     payload_or_params: dict,
     timeout: int,
     method: str = "POST",
+) -> Any:
+    # From limited testing, issuing just one request at a time is faster because the RATE_LIMIT_REMAINING_SECONDS_HEADER is shorter
+    # Serialize all Datadog API requests to avoid rate limits
+    with _datadog_request_lock:
+        return execute_datadog_http_request_with_retries(
+            url, headers, payload_or_params, timeout, method
+        )
+
+
+@retry(
+    retry=retry_if_http_429_error(),
+    wait=wait_for_retry_after_header(
+        fallback=wait_incrementing(
+            start=START_RETRY_DELAY, increment=INCREMENT_RETRY_DELAY
+        )
+    ),
+    stop=stop_after_attempt(MAX_RETRY_COUNT_ON_RATE_LIMIT),
+    before_sleep=lambda retry_state: logging.warning(
+        f"DataDog API rate limited. Retrying... "
+        f"(attempt {retry_state.attempt_number}/{MAX_RETRY_COUNT_ON_RATE_LIMIT})"
+    ),
+    reraise=True,
+)
+def execute_datadog_http_request_with_retries(
+    url: str,
+    headers: dict,
+    payload_or_params: dict,
+    timeout: int,
+    method: str,
 ) -> Any:
     logging.debug(
         f"Datadog API Request: Method: {method} URL: {url} Headers: {json.dumps(sanitize_headers(headers), indent=2)} {'Params' if method == 'GET' else 'Payload'}: {json.dumps(payload_or_params, indent=2)} Timeout: {timeout}s"
@@ -261,7 +294,7 @@ def execute_datadog_http_request(
         return response_data
 
     else:
-        logging.error(f" Error Response Body: {response.text}")
+        logging.debug(f"Error Response Body: {response.text}")
         raise DataDogRequestError(
             payload=payload_or_params,
             status_code=response.status_code,
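
Taken together, the datadog_api.py changes wrap every request in a two-layer guard: a process-wide lock serializes calls, and tenacity retries HTTP 429s (honoring Datadog's rate-limit reset header, with an incrementing fallback). Below is a minimal, self-contained sketch of the same pattern — the names `call_datadog` and `_is_rate_limited`, and the fixed 5-second wait, are illustrative stand-ins rather than the package's actual helpers:

```python
import threading

import requests
from tenacity import retry, retry_if_exception, stop_after_attempt, wait_fixed

MAX_ATTEMPTS = 5  # illustrative; the real module uses MAX_RETRY_COUNT_ON_RATE_LIMIT

_lock = threading.Lock()  # one in-flight request per process


def _is_rate_limited(exc: BaseException) -> bool:
    """Retry predicate: only retry on HTTP 429 responses."""
    return (
        isinstance(exc, requests.HTTPError)
        and exc.response is not None
        and exc.response.status_code == 429
    )


@retry(
    retry=retry_if_exception(_is_rate_limited),
    wait=wait_fixed(5),  # stand-in for the header-aware wait strategy
    stop=stop_after_attempt(MAX_ATTEMPTS),
    reraise=True,
)
def _request_with_retries(url: str, params: dict, timeout: int) -> dict:
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()  # raises HTTPError on 429, triggering a retry
    return response.json()


def call_datadog(url: str, params: dict, timeout: int = 30) -> dict:
    # Serialize requests so concurrent tool calls don't burn the rate limit.
    with _lock:
        return _request_with_retries(url, params, timeout)
```

Serializing requests looks slower on paper, but as the in-code comment notes, it keeps the rate-limit window short enough that one-at-a-time ends up faster than parallel calls that all hit 429.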
holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2
@@ -44,7 +44,6 @@ Before running logs queries:
 ### Time Parameters
 - Use RFC3339 format: `2023-03-01T10:30:00Z`
 - Or relative seconds: `-3600` for 1 hour ago
-- Defaults to 1 hour window if not specified
 
 ### Common Investigation Patterns
 
@@ -52,3 +51,37 @@
 1. User asks: "Show logs for my-workload"
 2. Use `kubectl_find_resource` → find pod "my-workload-abc123-xyz"
 3. Query Datadog for pod "my-workload-abc123-xyz" logs
+
+
+### Search Query Guidelines
+
+1. Avoid using the @timestamp attribute in search queries (for example, @timestamp:[2025-12-10T01:00:00.000Z TO 2025-12-10T04:00:00.000Z]).
+   Rely on the fetch_datadog_logs start_datetime and end_datetime parameters instead.
+2. Datadog's default Kubernetes tags are *kube_namespace* and *pod_name*; if a user specifies custom tags for their environment, use those in your search queries.
+3. If you saw a useful tag in an earlier fetch_datadog_logs query, reuse it in subsequent queries.
+
+### CRITICAL: Cursor Usage Rules
+**NEVER parallelize cursor-based calls or reuse cursor values!**
+
+Cursors are stateful pointers - each one is single-use and represents a unique position in the data stream.
+
+**WRONG (causes duplicate data):**
+```
+Batch 1 → cursor_A
+Then call Batch 2, 3, 4 ALL with cursor_A in parallel ❌
+Result: Duplicate data, incomplete results
+```
+
+**CORRECT (sequential pagination):**
+```
+Batch 1 → cursor_A
+Wait for response → use cursor_A for Batch 2 → cursor_B
+Wait for response → use cursor_B for Batch 3 → cursor_C
+Result: Complete unique data ✅
+```
+
+**Key Rules:**
+- Each response provides a NEW cursor for the NEXT request
+- NEVER reuse the same cursor value multiple times
+- NEVER make parallel calls with the same cursor
+- Always wait for response before using the returned cursor
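
The cursor rules above translate directly into a sequential loop. A rough sketch, assuming a response dict with a `logs` list and an optional `cursor` key (the exact response shape of `fetch_datadog_logs` is not shown in this diff):

```python
from typing import Callable


def fetch_all_pages(
    fetch_datadog_logs: Callable[..., dict],
    query: str,
    max_batches: int = 5,
) -> list:
    """Sequential cursor pagination: each response's cursor feeds the NEXT call."""
    logs: list = []
    cursor = None
    for _ in range(max_batches):
        # One in-flight call at a time; never reuse a cursor across calls.
        response = fetch_datadog_logs(query=query, cursor=cursor)
        logs.extend(response.get("logs", []))
        cursor = response.get("cursor")  # fresh cursor for the next request
        if not cursor:  # no cursor returned means no more pages
            break
    return logs
```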
holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2
@@ -73,7 +73,7 @@ When investigating metrics-related issues:
 
 # Handling queries results
 * ALWAYS embed the execution results into your answer
-* You only need to embed the partial result in your response. Include the "tool_name" and "random_key". For example: << {"type": "datadogql", "tool_name": "query_datadog_metrics", "random_key": "92jf2hf"} >>
+* You only need to embed the partial result in your response. Include the "tool_name" and "tool_call_id". For example: << {"type": "datadogql", "tool_name": "query_datadog_metrics", "tool_call_id": "92jf2hf"} >>
 * Post processing will parse your response, re-run the query from the tool output and create a chart visible to the user
 * You MUST ensure that the query is successful.
 * ALWAYS embed a DataDog graph in the response. The graph should visualize data related to the incident.
@@ -81,6 +81,6 @@ When investigating metrics-related issues:
 * When embedding multiple graphs, always add line spacing between them
 For example:
 
-<<{"type": "datadogql", "tool_name": "query_datadog_metrics", "random_key": "lBaA"}>>
+<<{"type": "datadogql", "tool_name": "query_datadog_metrics", "tool_call_id": "lBaA"}>>
 
-<<{"type": "datadogql", "tool_name": "query_datadog_metrics", "random_key": "IKtq"}>>
+<<{"type": "datadogql", "tool_name": "query_datadog_metrics", "tool_call_id": "IKtq"}>>
holmes/plugins/toolsets/datadog/datadog_models.py (new file)
@@ -0,0 +1,59 @@
+from enum import Enum
+
+from pydantic import Field
+
+from holmes.plugins.toolsets.datadog.datadog_api import DatadogBaseConfig
+from holmes.plugins.toolsets.logging_utils.logging_api import DEFAULT_LOG_LIMIT
+
+# Constants for RDS toolset
+DEFAULT_TIME_SPAN_SECONDS = 3600
+DEFAULT_TOP_INSTANCES = 10
+
+# Constants for general toolset
+MAX_RESPONSE_SIZE = 10 * 1024 * 1024  # 10MB
+
+
+class DataDogStorageTier(str, Enum):
+    """Storage tier enum for Datadog logs."""
+
+    INDEXES = "indexes"
+    ONLINE_ARCHIVES = "online-archives"
+    FLEX = "flex"
+
+
+# Constants for logs toolset
+DEFAULT_STORAGE_TIERS = [DataDogStorageTier.INDEXES]
+
+
+class DatadogMetricsConfig(DatadogBaseConfig):
+    """Configuration for Datadog metrics toolset."""
+
+    default_limit: int = DEFAULT_LOG_LIMIT
+
+
+class DatadogTracesConfig(DatadogBaseConfig):
+    """Configuration for Datadog traces toolset."""
+
+    indexes: list[str] = ["*"]
+
+
+class DatadogLogsConfig(DatadogBaseConfig):
+    """Configuration for Datadog logs toolset."""
+
+    indexes: list[str] = ["*"]
+    # TODO: storage tier currently only works with the first element; add support for multiple storage tiers.
+    storage_tiers: list[DataDogStorageTier] = Field(
+        default_factory=lambda: [DataDogStorageTier.INDEXES], min_length=1
+    )
+
+    compact_logs: bool = True
+    default_limit: int = DEFAULT_LOG_LIMIT
+
+
+class DatadogGeneralConfig(DatadogBaseConfig):
+    """Configuration for general-purpose Datadog toolset."""
+
+    max_response_size: int = MAX_RESPONSE_SIZE
+    allow_custom_endpoints: bool = (
+        False  # If True, allows endpoints not in whitelist (still filtered for safety)
+    )
holmes/plugins/toolsets/datadog/datadog_url_utils.py (new file)
@@ -0,0 +1,213 @@
+import re
+from typing import Any, Dict, Optional
+from urllib.parse import urlencode, urlparse
+
+from holmes.plugins.toolsets.datadog.datadog_api import convert_api_url_to_app_url
+from holmes.plugins.toolsets.datadog.datadog_models import (
+    DatadogGeneralConfig,
+    DatadogLogsConfig,
+    DatadogMetricsConfig,
+    DatadogTracesConfig,
+)
+
+
+def generate_datadog_metrics_explorer_url(
+    dd_config: DatadogMetricsConfig,
+    query: str,
+    from_time: int,
+    to_time: int,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+
+    params = {
+        "query": query,
+        "from_ts": from_time * 1000,  # seconds -> ms
+        "to_ts": to_time * 1000,  # seconds -> ms
+        "live": "true",
+    }
+
+    return f"{base_url}/metric/explorer?{urlencode(params)}"
+
+
+def generate_datadog_metrics_list_url(
+    dd_config: DatadogMetricsConfig,
+    from_time: int,
+    host: Optional[str] = None,
+    tag_filter: Optional[str] = None,
+    metric_filter: Optional[str] = None,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+
+    params = {}
+    if metric_filter:
+        params["filter"] = metric_filter
+
+    if host:
+        params["host"] = host
+    if tag_filter:
+        params["tag_filter"] = tag_filter
+
+    qs = urlencode(params) if params else ""
+    return f"{base_url}/metric/summary" + (f"?{qs}" if qs else "")
+
+
+def generate_datadog_metric_metadata_url(
+    dd_config: DatadogMetricsConfig,
+    metric_name: str,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+    params = {"metric": metric_name}
+    return f"{base_url}/metric/summary?{urlencode(params)}"
+
+
+def generate_datadog_metric_tags_url(
+    dd_config: DatadogMetricsConfig,
+    metric_name: str,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+    params = {"metric": metric_name}
+    return f"{base_url}/metric/summary?{urlencode(params)}"
+
+
+def generate_datadog_spans_url(
+    dd_config: DatadogTracesConfig,
+    query: str,
+    from_time_ms: int,
+    to_time_ms: int,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+
+    url_params = {
+        "query": query,
+        "from_ts": from_time_ms,
+        "to_ts": to_time_ms,
+        "live": "true",
+    }
+
+    return f"{base_url}/apm/traces?{urlencode(url_params)}"
+
+
+def generate_datadog_spans_analytics_url(
+    dd_config: DatadogTracesConfig,
+    query: str,
+    from_time_ms: int,
+    to_time_ms: int,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+
+    url_params = {
+        "query": query,
+        "from_ts": from_time_ms,
+        "to_ts": to_time_ms,
+        "live": "true",
+    }
+
+    return f"{base_url}/apm/analytics?{urlencode(url_params)}"
+
+
+def generate_datadog_logs_url(
+    dd_config: DatadogLogsConfig,
+    params: dict,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+    url_params = {
+        "query": params["filter"]["query"],
+        "from_ts": params["filter"]["from"],
+        "to_ts": params["filter"]["to"],
+        "live": "true",
+        "storage": params["filter"]["storage_tier"],
+    }
+
+    if dd_config.indexes != ["*"]:
+        url_params["index"] = ",".join(dd_config.indexes)
+
+    # Construct the full URL
+    return f"{base_url}/logs?{urlencode(url_params)}"
+
+
+def _build_qs(
+    query_params: Optional[Dict[str, Any]], allowed: Optional[set] = None
+) -> str:
+    if not query_params:
+        return ""
+    allowed = allowed or {
+        "filter",
+        "query",
+        "tags",
+        "status",
+        "start",
+        "end",
+        "from",
+        "to",
+    }
+    url_params = {}
+    for k, v in query_params.items():
+        if k not in allowed or v is None:
+            continue
+        if k in ("start", "from"):
+            url_params["from_ts"] = v * 1000
+        elif k in ("end", "to"):
+            url_params["to_ts"] = v * 1000
+        elif k in ("query", "filter", "tags"):
+            url_params["q"] = v
+        else:
+            url_params[k] = v
+    qs = urlencode(url_params) if url_params else ""
+    return f"?{qs}" if qs else ""
+
+
+def generate_datadog_general_url(
+    dd_config: DatadogGeneralConfig,
+    endpoint: str,
+    query_params: Optional[Dict[str, Any]] = None,
+) -> Optional[str]:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+    path = urlparse(endpoint).path
+
+    if "/logs" in path:
+        return f"{base_url}/logs{_build_qs(query_params, {'start', 'end'})}"
+
+    if "/monitor" in path:
+        qs = _build_qs(query_params, {"filter", "query", "tags", "status"})
+        monitor_id_match = re.search(r"/monitor/(\d+)", path)
+        if monitor_id_match:
+            return f"{base_url}/monitors/{monitor_id_match.group(1)}{qs}"
+        return f"{base_url}/monitors{qs}"
+
+    if "/dashboard" in path:
+        qs = _build_qs(query_params, {"filter", "query", "tags"})
+        if re.match(r"^/api/v\d+/dashboard/[^/]+", path):
+            return f"{base_url}/dashboard/{path.split('/')[-1]}{qs}"
+        return f"{base_url}/dashboard{qs}"
+
+    if "/slo" in path:
+        qs = _build_qs(query_params, {"filter", "query", "tags"})
+        if re.match(r"^/api/v\d+/slo/[^/]+", path):
+            return f"{base_url}/slo/{path.split('/')[-1]}{qs}"
+        return f"{base_url}/slo{qs}"
+
+    if "/events" in path:
+        return f"{base_url}/events{_build_qs(query_params, {'start', 'end'})}"
+
+    if "/incidents" in path:
+        qs = _build_qs(query_params, {"filter", "query", "status"})
+        if re.match(r"^/api/v\d+/incidents/[^/]+", path):
+            return f"{base_url}/incidents/{path.split('/')[-1]}{qs}"
+        return f"{base_url}/incidents{qs}"
+
+    if "/synthetics" in path:
+        qs = _build_qs(query_params, {"filter", "query", "tags", "status"})
+        if re.match(r"^/api/v\d+/synthetics/tests/[^/]+", path):
+            return f"{base_url}/synthetics/tests/{path.split('/')[-1]}{qs}"
+        return f"{base_url}/synthetics/tests{qs}"
+
+    if "/hosts" in path:
+        return f"{base_url}/infrastructure{_build_qs(query_params, {'filter', 'query', 'tags'})}"
+
+    if "/services" in path:
+        return f"{base_url}/apm/services{_build_qs(query_params, {'filter', 'query', 'tags'})}"
+
+    if "/metrics" in path or "/query" in path:
+        return f"{base_url}/metrics/explorer{_build_qs(query_params, {'from', 'to', 'query'})}"
+
+    return f"{base_url}/apm/home"
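
These helpers map API-side queries to deep links in the Datadog app UI. A small usage sketch for the metrics-explorer helper — the base-config field names are assumed, and `convert_api_url_to_app_url` is presumed to map, e.g., api.datadoghq.com to app.datadoghq.com:

```python
import time

from holmes.plugins.toolsets.datadog.datadog_models import DatadogMetricsConfig
from holmes.plugins.toolsets.datadog.datadog_url_utils import (
    generate_datadog_metrics_explorer_url,
)

# Placeholder credentials; the inherited field names are assumptions.
config = DatadogMetricsConfig(
    site_api_url="https://api.datadoghq.com",
    dd_api_key="<api-key>",
    dd_app_key="<app-key>",
)

now = int(time.time())
url = generate_datadog_metrics_explorer_url(
    dd_config=config,
    query="avg:system.cpu.user{*}",
    from_time=now - 3600,  # the helper converts seconds to ms for from_ts/to_ts
    to_time=now,
)
print(url)  # .../metric/explorer?query=...&from_ts=...&to_ts=...&live=true
```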
holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2
@@ -3,49 +3,186 @@
 Tools to search and analyze distributed traces from Datadog APM.
 
 ### Available Tools:
-- **fetch_datadog_traces** - List traces with filters (service, operation, duration)
-- **fetch_datadog_trace_by_id** - Get detailed span hierarchy for a specific trace
 - **fetch_datadog_spans** - Search spans with Datadog query syntax
+- **aggregate_datadog_spans** - Aggregate span data into buckets and compute metrics
 
 ### Common Usage:
 
 ```python
-# Find slow traces (>5s) for a service
-fetch_datadog_traces(service="backend-service", min_duration="5s")
+# Search for errors using Datadog query syntax
+fetch_datadog_spans(query="@http.status_code:500", limit=5)
+fetch_datadog_spans(query="service:api status:error", limit=10)
+```
 
-# Get trace details showing full span hierarchy
-fetch_datadog_trace_by_id(trace_id="6878d11e0000000064837efe7e97f5f8")
+### Query Patterns:
 
-# Search for errors using Datadog query syntax
-fetch_datadog_spans(query="@http.status_code:500")
-fetch_datadog_spans(service="api", query="status:error")
+```python
+# Specific HTTP endpoint (any method)
+fetch_datadog_spans(query="@http.route:/api/orders", limit=5)
+
+# HTTP routes containing substring (wildcard search)
+fetch_datadog_spans(query="@http.route:*payment*", limit=5)
+
+# Broad search across all span types
+fetch_datadog_spans(query="resource_name:*user*", limit=10)
+
+# Errors by service with wildcard
+fetch_datadog_spans(query="service:payment @http.status_code:5*", limit=5)
+
+# Database queries with time range (last hour)
+fetch_datadog_spans(
+    query="service:postgres @duration:>1000000000",
+    start_datetime="-3600",  # 1 hour in seconds
+    limit=10
+)
+
+# Production errors
+fetch_datadog_spans(query="env:production error:true", limit=5)
 
-# Time ranges (default: last hour)
-fetch_datadog_traces(
-    service="api",
-    start_datetime="-3600",  # 1 hour ago
-    end_datetime="0"  # now
+# Specific endpoint pattern with custom time range
+fetch_datadog_spans(
+    query='@http.route:*/user/* @http.status_code:>=400',
+    start_datetime="-1800",  # 30 minutes in seconds
+    limit=10
+)
+
+# Combining multiple conditions with wildcards
+fetch_datadog_spans(
+    query='service:*api* @http.route:*/user/* @http.status_code:[400 TO 599]',
+    limit=10
 )
 ```
 
-### Query Examples:
+### Aggregate Examples:
 
 ```python
-# Performance issues
-fetch_datadog_traces(min_duration="2s", operation="GET /api/products")
+# Count spans grouped by status code (last 15 minutes)
+aggregate_datadog_spans(
+    query='resource_name:*api* @http.method:POST',
+    compute=[{"aggregation": "count", "type": "total"}],
+    group_by=[{"facet": "@http.status_code", "limit": 50}],
+    start_datetime="-900"  # 15 minutes in seconds
+)
+
+# Get average duration by service (last hour)
+aggregate_datadog_spans(
+    query='service:*backend* OR service:*api*',
+    compute=[{"aggregation": "avg", "metric": "@duration", "type": "total"}],
+    group_by=[{"facet": "service", "limit": 50}],
+    start_datetime="-3600"  # 1 hour in seconds
+)
+
+# Get P95 latency timeseries by service
+aggregate_datadog_spans(
+    query='@http.route:*/api/* @http.status_code:[200 TO 299]',
+    compute=[{
+        "aggregation": "pc95",
+        "metric": "@duration",
+        "type": "timeseries",
+        "interval": "5m"
+    }],
+    group_by=[{"facet": "service", "limit": 50}]
+)
+
+# Complex aggregation with histogram
+aggregate_datadog_spans(
+    query='resource_name:*product* OR resource_name:*catalog*',
+    compute=[
+        {"aggregation": "avg", "metric": "@duration", "type": "total"},
+        {"aggregation": "count", "type": "total"}
+    ],
+    group_by=[{
+        "facet": "@duration",
+        "histogram": {"interval": 100, "min": 0, "max": 1000},
+        "limit": 50
+    }]
+)
+
+# Error rate calculation by endpoint
+aggregate_datadog_spans(
+    query='@http.route:* @http.status_code:[400 TO 599]',
+    compute=[{"aggregation": "count", "type": "total"}],
+    group_by=[
+        {"facet": "resource_name", "limit": 50},
+        {"facet": "@http.status_code", "limit": 50}
+    ]
+)
+```
+
+### Query Pattern Tips:
+
+| Your Goal | Use This Pattern |
+|-----------|------------------|
+| Specific HTTP endpoint, any method | `@http.route:/api/users` |
+| HTTP routes containing substring | `@http.route:*payment*` |
+| Broad search across all span types | `resource_name:*user*` |
+| Service name patterns | `service:*api*` or `service:payment-*` |
+| Multiple wildcards | `@http.route:*/user/*/profile` |
+| Error status codes | `@http.status_code:5*` or `@http.status_code:[400 TO 599]` |
+
+### General Tips:
+- Wildcards (*) can be used in most fields for flexible pattern matching
+- For aggregations: use @-prefixed attributes (e.g., @duration, @http.status_code)
+- Keep fetch_datadog_spans limit low (5-10) to avoid too much data
+- aggregate_datadog_spans can handle higher limits (50+) for group_by facets
+
+### CRITICAL: Cursor Usage Rules
+**NEVER parallelize cursor-based calls or reuse cursor values!**
+
+Cursors are stateful pointers - each one is single-use and represents a unique position in the data stream.
+
+**WRONG (causes duplicate data):**
+```
+Batch 1 → cursor_A
+Then call Batch 2, 3, 4 ALL with cursor_A in parallel ❌
+Result: Duplicate data, incomplete results
+```
 
-# Errors by service
-fetch_datadog_spans(service="payment", query="@http.status_code:5*")
+**CORRECT (sequential pagination):**
+```
+Batch 1 → cursor_A
+Wait for response → use cursor_A for Batch 2 → cursor_B
+Wait for response → use cursor_B for Batch 3 → cursor_C
+Result: Complete unique data ✅
+```
+
+**Key Rules:**
+- Each response provides a NEW cursor for the NEXT request
+- NEVER reuse the same cursor value multiple times
+- NEVER make parallel calls with the same cursor
+- Always wait for response before using the returned cursor
+
+### Compact Mode Strategy:
+
+The `compact` parameter reduces output size by returning only essential fields. Use this strategy:
 
-# Database queries
-fetch_datadog_spans(query="service:postgres @duration:>1000000000")
+1. **Initial exploration**: Use compact=true with higher limits (50-100) to get an overview
+2. **Detailed investigation**: Use compact=false with lower limits (5-10) for specific spans
 
-# With tags
-fetch_datadog_spans(tags={"env": "production"}, query="error:true")
+```python
+# STEP 1: Initial search with compact mode to find patterns
+fetch_datadog_spans(
+    query="service:api @http.status_code:5*",
+    compact=true,
+    limit=100  # Higher limit safe with compact mode
+)
+
+# STEP 2: Detailed investigation of specific issues
+fetch_datadog_spans(
+    query="service:api @http.status_code:500 resource_name:*/user/*",
+    compact=false,  # Full details for deep analysis
+    limit=10
+)
 ```
 
-### Tips:
-- Duration units: ms, s, m (e.g., "500ms", "5s", "1m")
-- Time: RFC3339 format or negative seconds from now
-- Rate limit: 300 requests/hour
-- Default time range: 1 hour
+**When to use compact=true:**
+- Initial searches to identify patterns
+- When you need to scan many spans for errors or performance issues
+- When looking for specific span IDs or trace IDs
+- When the full span details aren't needed yet
+
+**When to use compact=false (default):**
+- Investigating specific errors
+- Analyzing request/response headers
+- Examining user agent details
+- Debugging authentication issues or HTTP details