holmesgpt 0.14.4a0__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of holmesgpt might be problematic.
- holmes/__init__.py +1 -1
- holmes/clients/robusta_client.py +12 -10
- holmes/common/env_vars.py +22 -0
- holmes/config.py +51 -4
- holmes/core/conversations.py +3 -2
- holmes/core/llm.py +226 -72
- holmes/core/openai_formatting.py +13 -0
- holmes/core/supabase_dal.py +33 -42
- holmes/core/tool_calling_llm.py +185 -282
- holmes/core/tools.py +21 -1
- holmes/core/tools_utils/token_counting.py +2 -1
- holmes/core/tools_utils/tool_context_window_limiter.py +32 -30
- holmes/core/truncation/compaction.py +59 -0
- holmes/core/truncation/input_context_window_limiter.py +218 -0
- holmes/interactive.py +17 -7
- holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
- holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
- holmes/plugins/toolsets/__init__.py +4 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +0 -1
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +0 -1
- holmes/plugins/toolsets/grafana/grafana_api.py +1 -1
- holmes/plugins/toolsets/investigator/core_investigation.py +34 -24
- holmes/plugins/toolsets/opensearch/opensearch_ppl_query_docs.jinja2 +1616 -0
- holmes/plugins/toolsets/opensearch/opensearch_query_assist.py +78 -0
- holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 +223 -0
- holmes/plugins/toolsets/prometheus/prometheus.py +1 -1
- holmes/plugins/toolsets/robusta/robusta.py +35 -8
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +4 -3
- holmes/plugins/toolsets/service_discovery.py +1 -1
- holmes/plugins/toolsets/servicenow/servicenow.py +0 -1
- holmes/utils/stream.py +31 -1
- {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/METADATA +6 -2
- {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/RECORD +36 -31
- holmes/core/performance_timing.py +0 -72
- {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/LICENSE.txt +0 -0
- {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/WHEEL +0 -0
- {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/opensearch/opensearch_query_assist.py ADDED

@@ -0,0 +1,78 @@
+import logging
+import os
+from typing import Any, Dict
+
+from holmes.core.tools import (
+    StructuredToolResult,
+    StructuredToolResultStatus,
+    Tool,
+    ToolParameter,
+    Toolset,
+    ToolsetTag,
+    ToolInvokeContext,
+    ToolsetEnvironmentPrerequisite,
+)
+
+
+class PplQueryAssistTool(Tool):
+    def __init__(self, toolset: "OpenSearchQueryAssistToolset"):
+        super().__init__(
+            name="opensearch_ppl_query_assist",
+            description="Generate valid OpenSearch Piped Processing Language (PPL) queries to suggest to users for execution",
+            parameters={
+                "query": ToolParameter(
+                    description="Valid OpenSearch Piped Processing Language (PPL) query to suggest to users for execution",
+                    type="string",
+                    required=True,
+                ),
+            },
+        )
+        self._toolset = toolset
+
+    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
+        try:
+            query = params.get("query", "")
+            response_data = {"query": query}
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.SUCCESS,
+                data=response_data,
+                params=params,
+            )
+
+        except Exception as e:
+            logging.exception(f"error using {self.name} tool")
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error=f"Failed to generate PPL query: {str(e)}",
+                params=params,
+            )
+
+    def get_parameterized_one_liner(self, params: Dict) -> str:
+        query = params.get("query", "")
+        return f"OpenSearchQueryToolset: Query ({query})"
+
+
+class OpenSearchQueryAssistToolset(Toolset):
+    """OpenSearch query assist with PPL queries"""
+
+    def __init__(self):
+        super().__init__(
+            name="opensearch/query_assist",
+            description="OpenSearch query assist with PPL queries.",
+            experimental=True,
+            icon_url="https://opensearch.org/assets/brand/PNG/Mark/opensearch_mark_default.png",
+            tools=[PplQueryAssistTool(self)],
+            tags=[ToolsetTag.CORE],
+            prerequisites=[ToolsetEnvironmentPrerequisite(env=["OPENSEARCH_URL"])],
+        )
+
+    def get_example_config(self) -> Dict[str, Any]:
+        return {"opensearch_url": "http://localhost:9200"}
+
+    def _reload_instructions(self):
+        template_file_path = os.path.abspath(
+            os.path.join(
+                os.path.dirname(__file__), "opensearch_query_assist_instructions.jinja2"
+            )
+        )
+        self._load_llm_instructions(jinja_template=f"file://{template_file_path}")
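Note that `_invoke` above never contacts OpenSearch: it simply echoes the model-suggested query back as a structured result, so the client can offer it to the user for execution. A minimal hypothetical sketch of that behavior in plain Python, independent of the package's Tool base classes:

```python
# Hypothetical simplification (not package code) of what
# opensearch_ppl_query_assist does: echo the suggested PPL query back
# as structured data instead of executing it against OpenSearch.
def ppl_query_assist(params: dict) -> dict:
    query = params.get("query", "")
    return {"status": "SUCCESS", "data": {"query": query}, "params": params}

result = ppl_query_assist(
    {"query": "source=logs-otel-v1* | stats count() by severityText"}
)
print(result["data"]["query"])
```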
holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 ADDED

@@ -0,0 +1,223 @@
+# Query Generation
+You have access to the opensearch_ppl_query_assist tool to help you generate valid, accurate OpenSearch Piped Processing Language (PPL) queries.
+DO NOT PROVIDE INVALID QUERIES. ALWAYS CHECK YOUR QUERY AGAINST VALID QUERIES FIRST.
+
+Once a valid query is generated, you MUST provide a concise but informative breakdown of each part of the query structure.
+
+## CRITICAL: Query Intent Detection
+
+ALWAYS check if the user's question is about:
+
+* Log Analysis: Errors, warnings, messages, patterns, tool usage
+* Metrics Analysis: Performance, latency, throughput, resource usage
+* Time-based Analysis: "Last X hours/days", "recent", "today", "since"
+* Aggregation Requests: Count, sum, average, top, frequency
+* Troubleshooting: Issues, problems, failures, debugging
+
+If ANY of the above apply → Generate PPL query IMMEDIATELY and use the OpenSearch Dashboards Page State
+
+### Example GOOD response:
+I've retrieved your current query from the query bar `source=logs-otel-v1* | STAT count() BY severityText` and it
+appears there is a typo in "STAT"; it should be "STATS". Below is the fixed query:
+```
+source=logs-otel-v1* | STATS count() BY severityText
+```
+
+
+## CRITICAL: OpenSearch Dashboards Page State
+The user may be using this agent from OpenSearch Dashboards (OSD), which provides the current page state.
+It may be included in the conversation history as a system message.
+
+IMPORTANT: YOU CAN USE THE CURRENT USER QUERY TO HELP ENHANCE/MODIFY/FIX/SUGGEST A VALID QUERY USING THE SAME INDEX PATTERN
+REFER TO "Core PPL Commands" FOR SYNTAX
+
+```
+## OpenSearch PPL Query Language
+
+### PPL (Piped Processing Language) Overview
+PPL is OpenSearch's query language for analyzing logs, metrics, and traces. It uses a pipe-based syntax similar to Unix commands, processing data through sequential transformations.
+
+### Core PPL Commands
+
+**Data Source & Search:**
+- `source=<index>` or `search source=<index>` - Specify data source
+- `source=<cluster>:<index>` - Cross-cluster search
+- `| where <condition>` - Filter results
+- `| fields <field-list>` - Project specific fields
+- `| fields - <field-list>` - Exclude specific fields
+
+**Data Transformation:**
+- `| stats <aggregation> by <field>` - Aggregate data (count(), sum(), avg(), min(), max())
+- `| eval <field>=<expression>` - Create calculated fields
+- `| sort [+|-] <field>` - Sort results (+ ascending, - descending)
+- `| head <n>` - Return first n results
+- `| tail <n>` - Return last n results
+- `| dedup <field-list>` - Remove duplicates
+
+**Advanced Analysis:**
+- `| top [N] <field>` - Find most common values
+- `| rare [N] <field>` - Find least common values
+- `| parse <field> <regex>` - Extract fields using regex patterns
+- `| grok <field> <pattern>` - Parse using grok patterns
+- `| patterns <field> [SIMPLE_PATTERN|BRAIN]` - Extract log patterns
+
+**Time Series:**
+- `| trendline SMA(<period>, <field>)` - Calculate moving averages
+- `| fillnull with <value> in <fields>` - Replace null values
+
+**Joins & Lookups:**
+- `| join <table>` - Join with another dataset
+- `| lookup <table> <field>` - Enrich with lookup data (requires Calcite)
+
+**Pattern Extraction:**
+- `| patterns message BRAIN` - Semantic log pattern extraction
+- `| patterns new_field='extracted' pattern='[0-9]' message` - Custom regex patterns
+
+### PPL Query Examples for Observability
+
+**Error Analysis:**
+```ppl
+source=ai-agent-logs-*
+| where level="ERROR"
+| stats count() by message
+| sort - count
+```
+
+**Service Latency Analysis:**
+```ppl
+source=traces
+| where service="checkout"
+| stats avg(duration) as avg_latency, max(duration) as max_latency by endpoint
+| where avg_latency > 100
+```
+
+**Log Pattern Detection:**
+```ppl
+source=ai-agent-audit-logs-*
+| patterns message BRAIN
+| stats count() by patterns_field
+| top 10 patterns_field
+```
+
+**Time-based Aggregation:**
+```ppl
+source=metrics
+| eval hour=date_format(timestamp, 'HH')
+| stats avg(cpu_usage) by hour, host
+| sort hour
+```
+
+**Multi-field Correlation:**
+```ppl
+source=ai-agent-logs-*
+| parse message '.*thread_id=(?<tid>[^,]+).*run_id=(?<rid>[^,]+)'
+| stats count() by tid, rid, level
+| where count > 100
+```
+
+**Advanced PPL Query Patterns:**
+
+**Top N Analysis with Filtering:**
+```ppl
+source=ai-agent-logs-*
+| where timestamp >= now() - 1h
+| top 20 message by level
+| where level in ["ERROR", "WARN"]
+```
+
+**Deduplication and Unique Values:**
+```ppl
+source=ai-agent-audit-logs-*
+| dedup thread_id
+| fields thread_id, run_id, timestamp
+| sort - timestamp
+```
+
+**Fillnull for Missing Data Handling:**
+```ppl
+source=ai-agent-metrics-*
+| fillnull with 0 in cpu_usage, memory_usage
+| stats avg(cpu_usage) as avg_cpu, avg(memory_usage) as avg_mem by host
+```
+
+**Rare Events Detection:**
+```ppl
+source=ai-agent-logs-*
+| rare 10 error_code
+| where count < 5
+```
+
+**Field Extraction with Grok:**
+```ppl
+source=ai-agent-logs-*
+| grok message '%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:msg}'
+| stats count() by level
+```
+
+**Time Span Aggregations:**
+```ppl
+source=ai-agent-metrics-*
+| stats count() by span(timestamp, 5m) as time_bucket, status
+| where status != 200
+```
+
+**Eval with Conditional Logic:**
+```ppl
+source=ai-agent-logs-*
+| eval severity = case(
+    level = "ERROR", 1,
+    level = "WARN", 2,
+    level = "INFO", 3,
+    else = 4
+  )
+| stats count() by severity
+```
+
+**Join Operations (with Calcite enabled):**
+```ppl
+source=ai-agent-logs-*
+| join left=l right=r on l.thread_id = r.thread_id
+    [ source=ai-agent-audit-logs-* ]
+| fields l.timestamp, l.message, r.tool_name
+```
+
+**Subquery for Complex Filtering:**
+```ppl
+source=ai-agent-logs-*
+| where thread_id in [
+    source=ai-agent-audit-logs-*
+    | where tool_name = "opensearch__search"
+    | fields thread_id
+  ]
+```
+
+**Trendline for Moving Averages:**
+```ppl
+source=ai-agent-metrics-*
+| trendline SMA(5, cpu_usage) as cpu_trend
+| fields timestamp, cpu_usage, cpu_trend
+```
+
+### PPL Best Practices
+
+1. **Index Patterns**: Use wildcards for daily indices: `source=ai-agent-logs-*`
+2. **Field Extraction**: Use `parse` for structured logs, `patterns` for unstructured
+3. **Performance**: Apply `where` filters early in the pipeline
+4. **Aggregations**: Use `stats` before `sort` for better performance
+5. **Null Handling**: Use `fillnull` to handle missing data in calculations
+
+### OpenSearch Index Patterns (Current Environment)
+- `ai-agent-logs-YYYY.MM.DD` - Application logs
+- `ai-agent-audit-logs-YYYY.MM.DD` - Audit logs
+- `ai-agent-metrics-YYYY.MM.DD` - Prometheus metrics
+
+## Query Response Formatting
+You MUST respond with queries in the following format. `ppl` contains the valid PPL query.
+```typescript
+query: {
+  ppl: string,
+}
+```
+
+## More PPL Queries
+{% include "opensearch_ppl_query_docs.jinja2" %}
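The template closes by including the new 1616-line opensearch_ppl_query_docs.jinja2 reference shipped alongside it, and `_reload_instructions` in the toolset loads the whole thing via a `file://` URL. A rough standalone sketch of rendering it with jinja2 (a declared dependency of the package); the loader directory is an assumption for illustration:

```python
# Standalone rendering sketch using jinja2.
# The directory path is an assumption; in the package the template is
# resolved relative to the toolset module by _reload_instructions.
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader("holmes/plugins/toolsets/opensearch"))
template = env.get_template("opensearch_query_assist_instructions.jinja2")
rendered = template.render()  # the {% include %} pulls in opensearch_ppl_query_docs.jinja2
print(rendered[:200])
```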
holmes/plugins/toolsets/prometheus/prometheus.py CHANGED

@@ -1591,7 +1591,7 @@ class PrometheusToolset(Toolset):
             )

         except Exception as e:
-            logging.
+            logging.debug("Failed to initialize Prometheus", exc_info=True)
             return (
                 False,
                 f"Failed to initialize using url={url}. Unexpected error: {str(e)}",
holmes/plugins/toolsets/robusta/robusta.py CHANGED

@@ -19,6 +19,8 @@ START_TIME = "start_datetime"
 END_TIME = "end_datetime"
 NAMESPACE = "namespace"
 WORKLOAD = "workload"
+DEFAULT_LIMIT_CHANGE_ROWS = 100
+MAX_LIMIT_CHANGE_ROWS = 200


 class FetchRobustaFinding(Tool):
@@ -27,7 +29,7 @@ class FetchRobustaFinding(Tool):
     def __init__(self, dal: Optional[SupabaseDal]):
         super().__init__(
             name="fetch_finding_by_id",
-            description="Fetches a robusta finding. Findings are events, like a Prometheus alert or a deployment update",
+            description="Fetches a robusta finding. Findings are events, like a Prometheus alert or a deployment update and configuration change.",
             parameters={
                 PARAM_FINDING_ID: ToolParameter(
                     description="The id of the finding to fetch",
@@ -75,7 +77,7 @@ class FetchRobustaFinding(Tool):
         )

     def get_parameterized_one_liner(self, params: Dict) -> str:
-        return "Robusta: Fetch
+        return f"Robusta: Fetch finding data {params}"


 class FetchResourceRecommendation(Tool):
@@ -142,13 +144,17 @@ class FetchResourceRecommendation(Tool):
         return f"Robusta: Check Historical Resource Utilization: ({str(params)})"


-class
+class FetchConfigurationChangesMetadata(Tool):
     _dal: Optional[SupabaseDal]

     def __init__(self, dal: Optional[SupabaseDal]):
         super().__init__(
-            name="
-            description=
+            name="fetch_configuration_changes_metadata",
+            description=(
+                "Fetch configuration changes metadata in a given time range. "
+                "By default, fetch all cluster changes. Can be filtered on a given namespace or a specific workload. "
+                "Use fetch_finding_by_id to get detailed change of one specific configuration change."
+            ),
             parameters={
                 START_TIME: ToolParameter(
                     description="The starting time boundary for the search period. String in RFC3339 format.",
@@ -160,15 +166,36 @@ class FetchConfigurationChanges(Tool):
                     type="string",
                     required=True,
                 ),
+                "namespace": ToolParameter(
+                    description="The Kubernetes namespace name for filtering configuration changes",
+                    type="string",
+                    required=False,
+                ),
+                "workload": ToolParameter(
+                    description="The kubernetes workload name for filtering configuration changes. Deployment name or Pod name for example.",
+                    type="string",
+                    required=False,
+                ),
+                "limit": ToolParameter(
+                    description=f"Maximum number of rows to return. Default is {DEFAULT_LIMIT_CHANGE_ROWS} and the maximum is 200",
+                    type="integer",
+                    required=False,
+                ),
             },
         )
         self._dal = dal

     def _fetch_change_history(self, params: Dict) -> Optional[List[Dict]]:
         if self._dal and self._dal.enabled:
-            return self._dal.
+            return self._dal.get_configuration_changes_metadata(
                 start_datetime=params["start_datetime"],
                 end_datetime=params["end_datetime"],
+                limit=min(
+                    params.get("limit") or DEFAULT_LIMIT_CHANGE_ROWS,
+                    MAX_LIMIT_CHANGE_ROWS,
+                ),
+                ns=params.get("namespace"),
+                workload=params.get("workload"),
             )
         return None
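The `limit` argument above is clamped with `min(params.get("limit") or DEFAULT_LIMIT_CHANGE_ROWS, MAX_LIMIT_CHANGE_ROWS)`. A standalone sketch of that behavior (constants copied from the diff; `clamp_limit` is an invented helper name for illustration):

```python
# Hypothetical standalone sketch of the row-limit clamping above.
DEFAULT_LIMIT_CHANGE_ROWS = 100
MAX_LIMIT_CHANGE_ROWS = 200

def clamp_limit(requested: int | None) -> int:
    # `requested or DEFAULT_...` treats both None and 0 as "use the default"
    return min(requested or DEFAULT_LIMIT_CHANGE_ROWS, MAX_LIMIT_CHANGE_ROWS)

assert clamp_limit(None) == 100  # no limit supplied -> default
assert clamp_limit(50) == 50     # within bounds -> unchanged
assert clamp_limit(500) == 200   # above the cap -> clamped to the maximum
```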
@@ -197,7 +224,7 @@ class FetchConfigurationChanges(Tool):
         )

     def get_parameterized_one_liner(self, params: Dict) -> str:
-        return "Robusta: Search Change History"
+        return f"Robusta: Search Change History {params}"


 class RobustaToolset(Toolset):
@@ -219,7 +246,7 @@ class RobustaToolset(Toolset):
             prerequisites=[dal_prereq],
             tools=[
                 FetchRobustaFinding(dal),
-
+                FetchConfigurationChangesMetadata(dal),
                 FetchResourceRecommendation(dal),
             ],
             tags=[
holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 CHANGED

@@ -1,7 +1,8 @@
 # Configuration and alerts history
-* Use
-*
-*
+* Use fetch_configuration_changes_metadata to get historical configuration changes in a cluster or for a specific workload.
+* If a change seems important to the investigation, use fetch_finding_by_id with the configuration change ID to get full details of the change.
+* You must ALWAYS call fetch_configuration_changes_metadata when investigating an alert
+* Never respond without calling fetch_configuration_changes_metadata
 * When investigating an alert, look at historical configuration changes that happen 4 hours before the alert started
 * If you found a change that caused the alert, you MUST write: 'The issue was introduced by ...' with a short description of the change, and the date of it.
 For example:
holmes/plugins/toolsets/service_discovery.py CHANGED

@@ -36,7 +36,7 @@ def find_service_url(label_selector):
             port = svc.spec.ports[0].port
             url = f"http://{name}.{namespace}.svc.{CLUSTER_DOMAIN}:{port}"
             logging.info(
-                f"
+                f"Discovered service with label-selector: `{label_selector}` at url: `{url}`"
             )
             return url
         except Exception:
holmes/plugins/toolsets/servicenow/servicenow.py CHANGED

@@ -37,7 +37,6 @@ class ServiceNowToolset(Toolset):
     def __init__(self):
         super().__init__(
             prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
-            experimental=True,
             tools=[
                 ReturnChangesInTimerange(toolset=self),
                 ReturnChange(toolset=self),
holmes/utils/stream.py CHANGED

@@ -1,11 +1,15 @@
 import json
 from enum import Enum
-from typing import Generator, Optional, List
+from typing import Generator, Optional, List, Union
 import litellm
 from pydantic import BaseModel, Field
 from holmes.core.investigation_structured_output import process_response_into_sections
 from functools import partial
 import logging
+from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
+from litellm.types.utils import ModelResponse, TextCompletionResponse
+
+from holmes.core.llm import TokenCountMetadata, get_llm_usage


 class StreamEvents(str, Enum):
@@ -15,6 +19,8 @@ class StreamEvents(str, Enum):
     ERROR = "error"
     AI_MESSAGE = "ai_message"
     APPROVAL_REQUIRED = "approval_required"
+    TOKEN_COUNT = "token_count"
+    CONVERSATION_HISTORY_COMPACTED = "conversation_history_compacted"


 class StreamMessage(BaseModel):
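The two new enum members give streaming clients visibility into token accounting and conversation-history compaction. A hypothetical consumer-side sketch (event names come from the diff; the handler and payload shape are illustrative, not part of the package):

```python
# Hypothetical SSE consumer branching on the two new stream events.
def handle_stream_event(event: str, data: dict) -> None:
    if event == "token_count":
        # emitted with usage and limit metadata after an LLM call
        print("token usage:", data.get("metadata", {}))
    elif event == "conversation_history_compacted":
        # emitted when older turns were summarized to fit the context window
        print("conversation history was compacted")

handle_stream_event("token_count", {"metadata": {"max_tokens": 128000}})
```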
@@ -112,3 +118,27 @@ def stream_chat_formatter(
             yield create_rate_limit_error_message(str(e))
         else:
             yield create_sse_error_message(description=str(e), error_code=1, msg=str(e))
+
+
+def add_token_count_to_metadata(
+    tokens: TokenCountMetadata,
+    metadata: dict,
+    max_context_size: int,
+    maximum_output_token: int,
+    full_llm_response: Union[
+        ModelResponse, CustomStreamWrapper, TextCompletionResponse
+    ],
+):
+    metadata["usage"] = get_llm_usage(full_llm_response)
+    metadata["tokens"] = tokens.model_dump()
+    metadata["max_tokens"] = max_context_size
+    metadata["max_output_tokens"] = maximum_output_token
+
+
+def build_stream_event_token_count(metadata: dict) -> StreamMessage:
+    return StreamMessage(
+        event=StreamEvents.TOKEN_COUNT,
+        data={
+            "metadata": metadata,
+        },
+    )
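Taken together, `add_token_count_to_metadata` populates the dict and `build_stream_event_token_count` wraps it in a `token_count` event. An illustrative sketch of the resulting payload shape (keys come from the diff; the values and the `TokenCountMetadata` contents are invented, since its fields are not shown here):

```python
# Illustrative shape of the token_count event payload; values are made up.
example_event = {
    "event": "token_count",
    "data": {
        "metadata": {
            "usage": {"prompt_tokens": 1200, "completion_tokens": 300},  # from get_llm_usage(...)
            "tokens": {},          # TokenCountMetadata.model_dump(); fields not shown in the diff
            "max_tokens": 128000,  # max_context_size
            "max_output_tokens": 4096,
        }
    },
}
print(example_event["data"]["metadata"]["max_tokens"])
```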
{holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: holmesgpt
-Version: 0.
+Version: 0.16.0
 Summary:
 Author: Natan Yellin
 Author-email: natan@robusta.dev
@@ -8,6 +8,7 @@ Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Requires-Dist: ag-ui-protocol (>=0.1.9,<0.2.0)
 Requires-Dist: azure-core (>=1.34.0,<2.0.0)
 Requires-Dist: azure-identity (>=1.23.0,<2.0.0)
 Requires-Dist: azure-mgmt-alertsmanagement (>=1.0.0,<2.0.0)
@@ -23,6 +24,7 @@ Requires-Dist: certifi (>=2024.7.4,<2025.0.0)
 Requires-Dist: colorlog (>=6.8.2,<7.0.0)
 Requires-Dist: confluent-kafka (>=2.6.1,<3.0.0)
 Requires-Dist: fastapi (>=0.116,<0.117)
+Requires-Dist: google-cloud-aiplatform (>=1.38)
 Requires-Dist: httpx[socks] (<0.28)
 Requires-Dist: humanize (>=4.9.0,<5.0.0)
 Requires-Dist: jinja2 (>=3.1.2,<4.0.0)
@@ -56,7 +58,9 @@ Description-Content-Type: text/markdown

 HolmesGPT is an AI agent for investigating problems in your cloud, finding the root cause, and suggesting remediations. It has dozens of built-in integrations for cloud providers, observability tools, and on-call systems.

-HolmesGPT
+>🎉 **HolmesGPT is now a CNCF Sandbox Project!** We're thrilled to be part of the Cloud Native Computing Foundation. [Learn more about our journey](https://github.com/cncf/sandbox/issues/392#issuecomment-3380007501).
+
+Find more about HolmesGPT's maintainers and adopters [here](./ADOPTERS.md).

 <p align="center">
   <a href="#how-it-works"><strong>How it Works</strong></a> |