@invokehq/cli 0.2.8 → 0.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/invoke/onyx/__init__.py +26 -2
- package/invoke/onyx/analyzer.py +502 -41
- package/package.json +1 -1
- package/pyproject.toml +1 -0
package/invoke/onyx/__init__.py
CHANGED
|
@@ -1,5 +1,29 @@
|
|
|
1
1
|
"""Onyx Lite trace analyzer."""
|
|
2
2
|
|
|
3
|
-
from .analyzer import
|
|
3
|
+
from .analyzer import (
|
|
4
|
+
ONYX_FAILURE_ANALYSIS_PROMPT,
|
|
5
|
+
OnyxMode,
|
|
6
|
+
OnyxRoute,
|
|
7
|
+
OnyxSuggestion,
|
|
8
|
+
analyze_traces,
|
|
9
|
+
best_improvement,
|
|
10
|
+
deterministic_analyze_traces,
|
|
11
|
+
intelligence_analyze_failures,
|
|
12
|
+
onyx_analyze_failures,
|
|
13
|
+
should_use_intelligence,
|
|
14
|
+
summarize_failures,
|
|
15
|
+
)
|
|
4
16
|
|
|
5
|
-
__all__ = [
|
|
17
|
+
__all__ = [
|
|
18
|
+
"ONYX_FAILURE_ANALYSIS_PROMPT",
|
|
19
|
+
"OnyxMode",
|
|
20
|
+
"OnyxRoute",
|
|
21
|
+
"OnyxSuggestion",
|
|
22
|
+
"analyze_traces",
|
|
23
|
+
"best_improvement",
|
|
24
|
+
"deterministic_analyze_traces",
|
|
25
|
+
"intelligence_analyze_failures",
|
|
26
|
+
"onyx_analyze_failures",
|
|
27
|
+
"should_use_intelligence",
|
|
28
|
+
"summarize_failures",
|
|
29
|
+
]
|
package/invoke/onyx/analyzer.py
CHANGED
|
@@ -1,10 +1,53 @@
|
|
|
1
|
-
"""Onyx Lite: trace analysis and concrete repair suggestions."""
|
|
1
|
+
"""Onyx Lite: production trace analysis and concrete repair suggestions."""
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
5
7
|
from collections import Counter
|
|
6
8
|
from dataclasses import dataclass, field
|
|
7
|
-
from typing import Any
|
|
9
|
+
from typing import Any, Literal
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
OnyxMode = Literal["deterministic", "intelligence", "auto"]
|
|
13
|
+
DEFAULT_ANTHROPIC_MODEL = "claude-sonnet-4-6"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
ONYX_FAILURE_ANALYSIS_PROMPT = """
|
|
17
|
+
You are Onyx, a senior AI systems engineer and agent supervisor.
|
|
18
|
+
You are extremely good at debugging and improving Claude Agent SDK agents running in production.
|
|
19
|
+
|
|
20
|
+
You have access to the last {num_runs} runs of the agent, including:
|
|
21
|
+
- Full traces with reasoning steps, tool calls, inputs, outputs, errors
|
|
22
|
+
- Timestamps, duration, token usage
|
|
23
|
+
- Environment context (tools available, policies, schemas)
|
|
24
|
+
|
|
25
|
+
Recent failures detected:
|
|
26
|
+
{failure_summary}
|
|
27
|
+
|
|
28
|
+
=== TASK ===
|
|
29
|
+
Analyze these failures and provide the single most valuable improvement the agent can make right now.
|
|
30
|
+
|
|
31
|
+
Focus especially on:
|
|
32
|
+
- Repeated patterns, for example the same error 16 times
|
|
33
|
+
- Prompt issues, missing instructions, bad formatting
|
|
34
|
+
- Tool usage problems, wrong parameters, missing validation
|
|
35
|
+
- State management or context loss
|
|
36
|
+
- Schema violations
|
|
37
|
+
- Hallucinations or bad assumptions
|
|
38
|
+
|
|
39
|
+
=== RESPONSE FORMAT strict JSON ===
|
|
40
|
+
{{
|
|
41
|
+
"observation": "Short, clear summary of the root cause. Mention how many times it happened.",
|
|
42
|
+
"suggested_fix": "Very specific, actionable change. Include code or prompt snippet if possible.",
|
|
43
|
+
"fix_type": "prompt_improvement | tool_wrapper | retry_strategy | schema_validation | state_handling | other",
|
|
44
|
+
"expected_impact": "high/medium/low on failure rate plus brief reason",
|
|
45
|
+
"confidence": 85,
|
|
46
|
+
"one_click_patch": "Optional: exact string to patch in the system prompt or tool definition"
|
|
47
|
+
}}
|
|
48
|
+
|
|
49
|
+
Be concise, professional, and ruthless about impact. Only suggest changes that will clearly reduce failures.
|
|
50
|
+
""".strip()
|
|
8
51
|
|
|
9
52
|
|
|
10
53
|
@dataclass
|
|
@@ -13,6 +56,35 @@ class OnyxSuggestion:
|
|
|
13
56
|
severity: str
|
|
14
57
|
reason: str
|
|
15
58
|
apply: dict[str, Any] = field(default_factory=dict)
|
|
59
|
+
observation: str = ""
|
|
60
|
+
suggested_fix: str = ""
|
|
61
|
+
fix_type: str = "other"
|
|
62
|
+
expected_impact: str = "low"
|
|
63
|
+
confidence: int = 60
|
|
64
|
+
one_click_patch: str | None = None
|
|
65
|
+
evidence_count: int = 0
|
|
66
|
+
source: str = "deterministic_trace_analysis"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass
class OnyxRoute:
    """Routing decision naming the Onyx analysis mode to run and why."""

    # Analysis mode selected for the trace set.
    mode: OnyxMode
    # Human-readable explanation of the decision.
    reason: str
    # True when the chosen route needs an LLM client; defaults to the
    # purely local path.
    requires_llm: bool = False
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass
class FailureGroup:
    """Bucket of failed traces that share one classification key."""

    # Classification key shared by every trace in the bucket.
    key: str
    # Short human-readable name of the failure pattern.
    title: str
    # Fix category associated with the pattern.
    fix_type: str
    # Severity label associated with the pattern.
    severity: str
    # Traces collected for this pattern, in insertion order.
    traces: list[dict[str, Any]] = field(default_factory=list)
    # Example error messages recorded for this pattern.
    examples: list[str] = field(default_factory=list)

    @property
    def count(self) -> int:
        """Return how many traces have been collected in this bucket."""
        collected = self.traces
        return len(collected)
|
|
16
88
|
|
|
17
89
|
|
|
18
90
|
def _flatten_events(trace: dict[str, Any]) -> list[dict[str, Any]]:
|
|
@@ -22,54 +94,443 @@ def _flatten_events(trace: dict[str, Any]) -> list[dict[str, Any]]:
|
|
|
22
94
|
return []
|
|
23
95
|
|
|
24
96
|
|
|
25
|
-
def
|
|
26
|
-
|
|
97
|
+
def _error_message(trace: dict[str, Any]) -> str:
    """Return a short failure description for *trace* (at most 240 characters).

    Lookup order:

    1. ``trace["error"]`` as a dict: its ``message``, then ``type``, then
       ``code`` (``code`` is accepted because ``_classify_failure`` treats it
       as an alternative to ``type``).
    2. ``trace["error"]`` as any other truthy value: its string form.
    3. The first flattened event whose status is ``error``, ``failed`` or
       ``timeout``: the detail's ``message``/``error``, else the event's
       ``step`` (or its ``status`` when the detail is not a dict).

    Returns an empty string when nothing descriptive is found.
    """
    error = trace.get("error")
    if isinstance(error, dict):
        text = str(error.get("message") or error.get("type") or error.get("code") or "")
        if text:
            return text[:240]
        # An error dict without usable text should not hide what the event
        # log recorded, so fall through to the events below.
    elif error:
        return str(error)[:240]

    for event in _flatten_events(trace):
        if event.get("status") in {"error", "failed", "timeout"}:
            detail = event.get("detail")
            if isinstance(detail, dict):
                return str(detail.get("message") or detail.get("error") or event.get("step") or "")[:240]
            return str(event.get("step") or event.get("status") or "")[:240]
    return ""
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _classify_failure(trace: dict[str, Any]) -> tuple[str, str, str, str] | None:
    """Return ``(key, title, fix_type, severity)`` for one failed trace.

    The checks run from the most specific pattern to the generic catch-all,
    and the first match wins, so their order is significant. ``None`` is
    returned when the trace matches no failure pattern.
    """
    outcome = str(trace.get("final_outcome") or trace.get("status") or "").lower()
    action = str(trace.get("action") or trace.get("tool") or "").lower()
    raw_error = trace.get("error")
    # Only dict-shaped errors carry a type/code; other shapes still reach the
    # substring checks below through the error message.
    error = raw_error if isinstance(raw_error, dict) else {}
    error_type = str(error.get("type") or error.get("code") or "").lower()
    message = (_error_message(trace) or "").lower()

    # Flatten once and tally both step names and statuses from the same list.
    events = _flatten_events(trace)
    steps = Counter(str(event.get("step") or "").lower() for event in events)
    statuses = Counter(str(event.get("status") or "").lower() for event in events)

    if (
        "timeout" in outcome
        or "timeout" in error_type
        or "timeout" in message
        or steps.get("tool_timeout")
        # _error_message treats a "timeout" event status as a failure; the
        # classifier has to recognise it too, or such traces go unclassified.
        or statuses.get("timeout")
    ):
        return ("timeout_unknown_effect", "Unknown outcome after timeout", "retry_strategy", "high")

    if (
        "idempotent_replay" in outcome
        or "duplicate" in outcome
        or steps.get("duplicate_retry_detected")
        or "duplicate" in message
    ):
        return ("duplicate_retry", "Duplicate retry pattern", "tool_wrapper", "high")

    if "blocked_by_policy" in outcome or "policy" in outcome or action == "database.execute":
        return ("policy_block", "Policy-blocked unsafe action", "tool_wrapper", "high")

    if "schema" in outcome or "schema" in error_type or "validation" in message or "bad request" in message:
        return ("schema_validation", "Schema validation failure", "schema_validation", "medium")

    if "drift" in outcome or steps.get("state_revalidation") or steps.get("state_reconciled"):
        return ("state_drift", "State changed before resume", "state_handling", "high")

    if "auth" in error_type or "401" in message or "unauthorized" in message or "forbidden" in message:
        return ("auth_expired", "Tool authentication failure", "tool_wrapper", "medium")

    # Generic catch-all: any failed event, a dict-shaped error, or a failed outcome.
    if statuses.get("error") or statuses.get("failed") or error or outcome in {"failed", "error"}:
        return ("tool_error", "Repeated tool error", "other", "medium")

    return None
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _group_failures(traces: list[dict[str, Any]]) -> dict[str, FailureGroup]:
    """Bucket the 50 most recent traces by their failure classification.

    Traces that ``_classify_failure`` does not recognise are skipped. Each
    bucket also collects the distinct error messages seen for its traces.
    """
    buckets: dict[str, FailureGroup] = {}
    for item in traces[-50:]:
        verdict = _classify_failure(item)
        if not verdict:
            continue
        key, title, fix_type, severity = verdict
        if key not in buckets:
            buckets[key] = FailureGroup(key, title, fix_type, severity)
        bucket = buckets[key]
        bucket.traces.append(item)
        sample = _error_message(item)
        # Keep each distinct message once, in first-seen order.
        if sample and sample not in bucket.examples:
            bucket.examples.append(sample)
    return buckets
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def summarize_failures(traces: list[dict[str, Any]], *, min_count: int = 3) -> str:
    """Render repeated failure groups as a bullet list, most frequent first.

    Groups with fewer than *min_count* traces are omitted. When no group
    qualifies, a fixed "no significant failures" sentence is returned.
    """
    ranked = sorted(_group_failures(traces).values(), key=lambda bucket: bucket.count, reverse=True)
    bullets: list[str] = []
    for bucket in ranked:
        if bucket.count >= min_count:
            suffix = f" Example: {bucket.examples[0]}" if bucket.examples else ""
            bullets.append(f"- {bucket.title} happened {bucket.count} times.{suffix}")
    if not bullets:
        return "No significant repeated failures."
    return "\n".join(bullets)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def should_use_intelligence(traces: list[dict[str, Any]]) -> OnyxRoute:
    """Decide whether the traces need LLM-backed analysis.

    The decision is taken in this order:

    1. No traces: deterministic.
    2. Every failure group seen at least 3 times has a known key:
       deterministic.
    3. Text from the 20 most recent traces contains a prompt/planning
       marker: intelligence.
    4. Any failure group seen at least 3 times remains: intelligence.
    5. At least 2 of the 20 most recent traces look failed but are
       unclassified: intelligence.
    6. Otherwise: deterministic.
    """
    if not traces:
        return OnyxRoute("deterministic", "No traces available yet.")

    repeated = [bucket for bucket in _group_failures(traces).values() if bucket.count >= 3]
    # Only the first three keys are named in the route reason.
    leading_keys = ", ".join(bucket.key for bucket in repeated[:3])
    deterministic_keys = {
        "timeout_unknown_effect",
        "duplicate_retry",
        "policy_block",
        "schema_validation",
        "state_drift",
        "auth_expired",
    }
    if repeated and all(bucket.key in deterministic_keys for bucket in repeated):
        return OnyxRoute("deterministic", f"Known repeated failure pattern detected: {leading_keys}.")

    recent = traces[-20:]
    fragments: list[str] = []
    for item in recent:
        fragments.append(str(item.get("action") or item.get("tool") or ""))
        fragments.append(_error_message(item))
        fragments.append(" ".join(str(event.get("step") or "") for event in _flatten_events(item)))
    haystack = " ".join(fragments).lower()

    for marker in ("hallucinat", "wrong tool", "bad assumption", "missing context", "plan", "reasoning"):
        if marker in haystack:
            return OnyxRoute(
                "intelligence",
                "Trace text points to planning, prompt, tool-selection, or context quality issues.",
                requires_llm=True,
            )

    if repeated:
        return OnyxRoute(
            "intelligence",
            f"Repeated but ambiguous failure cluster detected: {leading_keys}.",
            requires_llm=True,
        )

    unclassified = 0
    for item in recent:
        outcome = str(item.get("final_outcome") or item.get("status") or "").lower()
        looks_failed = bool(item.get("error")) or outcome in {"failed", "error"}
        if looks_failed and _classify_failure(item) is None:
            unclassified += 1
    if unclassified >= 2:
        return OnyxRoute("intelligence", "Multiple unclassified failures need context-aware analysis.", requires_llm=True)

    return OnyxRoute("deterministic", "No ambiguous repeated failure pattern detected.")
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _build_suggestion(group: FailureGroup) -> OnyxSuggestion:
    """Translate one failure group into a concrete repair suggestion.

    Each known group key maps to a tailored suggestion; any other key gets
    the generic "repeated tool error" suggestion, which reuses the group's
    own severity and fix type. ``one_click_patch`` is the compact JSON
    encoding of the ``apply`` payload.
    """
    seen = group.count

    def emit(
        *,
        title: str,
        severity: str,
        reason: str,
        observation: str,
        suggested_fix: str,
        fix_type: str,
        expected_impact: str,
        confidence: int,
        patch: dict[str, Any],
    ) -> OnyxSuggestion:
        # Compact separators keep the patch string free of whitespace.
        return OnyxSuggestion(
            title=title,
            severity=severity,
            reason=reason,
            observation=observation,
            suggested_fix=suggested_fix,
            fix_type=fix_type,
            expected_impact=expected_impact,
            confidence=confidence,
            one_click_patch=json.dumps(patch, separators=(",", ":")),
            evidence_count=seen,
            apply=patch,
        )

    kind = group.key

    if kind == "timeout_unknown_effect":
        return emit(
            title="Require reconciliation before retry",
            severity="high",
            reason=f"Seen this unknown-effect timeout pattern {seen} times. A blind retry can duplicate a side effect.",
            observation=f"Timeouts left the tool outcome unknown {seen} times, so the agent could retry after the action already succeeded.",
            suggested_fix=(
                "Patch the tool wrapper so every retry first calls a reconcile function and returns already_succeeded "
                "when live state confirms the side effect happened."
            ),
            fix_type="retry_strategy",
            expected_impact="high on duplicate side effects because retries become state-aware",
            confidence=90,
            patch={"retry_strategy": "reconcile_before_retry", "unknown_effect": True},
        )

    if kind == "duplicate_retry":
        return emit(
            title="Require idempotency keys for this action",
            severity="high",
            reason=f"Seen this duplicate-action pattern {seen} times. The same request can create multiple tickets, charges, or messages.",
            observation=f"The agent repeated an action {seen} times after losing or mistrusting the first response.",
            suggested_fix="Require an idempotency key derived from stable business fields before executing this tool.",
            fix_type="tool_wrapper",
            expected_impact="high on duplicate work because replay becomes deterministic",
            confidence=88,
            patch={"idempotency": {"mode": "required", "key_fields": ["customer_id", "amount", "action"]}},
        )

    if kind == "policy_block":
        return emit(
            title="Move this policy block into preflight",
            severity="high",
            reason=f"Seen unsafe actions blocked {seen} times. The agent should learn before execution, not after planning.",
            observation=f"Onyx saw {seen} attempts to execute actions that policy would never allow.",
            suggested_fix="Add a preflight policy preview that blocks direct database execution and suggests a scoped API/tool instead.",
            fix_type="tool_wrapper",
            expected_impact="medium-high on operator time because impossible actions stop before tool execution",
            confidence=86,
            patch={"preflight": {"policy_preview": True, "block": ["database.execute", "*.delete"]}},
        )

    if kind == "schema_validation":
        return emit(
            title="Add strict schema validation before tool calls",
            severity="medium",
            reason=f"Seen malformed parameters {seen} times. The agent is guessing tool input shape.",
            observation=f"Schema or validation failures occurred {seen} times, usually before the tool could do useful work.",
            suggested_fix="Validate params locally and return a repairable error showing required fields and allowed enum values.",
            fix_type="schema_validation",
            expected_impact="medium on failure rate because bad calls become self-correcting",
            confidence=82,
            patch={"schema_validation": {"mode": "strict", "repair_hints": True}},
        )

    if kind == "state_drift":
        return emit(
            title="Revalidate state before thawing approvals",
            severity="high",
            reason=f"Seen state drift {seen} times. Human approval can become stale while the world changes.",
            observation=f"Pending work resumed against changed state {seen} times.",
            suggested_fix="Attach a state snapshot to each checkpoint and require revalidation before approval resumes execution.",
            fix_type="state_handling",
            expected_impact="high on unsafe resumes because stale approvals are requeued",
            confidence=87,
            patch={"freeze_thaw": {"revalidate_before_resume": True, "drift_action": "requeue"}},
        )

    if kind == "auth_expired":
        return emit(
            title="Classify auth failures and refresh credentials",
            severity="medium",
            reason=f"Seen auth failures {seen} times. The agent is treating credential drift like a normal tool error.",
            observation=f"Credential or authorization failures happened {seen} times.",
            suggested_fix="Classify 401/403 separately, refresh credentials once, then escalate with the provider and scope.",
            fix_type="tool_wrapper",
            expected_impact="medium on recovery rate because expired credentials stop causing noisy retries",
            confidence=80,
            patch={"auth": {"classify": ["401", "403"], "refresh_once": True, "escalate_on_failure": True}},
        )

    # Generic fallback for any other group key.
    return emit(
        title="Add targeted handling for repeated tool errors",
        severity=group.severity,
        reason=f"Seen this tool error {seen} times. It is now a pattern, not noise.",
        observation=f"A repeated tool failure happened {seen} times.",
        suggested_fix="Add a wrapper-specific classifier so the agent gets a repairable error instead of retrying blindly.",
        fix_type=group.fix_type,
        expected_impact="medium on failure rate because repeated errors become classified",
        confidence=72,
        patch={"tool_errors": {"classify_repeated": True, "repair_hints": True}},
    )
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def deterministic_analyze_traces(traces: list[dict[str, Any]]) -> list[OnyxSuggestion]:
    """Turn repeated failure groups into ranked, concrete suggestions.

    Only groups seen at least 3 times produce a suggestion. Results are
    ordered with high severity first, then by evidence count, then by
    confidence. An empty input yields an empty list; a non-empty input with
    no repeated group yields a single informational suggestion.
    """
    if not traces:
        return []

    ranked = sorted(
        (_build_suggestion(bucket) for bucket in _group_failures(traces).values() if bucket.count >= 3),
        key=lambda entry: (entry.severity != "high", -entry.evidence_count, -entry.confidence),
    )
    if ranked:
        return ranked

    # Nothing repeated: report the most common outcome so the caller still
    # receives a summary of what was reviewed.
    tally = Counter(str(item.get("final_outcome") or item.get("status") or "unknown") for item in traces)
    top_outcome = tally.most_common(1)[0][0]
    placeholder = OnyxSuggestion(
        title="No repeated failure pattern detected",
        severity="info",
        reason="Recent traces do not show recurring timeout, duplicate retry, or policy-block patterns.",
        observation=f"Reviewed {len(traces)} recent runs. Most common outcome: {top_outcome}.",
        suggested_fix="Keep collecting structured traces. Onyx will suggest a patch once the same failure appears at least 3 times.",
        fix_type="other",
        expected_impact="low until repeated failures appear",
        confidence=70,
        evidence_count=0,
        apply={},
    )
    return [placeholder]
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def analyze_traces(traces: list[dict[str, Any]]) -> list[OnyxSuggestion]:
    """Compatibility alias that delegates to ``deterministic_analyze_traces``."""
    suggestions = deterministic_analyze_traces(traces)
    return suggestions
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def best_improvement(traces: list[dict[str, Any]]) -> dict[str, Any]:
    """Return the top-ranked deterministic suggestion as a plain dict.

    The dict carries ``observation``, ``suggested_fix``, ``fix_type``,
    ``expected_impact``, ``confidence`` and ``one_click_patch``. When the
    analysis produces no suggestions, a fixed "no traces" placeholder is
    returned instead.
    """
    ranked = deterministic_analyze_traces(traces)
    if ranked:
        winner = ranked[0]
        patch = winner.one_click_patch
        if not patch:
            # No ready-made patch string: serialize the apply payload instead.
            patch = json.dumps(winner.apply, sort_keys=True)
        return {
            "observation": winner.observation or winner.reason,
            "suggested_fix": winner.suggested_fix or winner.reason,
            "fix_type": winner.fix_type,
            "expected_impact": winner.expected_impact,
            "confidence": winner.confidence,
            "one_click_patch": patch,
        }

    return {
        "observation": "No traces available yet.",
        "suggested_fix": "Run the agent through Invoke so Onyx can inspect structured executions.",
        "fix_type": "other",
        "expected_impact": "low until traces exist",
        "confidence": 60,
        "one_click_patch": "",
    }
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def _anthropic_client_from_env() -> Any | None:
    """Build an ``AsyncAnthropic`` client from ``ANTHROPIC_API_KEY``.

    Returns ``None`` when the variable is unset or empty, or when the
    ``anthropic`` package cannot be imported.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        return None
    try:
        # Imported lazily so a missing package is handled here, not at module import.
        from anthropic import AsyncAnthropic
    except ImportError:
        return None
    return AsyncAnthropic(api_key=api_key)
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def _extract_llm_text(response: Any) -> str:
    """Pull the text payload out of an LLM response of several shapes.

    Supported shapes, tried in order:

    * ``content`` as a non-empty string, or as a list of parts whose
      ``text`` is a string. Both the response and its parts may be objects
      (attribute access) or dicts (key access).
    * ``choices[0].message.content`` as a string or a list of parts.
    * a dict exposing ``content`` or ``text``.

    Anything else falls back to ``str(response)``.
    """

    def _field(obj: Any, name: str) -> Any:
        # Payloads arrive either as SDK objects or as decoded JSON dicts.
        if isinstance(obj, dict):
            return obj.get(name)
        return getattr(obj, name, None)

    def _texts(parts: list[Any]) -> list[str]:
        # Keep textual parts only; parts without string text are skipped.
        found: list[str] = []
        for part in parts:
            text = part if isinstance(part, str) else _field(part, "text")
            if isinstance(text, str):
                found.append(text)
        return found

    content = _field(response, "content")
    if isinstance(content, str) and content:
        return content
    if isinstance(content, list):
        pieces = _texts(content)
        if pieces:
            return "".join(pieces)

    choices = _field(response, "choices")
    if isinstance(choices, (list, tuple)) and choices:
        inner = _field(_field(choices[0], "message"), "content")
        if isinstance(inner, str):
            return inner
        if isinstance(inner, list):
            return "".join(_texts(inner))

    if isinstance(response, dict):
        return str(response.get("content") or response.get("text") or "")
    return str(response)
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
async def intelligence_analyze_failures(
    traces: list[dict[str, Any]],
    llm_client: Any | None = None,
    *,
    model: str = DEFAULT_ANTHROPIC_MODEL,
) -> dict[str, Any]:
    """Claude-backed Onyx for context-heavy failures.

    Args:
        traces: Recorded agent runs to analyze.
        llm_client: Client exposing an awaitable ``messages.create``. When
            omitted, one is built from the environment.
        model: Model name to use when ``ANTHROPIC_MODEL`` is not set in the
            environment.

    Returns:
        The parsed analysis dict, with ``mode`` and ``source`` filled in when
        the model left them out. On any failure (no client, request error,
        unparseable or incomplete answer) the deterministic result is returned
        instead, with ``error`` describing why. This function does not raise
        for LLM-side problems.
    """

    def _deterministic_fallback(reason: str) -> dict[str, Any]:
        # Single place that shapes every fallback answer.
        fallback = best_improvement(traces)
        fallback["mode"] = "deterministic"
        fallback["source"] = "deterministic_trace_analysis"
        fallback["error"] = reason
        return fallback

    llm_client = llm_client or _anthropic_client_from_env()
    if llm_client is None:
        return _deterministic_fallback(
            "Intelligence Onyx requires `pip install anthropic` and ANTHROPIC_API_KEY."
        )

    failure_summary = summarize_failures(traces)
    prompt = ONYX_FAILURE_ANALYSIS_PROMPT.format(num_runs=len(traces), failure_summary=failure_summary)

    try:
        response = await llm_client.messages.create(
            model=os.environ.get("ANTHROPIC_MODEL", model),
            max_tokens=900,
            temperature=0.3,
            system=(
                "Return only valid JSON. Do not wrap the JSON in markdown. "
                "Do not include commentary outside the JSON object."
            ),
            messages=[{"role": "user", "content": prompt}],
        )
        raw = _extract_llm_text(response).strip()
        if raw.startswith("```"):
            # Tolerate a fenced answer despite the system prompt: drop the
            # opening fence line and everything from the closing fence on.
            _, _, raw = raw.partition("\n")
            raw = raw.rsplit("```", 1)[0]
        analysis = json.loads(raw)
    except Exception as exc:  # noqa: BLE001 - this boundary must never break deploy.
        return _deterministic_fallback(f"Claude intelligence analysis failed: {exc}")

    required = {"observation", "suggested_fix", "fix_type", "expected_impact", "confidence"}
    # Valid JSON is not necessarily an object; a number or a list would make
    # the checks below raise, so reject non-dicts explicitly.
    if not isinstance(analysis, dict) or not required.issubset(analysis):
        return _deterministic_fallback("Onyx analysis was missing required fields.")

    analysis.setdefault("mode", "intelligence")
    analysis.setdefault("source", "claude_context_analysis")
    return analysis
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
async def onyx_analyze_failures(
    traces: list[dict[str, Any]],
    llm_client: Any | None = None,
    *,
    mode: OnyxMode = "auto",
    model: str = DEFAULT_ANTHROPIC_MODEL,
) -> dict[str, Any]:
    """Run Onyx analysis in the requested mode and return one improvement.

    Modes:
    - deterministic: local analysis only.
    - intelligence: always delegate to the LLM-backed analysis.
    - auto (any other value): route with ``should_use_intelligence`` and
      record the routing reason under ``route_reason``.
    """

    def _local_result() -> dict[str, Any]:
        # Deterministic answer tagged with its mode and source.
        outcome = best_improvement(traces)
        outcome["mode"] = "deterministic"
        outcome["source"] = "deterministic_trace_analysis"
        return outcome

    if mode == "deterministic":
        return _local_result()

    if mode == "intelligence":
        return await intelligence_analyze_failures(traces, llm_client, model=model)

    decision = should_use_intelligence(traces)
    if not decision.requires_llm:
        outcome = _local_result()
        outcome["route_reason"] = decision.reason
        return outcome

    outcome = await intelligence_analyze_failures(traces, llm_client, model=model)
    outcome["route_reason"] = decision.reason
    if outcome.get("error"):
        # The LLM path reported an error: surface why intelligence was unavailable.
        outcome["intelligence_available"] = False
        outcome["intelligence_reason"] = outcome["error"]
    return outcome
|
package/package.json
CHANGED