connectonion 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- connectonion/__init__.py +3 -2
- connectonion/cli/browser_agent/browser.py +433 -147
- connectonion/cli/browser_agent/element_finder.py +139 -0
- connectonion/cli/browser_agent/highlight_screenshot.py +174 -0
- connectonion/cli/browser_agent/prompt.md +188 -105
- connectonion/cli/browser_agent/prompts/element_matcher.md +59 -0
- connectonion/cli/browser_agent/prompts/form_filler.md +19 -0
- connectonion/cli/browser_agent/prompts/scroll_strategy.md +36 -0
- connectonion/cli/browser_agent/scripts/extract_elements.js +126 -0
- connectonion/cli/browser_agent/scroll.py +137 -0
- connectonion/cli/commands/eval_commands.py +286 -0
- connectonion/cli/main.py +11 -0
- connectonion/console.py +5 -5
- connectonion/core/agent.py +13 -10
- connectonion/core/llm.py +9 -19
- connectonion/logger.py +305 -135
- connectonion/network/__init__.py +3 -0
- connectonion/network/asgi.py +122 -2
- connectonion/network/connection.py +123 -0
- connectonion/network/host.py +7 -5
- connectonion/useful_plugins/__init__.py +4 -3
- connectonion/useful_plugins/ui_stream.py +164 -0
- {connectonion-0.6.0.dist-info → connectonion-0.6.2.dist-info}/METADATA +1 -1
- {connectonion-0.6.0.dist-info → connectonion-0.6.2.dist-info}/RECORD +27 -17
- /connectonion/{static → network/static}/docs.html +0 -0
- {connectonion-0.6.0.dist-info → connectonion-0.6.2.dist-info}/WHEEL +0 -0
- {connectonion-0.6.0.dist-info → connectonion-0.6.2.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Scroll Strategy
|
|
2
|
+
|
|
3
|
+
Analyze this webpage and determine the BEST way to scroll "{description}".
|
|
4
|
+
|
|
5
|
+
## Scrollable Elements Found
|
|
6
|
+
{scrollable_elements}
|
|
7
|
+
|
|
8
|
+
## Simplified HTML (first 5000 chars)
|
|
9
|
+
{simplified_html}
|
|
10
|
+
|
|
11
|
+
## Instructions
|
|
12
|
+
|
|
13
|
+
Return:
|
|
14
|
+
|
|
15
|
+
1. **method**: "window" | "element" | "container"
|
|
16
|
+
|
|
17
|
+
2. **selector**: CSS selector (empty if method is "window")
|
|
18
|
+
|
|
19
|
+
3. **javascript**: Complete IIFE that scrolls ONE iteration:
|
|
20
|
+
```javascript
|
|
21
|
+
(() => {{
|
|
22
|
+
const el = document.querySelector('.selector');
|
|
23
|
+
if (el) el.scrollTop += 1000;
|
|
24
|
+
return {{success: true}};
|
|
25
|
+
}})()
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
4. **explanation**: Brief reason
|
|
29
|
+
|
|
30
|
+
## Common Patterns
|
|
31
|
+
|
|
32
|
+
- Gmail/email lists: Scroll the container with overflow:auto, NOT window
|
|
33
|
+
- Social feeds (Twitter, LinkedIn): Often scroll the main feed container
|
|
34
|
+
- Regular pages: Use window.scrollBy(0, 1000)
|
|
35
|
+
|
|
36
|
+
User wants to scroll: "{description}"
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
/**
 * Extract interactive elements from the page with injected IDs.
 *
 * Inspired by browser-use (https://github.com/browser-use/browser-use).
 *
 * This script:
 * 1. Finds all interactive elements (buttons, links, inputs, etc.)
 * 2. Injects a unique `data-browser-agent-id` attribute into each
 * 3. Returns element data with bounding boxes for LLM matching
 *
 * NOTE: this script MUTATES the DOM (setAttribute on every matched
 * element) so the returned `locator` strings keep working afterwards.
 */
(() => {
  const results = [];
  let index = 0;

  // Interactive element types (matched by tag name)
  const INTERACTIVE_TAGS = new Set([
    'a', 'button', 'input', 'select', 'textarea', 'label',
    'details', 'summary', 'dialog'
  ]);

  // ARIA roles that mark an element as interactive even if its tag is not
  const INTERACTIVE_ROLES = new Set([
    'button', 'link', 'menuitem', 'menuitemcheckbox', 'menuitemradio',
    'option', 'radio', 'switch', 'tab', 'checkbox', 'textbox',
    'searchbox', 'combobox', 'listbox', 'slider', 'spinbutton'
  ]);

  // Check if element is visible.
  // Rejects: display:none, visibility:hidden, opacity:0, zero/near-zero
  // size, common screen-reader-only utility classes, and anything more
  // than 100px outside the current viewport.
  function isVisible(el) {
    const style = window.getComputedStyle(el);
    if (style.display === 'none') return false;
    if (style.visibility === 'hidden') return false;
    if (parseFloat(style.opacity) === 0) return false;

    // Skip visually-hidden accessibility elements
    // (className can be an SVGAnimatedString on SVG nodes, hence the typeof guard)
    const className = el.className || '';
    if (typeof className === 'string' &&
        (className.includes('visually-hidden') ||
         className.includes('sr-only') ||
         className.includes('screen-reader'))) {
      return false;
    }

    const rect = el.getBoundingClientRect();
    if (rect.width === 0 || rect.height === 0) return false;

    // Skip elements that are clipped/hidden with CSS tricks (1x1 px etc.)
    if (rect.width < 2 || rect.height < 2) return false;

    // Check if in viewport (with some margin so near-edge elements count)
    const margin = 100;
    if (rect.bottom < -margin) return false;
    if (rect.top > window.innerHeight + margin) return false;
    if (rect.right < -margin) return false;
    if (rect.left > window.innerWidth + margin) return false;

    return true;
  }

  // Get clean text content, collapsed to single spaces and capped at 80
  // chars so the LLM payload stays small.
  function getText(el) {
    // For inputs, get value or placeholder
    if (el.tagName === 'INPUT' || el.tagName === 'TEXTAREA') {
      return el.value || el.placeholder || '';
    }
    // For other elements, get inner text
    const text = el.innerText || el.textContent || '';
    return text.trim().replace(/\s+/g, ' ').substring(0, 80);
  }

  // Process all elements
  document.querySelectorAll('*').forEach(el => {
    const tag = el.tagName.toLowerCase();
    const role = el.getAttribute('role');

    // Check if interactive via any of five signals:
    // tag, ARIA role, pointer cursor, focusable tabindex, or click handler
    const isInteractiveTag = INTERACTIVE_TAGS.has(tag);
    const isInteractiveRole = role && INTERACTIVE_ROLES.has(role);
    const isClickable = window.getComputedStyle(el).cursor === 'pointer';
    const hasTabIndex = el.hasAttribute('tabindex') && el.tabIndex >= 0;
    const hasClickHandler = el.onclick !== null || el.hasAttribute('onclick');

    if (!isInteractiveTag && !isInteractiveRole && !isClickable &&
        !hasTabIndex && !hasClickHandler) {
      return;
    }

    // Skip hidden inputs
    if (tag === 'input' && el.type === 'hidden') return;

    // Skip empty elements with no text or useful attributes
    // (inputs are kept even when empty — they are targets for typing)
    const text = getText(el);
    const ariaLabel = el.getAttribute('aria-label');
    const placeholder = el.placeholder;
    if (!text && !ariaLabel && !placeholder && tag !== 'input') return;

    // Skip very small elements (likely icons)
    const rect = el.getBoundingClientRect();
    if (rect.width < 20 && rect.height < 20 && !text) return;

    // Check visibility
    if (!isVisible(el)) return;

    // INJECT a unique ID attribute for reliable location
    const highlightId = String(index);
    el.setAttribute('data-browser-agent-id', highlightId);

    results.push({
      index: index++,
      tag: tag,
      text: text,
      role: role,
      aria_label: el.getAttribute('aria-label'),
      placeholder: el.placeholder || null,
      input_type: el.type || null,
      href: (tag === 'a' && el.href) ? el.href.substring(0, 100) : null,
      x: Math.round(rect.x),
      y: Math.round(rect.y),
      width: Math.round(rect.width),
      height: Math.round(rect.height),
      // Use injected attribute as locator - guaranteed to work!
      locator: `[data-browser-agent-id="${highlightId}"]`
    });
  });

  return results;
})()
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unified scroll module - AI-powered with fallback strategies.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
from scroll import scroll
|
|
6
|
+
result = scroll(page, take_screenshot, times=5, description="the email list")
|
|
7
|
+
"""
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
from connectonion import llm_do
|
|
11
|
+
import time
|
|
12
|
+
|
|
13
|
+
_PROMPT = (Path(__file__).parent / "prompts" / "scroll_strategy.md").read_text()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ScrollStrategy(BaseModel):
    """Structured LLM answer describing how to scroll the current page."""
    method: str  # one of "window", "element", "container"
    selector: str  # CSS selector for the scroll target; empty when method is "window"
    javascript: str  # complete IIFE that performs exactly one scroll iteration
    explanation: str  # brief reason the strategy was chosen
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def scroll(page, take_screenshot, times: int = 5, description: str = "the main content area") -> str:
    """Scroll the page using progressively simpler strategies.

    Order of attempts: AI-generated strategy, then the first scrollable
    element, then the window itself.  After each attempt a fresh screenshot
    is compared against the previous one; only a visible change counts as
    success.

    Args:
        page: Playwright-style page object (falsy when no browser is open).
        take_screenshot: Callable taking a ``path=`` keyword, saves a shot.
        times: Number of scroll iterations per strategy.
        description: Human description of what should be scrolled.

    Returns:
        A short status string describing which strategy worked (or that
        none did).
    """
    if not page:
        return "Browser not open"

    stamp = int(time.time())
    reference = f"scroll_before_{stamp}.png"
    take_screenshot(path=reference)

    attempts = (
        ("AI strategy", lambda: _ai_scroll(page, times, description)),
        ("Element scroll", lambda: _element_scroll(page, times)),
        ("Page scroll", lambda: _page_scroll(page, times)),
    )

    for label, run in attempts:
        print(f"  Trying: {label}...")
        try:
            run()
            time.sleep(0.5)
            latest = f"scroll_after_{stamp}.png"
            take_screenshot(path=latest)

            if _screenshots_different(reference, latest):
                print(f"  ✅ {label} worked")
                return f"Scrolled using {label}"
            print(f"  ⚠️ {label} didn't change content")
            # Compare the next attempt against the most recent shot.
            reference = latest
        except Exception as exc:
            print(f"  ❌ {label} failed: {exc}")

    return "All scroll strategies failed"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _ai_scroll(page, times: int, description: str):
    """Ask the LLM for a page-specific scroll strategy and execute it.

    Gathers up to three scrollable containers plus a trimmed snapshot of
    the page HTML, sends both to the scroll-strategy prompt, then runs the
    returned JavaScript once per iteration.

    Args:
        page: Playwright-style page object with an `evaluate` method.
        times: Number of scroll iterations to perform.
        description: Human description of what should be scrolled.
    """
    # Find candidate scrollable containers.  Check both `overflow` and
    # `overflowY` for either 'auto' or 'scroll': the shorthand and the
    # axis-specific property can each enable vertical scrolling on its own,
    # so checking only overflow:'auto' / overflowY:'scroll' misses the
    # common overflowY:'auto' containers.
    scrollable = page.evaluate("""
        (() => {
            return Array.from(document.querySelectorAll('*'))
                .filter(el => {
                    const s = window.getComputedStyle(el);
                    const scrolls = s.overflow === 'auto' || s.overflow === 'scroll' ||
                                    s.overflowY === 'auto' || s.overflowY === 'scroll';
                    return scrolls && el.scrollHeight > el.clientHeight;
                })
                .slice(0, 3)
                .map(el => ({tag: el.tagName, classes: el.className, id: el.id}));
        })()
    """)

    # Strip scripts/styles/media so the LLM sees structure, not assets.
    html = page.evaluate("""
        (() => {
            const c = document.body.cloneNode(true);
            c.querySelectorAll('script,style,img,svg').forEach(e => e.remove());
            return c.innerHTML.substring(0, 5000);
        })()
    """)

    strategy = llm_do(
        _PROMPT.format(description=description, scrollable_elements=scrollable, simplified_html=html),
        output=ScrollStrategy,
        model="co/gemini-2.5-flash",
        temperature=0.1  # low temperature: we want a deterministic plan
    )
    print(f"  AI: {strategy.explanation}")

    for _ in range(times):
        page.evaluate(strategy.javascript)
        time.sleep(1)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _element_scroll(page, times: int):
|
|
98
|
+
"""Scroll first scrollable element found."""
|
|
99
|
+
for _ in range(times):
|
|
100
|
+
page.evaluate("""
|
|
101
|
+
(() => {
|
|
102
|
+
const el = Array.from(document.querySelectorAll('*')).find(e => {
|
|
103
|
+
const s = window.getComputedStyle(e);
|
|
104
|
+
return (s.overflow === 'auto' || s.overflowY === 'scroll') &&
|
|
105
|
+
e.scrollHeight > e.clientHeight;
|
|
106
|
+
});
|
|
107
|
+
if (el) el.scrollTop += 1000;
|
|
108
|
+
})()
|
|
109
|
+
""")
|
|
110
|
+
time.sleep(0.8)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _page_scroll(page, times: int):
|
|
114
|
+
"""Scroll window."""
|
|
115
|
+
for _ in range(times):
|
|
116
|
+
page.evaluate("window.scrollBy(0, 1000)")
|
|
117
|
+
time.sleep(0.8)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _screenshots_different(file1: str, file2: str) -> bool:
|
|
121
|
+
"""Compare screenshots using PIL pixel difference."""
|
|
122
|
+
try:
|
|
123
|
+
from PIL import Image
|
|
124
|
+
import os
|
|
125
|
+
|
|
126
|
+
img1 = Image.open(os.path.join("screenshots", file1)).convert('RGB')
|
|
127
|
+
img2 = Image.open(os.path.join("screenshots", file2)).convert('RGB')
|
|
128
|
+
|
|
129
|
+
diff = sum(
|
|
130
|
+
abs(a - b)
|
|
131
|
+
for p1, p2 in zip(img1.getdata(), img2.getdata())
|
|
132
|
+
for a, b in zip(p1, p2)
|
|
133
|
+
)
|
|
134
|
+
threshold = img1.size[0] * img1.size[1] * 3 * 0.01 # 1%
|
|
135
|
+
return diff > threshold
|
|
136
|
+
except Exception:
|
|
137
|
+
return True # Assume different if comparison fails
|
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Purpose: CLI command for running and managing evals
|
|
3
|
+
LLM-Note:
|
|
4
|
+
Dependencies: imports from [pathlib, yaml, json, rich, importlib] | imported by [cli/main.py]
|
|
5
|
+
Data flow: handle_eval() → reads .co/evals/*.yaml → imports agent → runs with stored input → compares expected vs output
|
|
6
|
+
Integration: exposes handle_eval(name, run) for CLI
|
|
7
|
+
|
|
8
|
+
Eval YAML format:
|
|
9
|
+
- `turns`: List of inputs to send to agent sequentially (like a conversation).
|
|
10
|
+
Each turn can have one input. Turns run in order within same agent session,
|
|
11
|
+
simulating multi-round conversations. Use single turn for simple evals,
|
|
12
|
+
or multiple turns to test conversation flow.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import importlib.util
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
import sys
|
|
19
|
+
from datetime import datetime
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Optional
|
|
22
|
+
|
|
23
|
+
import yaml
|
|
24
|
+
from pydantic import BaseModel
|
|
25
|
+
from rich.console import Console
|
|
26
|
+
from rich.table import Table
|
|
27
|
+
|
|
28
|
+
console = Console()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class JudgeResult(BaseModel):
    """Result from LLM judge evaluation."""
    passed: bool  # True when the output satisfies the expected criteria
    analysis: str  # the judge's short reasoning for the verdict
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_agent_from_file(file_path: str, cwd: str):
    """Load and return the `agent` instance defined in a Python file.

    The file is executed as a standalone module and must define a
    module-level variable named ``agent`` holding a connectonion Agent.
    Session logging on the loaded agent is disabled so running evals does
    not write duplicate session files.

    Args:
        file_path: Path to the agent file (absolute or relative to cwd).
        cwd: Working directory used to resolve relative paths and imports.

    Raises:
        ValueError: When the file defines no ``agent`` Agent instance.
    """
    from connectonion import Agent

    resolved = file_path if os.path.isabs(file_path) else os.path.join(cwd, file_path)

    # Make sibling modules importable from inside the agent file.
    if cwd not in sys.path:
        sys.path.insert(0, cwd)

    spec = importlib.util.spec_from_file_location("agent_module", resolved)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    candidate = getattr(module, 'agent', None)
    if isinstance(candidate, Agent):
        candidate.logger.enable_sessions = False  # Prevent duplicate eval files
        return candidate

    raise ValueError(
        f"No 'agent' instance found in {resolved}.\n\n"
        f"Structure your file like this:\n\n"
        f"  agent = Agent(...)\n\n"
        f"  if __name__ == '__main__':\n"
        f"      agent.input('...')\n"
    )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def handle_eval(name: Optional[str] = None, agent_file: Optional[str] = None):
    """Run evals from .co/evals/*.yaml and print a results table.

    Args:
        name: Optional specific eval name to run (without the .yaml suffix).
        agent_file: Optional agent file path overriding the YAML `agent` key.
    """
    evals_dir = Path(".co/evals")

    if not evals_dir.exists():
        console.print("[yellow]No evals found.[/yellow]")
        console.print("[dim]Create eval files in .co/evals/*.yaml[/dim]")
        return

    pattern = f"{name}.yaml" if name else "*.yaml"
    eval_files = list(evals_dir.glob(pattern))

    if name and not eval_files:
        console.print(f"[red]Eval not found: {name}[/red]")
        return
    if not eval_files:
        console.print("[yellow]No eval files found in .co/evals/[/yellow]")
        return

    _run_evals(eval_files, agent_file)

    # Re-glob so the status table reflects results just written to disk.
    _show_eval_status(list(evals_dir.glob(pattern)))
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _run_evals(eval_files: list, agent_override: Optional[str] = None):
    """Run agents for each eval and capture output.

    For every eval YAML: loads (and caches) the agent file, feeds each
    turn's input to the agent, records output/tool-calls/token metrics
    back into the turn, optionally judges it against `expected`, and
    writes the mutated YAML back to disk.

    Args:
        eval_files: Paths of eval YAML files to execute.
        agent_override: Agent file path that takes precedence over the
            YAML `agent` key for every eval.
    """
    cwd = os.getcwd()
    agents_cache = {}  # Cache agents by file path — one load per distinct agent file

    for eval_file in eval_files:
        with open(eval_file) as f:
            data = yaml.safe_load(f)

        # Get agent file: CLI override > YAML > error
        agent_file = agent_override or data.get('agent')
        if not agent_file:
            console.print(f"[red]No agent specified for {eval_file.stem}[/red]")
            console.print(f"[dim]Add 'agent: agent.py' to the YAML or use --agent flag[/dim]")
            continue

        # Load agent (cached)
        if agent_file not in agents_cache:
            console.print(f"[cyan]Loading:[/cyan] {agent_file}")
            agents_cache[agent_file] = get_agent_from_file(agent_file, cwd)
        agent = agents_cache[agent_file]

        turns = data.get('turns', [])
        if not turns:
            console.print(f"[yellow]No turns found in {eval_file.stem}[/yellow]")
            continue

        console.print(f"[cyan]Running:[/cyan] {eval_file.stem}")

        # Reset agent session for fresh state each eval; turns within one
        # eval share the session (multi-round conversation).
        agent.reset_conversation()

        file_modified = False
        for turn in turns:
            input_text = turn.get('input', '')
            if not input_text:
                continue

            # Show input (truncated)
            display_input = input_text[:60] + "..." if len(input_text) > 60 else input_text
            console.print(f"  [dim]input:[/dim] {display_input}")

            # Run agent and capture result
            result = agent.input(input_text)

            # Extract tools_called and metrics from agent session.
            # NOTE(review): relies on the private logger._format_tool_call
            # helper and the current_session trace layout — keep in sync
            # with connectonion/logger.py.
            trace = agent.current_session.get('trace', [])
            tool_calls = [t for t in trace if t.get('type') == 'tool_execution']
            llm_calls = [t for t in trace if t.get('type') == 'llm_call']
            tools_called = [agent.logger._format_tool_call(t) for t in tool_calls]

            total_tokens = sum(
                (t.get('usage').input_tokens + t.get('usage').output_tokens)
                for t in llm_calls if t.get('usage')
            )
            total_cost = sum(
                t.get('usage').cost for t in llm_calls if t.get('usage')
            )

            # Build history as JSON array string (compact, easy to scan).
            # Non-string history values are discarded rather than parsed.
            history_str = turn.get('history', '[]')
            history = json.loads(history_str) if isinstance(history_str, str) else []
            # Archive the previous run's summary (if any) at the head of history.
            if turn.get('output'):
                history.insert(0, {
                    "ts": turn.get('ts', ''),
                    "pass": turn.get('pass'),
                    "tokens": turn.get('tokens', 0),
                    "cost": turn.get('cost', 0)
                })

            # Store result in turn
            turn['output'] = result
            turn['tools_called'] = tools_called
            turn['tokens'] = total_tokens
            turn['cost'] = round(total_cost, 4)
            turn['ts'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            turn['run'] = data.get('runs', 0) + 1
            # Format history as multi-line JSON for readability
            if history:
                lines = [json.dumps(h) for h in history]
                turn['history'] = "[\n" + ",\n".join(lines) + "]"
            else:
                turn['history'] = "[]"
            file_modified = True

            # Judge immediately if expected exists
            expected = turn.get('expected', '')
            if expected:
                judge = _judge_with_llm(expected, result, input_text)
                turn['pass'] = judge.passed
                turn['analysis'] = judge.analysis
                status = "[green]✓[/green]" if judge.passed else "[red]✗[/red]"
                console.print(f"  {status} {judge.analysis[:60]}...")
            else:
                # Show output (truncated)
                display_output = result[:60] + "..." if len(result) > 60 else result
                console.print(f"  [green]output:[/green] {display_output}")

        if file_modified:
            # Update runs count and save the mutated turns back to the YAML
            data['runs'] = data.get('runs', 0) + 1
            data['updated'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            with open(eval_file, 'w') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        console.print(f"[green]✓[/green] {eval_file.stem} completed")
        console.print()

    console.print()
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _judge_with_llm(expected: str, output: str, input_text: str) -> JudgeResult:
    """Use LLM to judge if output matches expected.

    Args:
        expected: The eval's `expected` criteria (free-form text).
        output: The agent's actual response for this turn.
        input_text: The input that produced the response (gives the judge context).

    Returns:
        JudgeResult with a boolean verdict and a short analysis.
    """
    # Imported lazily so importing this module never requires llm_do.
    from connectonion import llm_do

    prompt = f"""You are an eval judge. Determine if the agent's output satisfies the expected criteria.

Input: {input_text}
Expected: {expected}
Output: {output}

Does the output satisfy the expected criteria? Consider:
- Semantic similarity (not exact match)
- Key information presence
- Intent fulfillment
"""
    return llm_do(prompt, output=JudgeResult)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _show_eval_status(eval_files: list):
    """Render a pass/fail table from results already stored in the YAML files.

    Uses only the `pass`/`output` fields written by a previous run —
    nothing is re-judged here.

    Args:
        eval_files: Paths of eval YAML files to summarize.
    """
    table = Table(title="Eval Results", show_header=True)
    table.add_column("Eval", style="cyan")
    table.add_column("Status", justify="center")
    table.add_column("Expected", max_width=30)
    table.add_column("Output", max_width=30)

    tally = {"pass": 0, "fail": 0, "none": 0}

    def _shorten(text: str) -> str:
        # Keep table cells readable at the 30-char column width.
        return (text[:27] + "...") if len(text) > 30 else text

    for eval_file in sorted(eval_files):
        with open(eval_file) as f:
            data = yaml.safe_load(f)

        for turn in data.get('turns', []):
            expected = turn.get('expected', '')
            output = turn.get('output', '')
            verdict = turn.get('pass')

            if not expected:
                status = "[dim]—[/dim]"
                tally["none"] += 1
            elif verdict is True:
                status = "[green]✓ pass[/green]"
                tally["pass"] += 1
            elif verdict is False:
                status = "[red]✗ fail[/red]"
                tally["fail"] += 1
            else:
                # Expected is set but the turn has not been judged yet.
                status = "[dim]pending[/dim]"
                tally["none"] += 1

            table.add_row(
                eval_file.stem,
                status,
                _shorten(expected) or "[dim]not set[/dim]",
                _shorten(output),
            )

    console.print(table)
    console.print()

    # One-line summary assembled from the tally.
    if tally["fail"] > 0:
        console.print(f"[bold red]✗ {tally['fail']} failed[/bold red], ", end="")
    if tally["pass"] > 0:
        console.print(f"[bold green]✓ {tally['pass']} passed[/bold green], ", end="")
    if tally["none"] > 0:
        console.print(f"[dim]{tally['none']} no expected[/dim]", end="")
    console.print()
|
connectonion/cli/main.py
CHANGED
|
@@ -55,6 +55,7 @@ def _show_help():
|
|
|
55
55
|
console.print(" [green]create[/green] <name> Create new project")
|
|
56
56
|
console.print(" [green]init[/green] Initialize in current directory")
|
|
57
57
|
console.print(" [green]copy[/green] <name> Copy tool/plugin source to project")
|
|
58
|
+
console.print(" [green]eval[/green] Run evals and show status")
|
|
58
59
|
console.print(" [green]deploy[/green] Deploy to ConnectOnion Cloud")
|
|
59
60
|
console.print(" [green]auth[/green] Authenticate for managed keys")
|
|
60
61
|
console.print(" [green]status[/green] Check account balance")
|
|
@@ -152,6 +153,16 @@ def copy(
|
|
|
152
153
|
handle_copy(names=names or [], list_all=list_all, path=path, force=force)
|
|
153
154
|
|
|
154
155
|
|
|
156
|
+
@app.command()
def eval(
    name: Optional[str] = typer.Argument(None, help="Specific eval name"),
    agent: Optional[str] = typer.Option(None, "--agent", "-a", help="Agent file (overrides YAML)"),
):
    """Run evals and show results."""
    # NOTE(review): the function name intentionally shadows the builtin
    # `eval` — typer derives the CLI subcommand name from it.  The lazy
    # import keeps eval machinery out of ordinary CLI startup.
    from .commands.eval_commands import handle_eval
    handle_eval(name=name, agent_file=agent)
|
|
164
|
+
|
|
165
|
+
|
|
155
166
|
def cli():
|
|
156
167
|
"""Entry point."""
|
|
157
168
|
app()
|
connectonion/console.py
CHANGED
|
@@ -110,7 +110,7 @@ class Console:
|
|
|
110
110
|
● ─────────────────────
|
|
111
111
|
connectonion v0.5.1
|
|
112
112
|
o4-mini · 3 tools
|
|
113
|
-
.co/logs/ · .co/
|
|
113
|
+
.co/logs/ · .co/evals/
|
|
114
114
|
|
|
115
115
|
Args:
|
|
116
116
|
agent_name: Name of the agent
|
|
@@ -156,7 +156,7 @@ class Console:
|
|
|
156
156
|
|
|
157
157
|
# Add log paths if logging is enabled
|
|
158
158
|
if log_dir:
|
|
159
|
-
lines.append(f" [{DIM_COLOR}]{log_dir}logs/ · {log_dir}
|
|
159
|
+
lines.append(f" [{DIM_COLOR}]{log_dir}logs/ · {log_dir}evals/[/{DIM_COLOR}]")
|
|
160
160
|
|
|
161
161
|
# Add Aaron's message for free tier users
|
|
162
162
|
if aaron_message:
|
|
@@ -182,7 +182,7 @@ class Console:
|
|
|
182
182
|
if meta_line:
|
|
183
183
|
plain_lines.append(f" {meta_line}")
|
|
184
184
|
if log_dir:
|
|
185
|
-
plain_lines.append(f" {log_dir}logs/ · {log_dir}
|
|
185
|
+
plain_lines.append(f" {log_dir}logs/ · {log_dir}evals/")
|
|
186
186
|
if aaron_message:
|
|
187
187
|
plain_lines.append(f" {aaron_message}")
|
|
188
188
|
plain_lines.append(f" {separator}")
|
|
@@ -484,12 +484,12 @@ class Console:
|
|
|
484
484
|
|
|
485
485
|
[co] ═══════════════════════════════════════
|
|
486
486
|
[co] ✓ done · 2.3k tokens · $0.005 · 3.4s
|
|
487
|
-
[co] saved → .co/
|
|
487
|
+
[co] saved → .co/evals/research-assistant.yaml
|
|
488
488
|
|
|
489
489
|
Args:
|
|
490
490
|
duration_s: Total duration in seconds
|
|
491
491
|
session: Agent's current_session dict (contains trace with usage)
|
|
492
|
-
session_path: Optional path to
|
|
492
|
+
session_path: Optional path to eval file
|
|
493
493
|
"""
|
|
494
494
|
# Calculate totals from trace
|
|
495
495
|
trace = session.get('trace', [])
|