connectonion 0.6.0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- connectonion/__init__.py +3 -2
- connectonion/cli/browser_agent/browser.py +488 -145
- connectonion/cli/browser_agent/scroll_strategies.py +276 -0
- connectonion/cli/commands/eval_commands.py +286 -0
- connectonion/cli/main.py +11 -0
- connectonion/console.py +5 -5
- connectonion/core/agent.py +13 -10
- connectonion/core/llm.py +9 -19
- connectonion/logger.py +305 -135
- connectonion/network/__init__.py +3 -0
- connectonion/network/asgi.py +122 -2
- connectonion/network/connection.py +123 -0
- connectonion/network/host.py +7 -5
- connectonion/useful_plugins/__init__.py +4 -3
- connectonion/useful_plugins/ui_stream.py +164 -0
- {connectonion-0.6.0.dist-info → connectonion-0.6.1.dist-info}/METADATA +1 -1
- {connectonion-0.6.0.dist-info → connectonion-0.6.1.dist-info}/RECORD +20 -16
- /connectonion/{static → network/static}/docs.html +0 -0
- {connectonion-0.6.0.dist-info → connectonion-0.6.1.dist-info}/WHEEL +0 -0
- {connectonion-0.6.0.dist-info → connectonion-0.6.1.dist-info}/entry_points.txt +0 -0
connectonion/cli/browser_agent/scroll_strategies.py
ADDED
@@ -0,0 +1,276 @@
"""
Purpose: Universal scrolling strategies with AI-powered selection and screenshot-based verification
LLM-Note:
  Dependencies: imports from [typing, pydantic, connectonion.llm_do, PIL.Image, os, time] | imported by [web_automation.py] | tested by [tests/test_final_scroll.py]
  Data flow: receives page: Page, take_screenshot: Callable, times: int, description: str from web_automation.scroll() → scroll_with_verification() orchestrates 3 strategies → ai_scroll_strategy() calls llm_do(HTML+scrollable_elements→ScrollStrategy, gpt-4o) → element_scroll_strategy()/page_scroll_strategy() fallbacks → page.evaluate(javascript) executes scroll → screenshots_are_different() compares PIL Images with 1% pixel threshold → returns success/failure string
  State/Effects: calls page.evaluate() multiple times (mutates DOM scroll positions) | take_screenshot() writes PNG files to screenshots/*.png | time.sleep(1-1.2) between scroll iterations | AI calls to gpt-4o with temperature=0.1 for strategy generation
  Integration: exposes scroll_with_verification() as main entry point from WebAutomation.scroll() | exposes scroll_page(), scroll_element() as standalone utilities | ScrollStrategy Pydantic model defines AI output schema (javascript: str, explanation: str) | screenshots_are_different() uses PIL for pixel-level comparison
  Performance: ai_scroll_strategy() calls llm_do() once per scroll session (100-500ms) | analyzes first 5000 chars of HTML | finds up to 3 scrollable elements | executes JS times iterations with 1.2s delays | element/page strategies are synchronous JS execution (fast) | PIL screenshot comparison ~50-100ms
  Errors: returns descriptive strings (not exceptions) - "All scroll strategies failed", "Browser not open" | screenshot comparison failure returns True (assumes different) to continue | page.evaluate() exceptions caught and next strategy tried | prints debug output to stdout
  ⚠️ Strategy order: AI-first may be slower but more accurate for complex sites (Gmail) - reorder if speed critical
  ⚠️ Screenshot verification: 1% threshold may need tuning for high-resolution displays or subtle animations
"""

from typing import Callable, List, Tuple
from pydantic import BaseModel
from connectonion import llm_do


class ScrollStrategy(BaseModel):
    """AI-generated scroll strategy."""
    javascript: str
    explanation: str


def scroll_with_verification(
    page,
    take_screenshot: Callable,
    times: int = 5,
    description: str = "the main content area"
) -> str:
    """Universal scroll with automatic strategy selection and fallback.

    Tries multiple strategies in order until one works:
    1. AI-generated strategy (default)
    2. Element scrolling
    3. Page scrolling

    Args:
        page: Playwright page object
        take_screenshot: Function to take screenshots
        times: Number of scroll iterations
        description: What to scroll (natural language)

    Returns:
        Status message with successful strategy
    """
    if not page:
        return "Browser not open"

    print(f"\n📜 Starting universal scroll for: '{description}'")

    import time
    timestamp = int(time.time())
    before_file = f"scroll_before_{timestamp}.png"
    after_file = f"scroll_after_{timestamp}.png"

    # Take before screenshot
    take_screenshot(before_file)

    strategies = [
        ("AI-generated strategy", lambda: ai_scroll_strategy(page, times, description)),
        ("Element scrolling", lambda: element_scroll_strategy(page, times)),
        ("Page scrolling", lambda: page_scroll_strategy(page, times))
    ]

    for strategy_name, strategy_func in strategies:
        print(f"\n Trying: {strategy_name}...")

        try:
            strategy_func()
            time.sleep(1)

            # Take after screenshot
            take_screenshot(after_file)

            # Verify scroll worked
            if screenshots_are_different(before_file, after_file):
                print(f" ✅ {strategy_name} WORKED! Content changed.")
                return f"Scroll successful using {strategy_name}. Check {before_file} vs {after_file}"
            else:
                print(f" ⚠️ {strategy_name} didn't change content. Trying next...")
                before_file = after_file
                after_file = f"scroll_after_{timestamp}_next.png"

        except Exception as e:
            print(f" ❌ {strategy_name} failed: {e}")
            continue

    return "All scroll strategies failed. No visible content change."


def screenshots_are_different(file1: str, file2: str) -> bool:
    """Compare screenshots to verify content changed.

    Args:
        file1: First screenshot filename
        file2: Second screenshot filename

    Returns:
        True if screenshots are different
    """
    try:
        from PIL import Image
        import os

        path1 = os.path.join("screenshots", file1)
        path2 = os.path.join("screenshots", file2)

        img1 = Image.open(path1).convert('RGB')
        img2 = Image.open(path2).convert('RGB')

        # Calculate pixel difference
        diff = sum(
            abs(a - b)
            for pixel1, pixel2 in zip(img1.getdata(), img2.getdata())
            for a, b in zip(pixel1, pixel2)
        )

        # 1% threshold
        threshold = img1.size[0] * img1.size[1] * 3 * 0.01

        is_different = diff > threshold
        print(f" Screenshot diff: {diff:.0f} (threshold: {threshold:.0f}) - {'DIFFERENT' if is_different else 'SAME'}")

        return is_different

    except Exception as e:
        print(f" Warning: Screenshot comparison failed: {e}")
        return True  # Assume different if comparison fails


def ai_scroll_strategy(page, times: int, description: str):
    """AI-generated scroll strategy.

    Analyzes page structure and generates custom JavaScript.
    """
    # Find scrollable elements
    scrollable_elements = page.evaluate("""
        (() => {
            const scrollable = [];
            document.querySelectorAll('*').forEach(el => {
                const style = window.getComputedStyle(el);
                if ((style.overflow === 'auto' || style.overflowY === 'scroll') &&
                    el.scrollHeight > el.clientHeight) {
                    scrollable.push({
                        tag: el.tagName,
                        classes: el.className,
                        id: el.id
                    });
                }
            });
            return scrollable;
        })()
    """)

    # Get simplified HTML
    simplified_html = page.evaluate("""
        (() => {
            const clone = document.body.cloneNode(true);
            clone.querySelectorAll('script, style, img, svg').forEach(el => el.remove());
            return clone.innerHTML.substring(0, 5000);
        })()
    """)

    # Generate scroll strategy using AI
    strategy = llm_do(
        f"""Generate JavaScript to scroll "{description}".

Scrollable elements: {scrollable_elements[:3]}
HTML structure: {simplified_html}

Return IIFE that scrolls the correct element:
(() => {{
    const el = document.querySelector('.selector');
    if (el) el.scrollTop += 1000;
    return {{success: true}};
}})()
""",
        output=ScrollStrategy,
        model="gpt-4o",
        temperature=0.1
    )

    print(f" AI generated: {strategy.explanation}")

    # Execute scroll
    import time
    for i in range(times):
        page.evaluate(strategy.javascript)
        time.sleep(1.2)


def element_scroll_strategy(page, times: int):
    """Scroll first scrollable element found."""
    import time
    for i in range(times):
        page.evaluate("""
            (() => {
                const el = Array.from(document.querySelectorAll('*')).find(e => {
                    const s = window.getComputedStyle(e);
                    return (s.overflow === 'auto' || s.overflowY === 'scroll') &&
                        e.scrollHeight > e.clientHeight;
                });
                if (el) el.scrollTop += 1000;
            })()
        """)
        time.sleep(1)


def page_scroll_strategy(page, times: int):
    """Scroll the page window."""
    import time
    for i in range(times):
        page.evaluate("window.scrollBy(0, 1000)")
        time.sleep(1)


# Additional scroll helpers that can be called directly
def scroll_page(page, direction: str = "down", amount: int = 1000) -> str:
    """Scroll the page in a specific direction.

    Args:
        page: Playwright page object
        direction: "down", "up", "top", or "bottom"
        amount: Pixels to scroll

    Returns:
        Status message
    """
    if not page:
        return "Browser not open"

    if direction == "bottom":
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        return "Scrolled to bottom of page"
    elif direction == "top":
        page.evaluate("window.scrollTo(0, 0)")
        return "Scrolled to top of page"
    elif direction == "down":
        page.evaluate(f"window.scrollBy(0, {amount})")
        return f"Scrolled down {amount} pixels"
    elif direction == "up":
        page.evaluate(f"window.scrollBy(0, -{amount})")
        return f"Scrolled up {amount} pixels"
    else:
        return f"Unknown direction: {direction}"


def scroll_element(page, selector: str, amount: int = 1000) -> str:
    """Scroll a specific element by CSS selector.

    Args:
        page: Playwright page object
        selector: CSS selector for the element
        amount: Pixels to scroll

    Returns:
        Status message
    """
    if not page:
        return "Browser not open"

    result = page.evaluate(f"""
        (() => {{
            const element = document.querySelector('{selector}');
            if (!element) return 'Element not found: {selector}';

            const beforeScroll = element.scrollTop;
            element.scrollTop += {amount};
            const afterScroll = element.scrollTop;

            return `Scrolled from ${{beforeScroll}}px to ${{afterScroll}}px (delta: ${{afterScroll - beforeScroll}}px)`;
        }})()
    """)

    return result
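scroll_with_verification() is normally driven by WebAutomation.scroll(), but it can also be exercised directly against a Playwright page. A minimal harness sketch, assuming the synchronous Playwright API and a screenshots/ working directory; the URL, callback, and harness itself are illustrative and not part of the package (the AI strategy also needs an LLM key, otherwise only the fallbacks run):

# Hypothetical harness: wires the documented signature, not shipped with the wheel.
from pathlib import Path
from playwright.sync_api import sync_playwright

from connectonion.cli.browser_agent.scroll_strategies import scroll_with_verification


def main() -> None:
    # screenshots_are_different() reads PNGs from screenshots/, so create it up front.
    Path("screenshots").mkdir(exist_ok=True)

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto("https://news.ycombinator.com")  # placeholder URL

        # Callback matching the take_screenshot contract: receives a filename,
        # writes the PNG under screenshots/ so verification can find it.
        def take_screenshot(filename: str) -> None:
            page.screenshot(path=f"screenshots/{filename}")

        status = scroll_with_verification(
            page,
            take_screenshot,
            times=3,
            description="the main story list",
        )
        print(status)
        browser.close()


if __name__ == "__main__":
    main()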
connectonion/cli/commands/eval_commands.py
ADDED
@@ -0,0 +1,286 @@
"""
Purpose: CLI command for running and managing evals
LLM-Note:
  Dependencies: imports from [pathlib, yaml, json, rich, importlib] | imported by [cli/main.py]
  Data flow: handle_eval() → reads .co/evals/*.yaml → imports agent → runs with stored input → compares expected vs output
  Integration: exposes handle_eval(name, run) for CLI

Eval YAML format:
- `turns`: List of inputs to send to agent sequentially (like a conversation).
  Each turn can have one input. Turns run in order within same agent session,
  simulating multi-round conversations. Use single turn for simple evals,
  or multiple turns to test conversation flow.
"""

import importlib.util
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional

import yaml
from pydantic import BaseModel
from rich.console import Console
from rich.table import Table

console = Console()


class JudgeResult(BaseModel):
    """Result from LLM judge evaluation."""
    passed: bool
    analysis: str


def get_agent_from_file(file_path: str, cwd: str):
    """Import agent instance from file."""
    from connectonion import Agent

    if not os.path.isabs(file_path):
        file_path = os.path.join(cwd, file_path)

    if cwd not in sys.path:
        sys.path.insert(0, cwd)

    spec = importlib.util.spec_from_file_location("agent_module", file_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    if hasattr(module, 'agent') and isinstance(module.agent, Agent):
        agent = module.agent
        agent.logger.enable_sessions = False  # Prevent duplicate eval files
        return agent

    raise ValueError(
        f"No 'agent' instance found in {file_path}.\n\n"
        f"Structure your file like this:\n\n"
        f" agent = Agent(...)\n\n"
        f" if __name__ == '__main__':\n"
        f" agent.input('...')\n"
    )


def handle_eval(name: Optional[str] = None, agent_file: Optional[str] = None):
    """Run evals and show results.

    Args:
        name: Optional specific eval name to run
        agent_file: Optional agent file path (overrides YAML setting)
    """
    evals_dir = Path(".co/evals")

    if not evals_dir.exists():
        console.print("[yellow]No evals found.[/yellow]")
        console.print("[dim]Create eval files in .co/evals/*.yaml[/dim]")
        return

    if name:
        eval_files = list(evals_dir.glob(f"{name}.yaml"))
        if not eval_files:
            console.print(f"[red]Eval not found: {name}[/red]")
            return
    else:
        eval_files = list(evals_dir.glob("*.yaml"))

    if not eval_files:
        console.print("[yellow]No eval files found in .co/evals/[/yellow]")
        return

    _run_evals(eval_files, agent_file)

    # Reload and show status
    if name:
        eval_files = list(evals_dir.glob(f"{name}.yaml"))
    else:
        eval_files = list(evals_dir.glob("*.yaml"))

    _show_eval_status(eval_files)


def _run_evals(eval_files: list, agent_override: Optional[str] = None):
    """Run agents for each eval and capture output."""
    cwd = os.getcwd()
    agents_cache = {}  # Cache agents by file path

    for eval_file in eval_files:
        with open(eval_file) as f:
            data = yaml.safe_load(f)

        # Get agent file: CLI override > YAML > error
        agent_file = agent_override or data.get('agent')
        if not agent_file:
            console.print(f"[red]No agent specified for {eval_file.stem}[/red]")
            console.print(f"[dim]Add 'agent: agent.py' to the YAML or use --agent flag[/dim]")
            continue

        # Load agent (cached)
        if agent_file not in agents_cache:
            console.print(f"[cyan]Loading:[/cyan] {agent_file}")
            agents_cache[agent_file] = get_agent_from_file(agent_file, cwd)
        agent = agents_cache[agent_file]

        turns = data.get('turns', [])
        if not turns:
            console.print(f"[yellow]No turns found in {eval_file.stem}[/yellow]")
            continue

        console.print(f"[cyan]Running:[/cyan] {eval_file.stem}")

        # Reset agent session for fresh state each eval
        agent.reset_conversation()

        file_modified = False
        for turn in turns:
            input_text = turn.get('input', '')
            if not input_text:
                continue

            # Show input (truncated)
            display_input = input_text[:60] + "..." if len(input_text) > 60 else input_text
            console.print(f" [dim]input:[/dim] {display_input}")

            # Run agent and capture result
            result = agent.input(input_text)

            # Extract tools_called and metrics from agent session
            trace = agent.current_session.get('trace', [])
            tool_calls = [t for t in trace if t.get('type') == 'tool_execution']
            llm_calls = [t for t in trace if t.get('type') == 'llm_call']
            tools_called = [agent.logger._format_tool_call(t) for t in tool_calls]

            total_tokens = sum(
                (t.get('usage').input_tokens + t.get('usage').output_tokens)
                for t in llm_calls if t.get('usage')
            )
            total_cost = sum(
                t.get('usage').cost for t in llm_calls if t.get('usage')
            )

            # Build history as JSON array string (compact, easy to scan)
            history_str = turn.get('history', '[]')
            history = json.loads(history_str) if isinstance(history_str, str) else []
            if turn.get('output'):
                history.insert(0, {
                    "ts": turn.get('ts', ''),
                    "pass": turn.get('pass'),
                    "tokens": turn.get('tokens', 0),
                    "cost": turn.get('cost', 0)
                })

            # Store result in turn
            turn['output'] = result
            turn['tools_called'] = tools_called
            turn['tokens'] = total_tokens
            turn['cost'] = round(total_cost, 4)
            turn['ts'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            turn['run'] = data.get('runs', 0) + 1
            # Format history as multi-line JSON for readability
            if history:
                lines = [json.dumps(h) for h in history]
                turn['history'] = "[\n" + ",\n".join(lines) + "]"
            else:
                turn['history'] = "[]"
            file_modified = True

            # Judge immediately if expected exists
            expected = turn.get('expected', '')
            if expected:
                judge = _judge_with_llm(expected, result, input_text)
                turn['pass'] = judge.passed
                turn['analysis'] = judge.analysis
                status = "[green]✓[/green]" if judge.passed else "[red]✗[/red]"
                console.print(f" {status} {judge.analysis[:60]}...")
            else:
                # Show output (truncated)
                display_output = result[:60] + "..." if len(result) > 60 else result
                console.print(f" [green]output:[/green] {display_output}")

        if file_modified:
            # Update runs count and save
            data['runs'] = data.get('runs', 0) + 1
            data['updated'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            with open(eval_file, 'w') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        console.print(f"[green]✓[/green] {eval_file.stem} completed")
        console.print()

    console.print()


def _judge_with_llm(expected: str, output: str, input_text: str) -> JudgeResult:
    """Use LLM to judge if output matches expected."""
    from connectonion import llm_do

    prompt = f"""You are an eval judge. Determine if the agent's output satisfies the expected criteria.

Input: {input_text}
Expected: {expected}
Output: {output}

Does the output satisfy the expected criteria? Consider:
- Semantic similarity (not exact match)
- Key information presence
- Intent fulfillment
"""
    return llm_do(prompt, output=JudgeResult)


def _show_eval_status(eval_files: list):
    """Show pass/fail status for all evals (uses stored results, no re-judging)."""
    table = Table(title="Eval Results", show_header=True)
    table.add_column("Eval", style="cyan")
    table.add_column("Status", justify="center")
    table.add_column("Expected", max_width=30)
    table.add_column("Output", max_width=30)

    passed = 0
    failed = 0
    no_expected = 0

    for eval_file in sorted(eval_files):
        with open(eval_file) as f:
            data = yaml.safe_load(f)

        for turn in data.get('turns', []):
            expected = turn.get('expected', '')
            output = turn.get('output', '')
            pass_result = turn.get('pass')

            if not expected:
                status = "[dim]—[/dim]"
                no_expected += 1
            elif pass_result is True:
                status = "[green]✓ pass[/green]"
                passed += 1
            elif pass_result is False:
                status = "[red]✗ fail[/red]"
                failed += 1
            else:
                status = "[dim]pending[/dim]"
                no_expected += 1

            # Truncate for display
            expected_display = (expected[:27] + "...") if len(expected) > 30 else expected
            output_display = (output[:27] + "...") if len(output) > 30 else output

            table.add_row(
                eval_file.stem,
                status,
                expected_display or "[dim]not set[/dim]",
                output_display
            )

    console.print(table)
    console.print()

    # Summary
    if failed > 0:
        console.print(f"[bold red]✗ {failed} failed[/bold red], ", end="")
    if passed > 0:
        console.print(f"[bold green]✓ {passed} passed[/bold green], ", end="")
    if no_expected > 0:
        console.print(f"[dim]{no_expected} no expected[/dim]", end="")
    console.print()
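Before a run, the runner above only reads a few keys from each .co/evals/*.yaml file: agent (the file to import) and turns with an input and optional expected per turn; the remaining fields (output, tools_called, tokens, cost, ts, pass, analysis, runs, updated) are written back after the run. A sketch of creating such a file programmatically; the eval name and its contents are hypothetical:

# Illustrative only: writes a minimal eval file with the fields _run_evals() reads.
from pathlib import Path
import yaml

eval_spec = {
    "agent": "agent.py",  # resolved relative to the working directory
    "turns": [
        {
            "input": "What is 2 + 2?",            # sent to agent.input()
            "expected": "Mentions the number 4",   # judged semantically by _judge_with_llm()
        },
        {
            "input": "Now double it.",             # second turn in the same agent session
            "expected": "Mentions 8",
        },
    ],
}

path = Path(".co/evals/arithmetic.yaml")
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(yaml.dump(eval_spec, default_flow_style=False, sort_keys=False))

After a run, each turn gains output, tools_called, tokens, cost, ts, pass, and analysis entries, and the file-level runs counter is incremented.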
connectonion/cli/main.py
CHANGED
@@ -55,6 +55,7 @@ def _show_help():
     console.print(" [green]create[/green] <name> Create new project")
     console.print(" [green]init[/green] Initialize in current directory")
     console.print(" [green]copy[/green] <name> Copy tool/plugin source to project")
+    console.print(" [green]eval[/green] Run evals and show status")
     console.print(" [green]deploy[/green] Deploy to ConnectOnion Cloud")
     console.print(" [green]auth[/green] Authenticate for managed keys")
     console.print(" [green]status[/green] Check account balance")
@@ -152,6 +153,16 @@ def copy(
     handle_copy(names=names or [], list_all=list_all, path=path, force=force)
 
 
+@app.command()
+def eval(
+    name: Optional[str] = typer.Argument(None, help="Specific eval name"),
+    agent: Optional[str] = typer.Option(None, "--agent", "-a", help="Agent file (overrides YAML)"),
+):
+    """Run evals and show results."""
+    from .commands.eval_commands import handle_eval
+    handle_eval(name=name, agent_file=agent)
+
+
 def cli():
     """Entry point."""
     app()
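For a quick smoke test of the new subcommand without installing the console script, the Typer app in connectonion.cli.main can be driven with Typer's test runner. A sketch assuming the app object keeps the name `app` used by the @app.command() decorator above and that a .co/evals/ directory exists in the current working directory:

# Hypothetical smoke test for the new `eval` command.
from typer.testing import CliRunner
from connectonion.cli.main import app

runner = CliRunner()

# Equivalent to `<cli> eval` — runs every .co/evals/*.yaml
result = runner.invoke(app, ["eval"])
print(result.output)

# Equivalent to `<cli> eval arithmetic --agent agent.py` — one eval, agent override
result = runner.invoke(app, ["eval", "arithmetic", "--agent", "agent.py"])
print(result.output)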
connectonion/console.py
CHANGED
@@ -110,7 +110,7 @@ class Console:
     ● ─────────────────────
       connectonion v0.5.1
       o4-mini · 3 tools
-      .co/logs/ · .co/
+      .co/logs/ · .co/evals/
 
     Args:
         agent_name: Name of the agent
@@ -156,7 +156,7 @@ class Console:
 
         # Add log paths if logging is enabled
         if log_dir:
-            lines.append(f" [{DIM_COLOR}]{log_dir}logs/ · {log_dir}
+            lines.append(f" [{DIM_COLOR}]{log_dir}logs/ · {log_dir}evals/[/{DIM_COLOR}]")
 
         # Add Aaron's message for free tier users
         if aaron_message:
@@ -182,7 +182,7 @@ class Console:
         if meta_line:
             plain_lines.append(f" {meta_line}")
         if log_dir:
-            plain_lines.append(f" {log_dir}logs/ · {log_dir}
+            plain_lines.append(f" {log_dir}logs/ · {log_dir}evals/")
         if aaron_message:
             plain_lines.append(f" {aaron_message}")
         plain_lines.append(f" {separator}")
@@ -484,12 +484,12 @@ class Console:
 
     [co] ═══════════════════════════════════════
     [co] ✓ done · 2.3k tokens · $0.005 · 3.4s
-    [co] saved → .co/
+    [co] saved → .co/evals/research-assistant.yaml
 
     Args:
         duration_s: Total duration in seconds
         session: Agent's current_session dict (contains trace with usage)
-        session_path: Optional path to
+        session_path: Optional path to eval file
     """
     # Calculate totals from trace
     trace = session.get('trace', [])
connectonion/core/agent.py
CHANGED
@@ -2,9 +2,9 @@
 Purpose: Orchestrate AI agent execution with LLM calls, tool execution, and automatic logging
 LLM-Note:
   Dependencies: imports from [llm.py, tool_factory.py, prompts.py, decorators.py, logger.py, tool_executor.py, tool_registry.py] | imported by [__init__.py, debug_agent/__init__.py] | tested by [tests/test_agent.py, tests/test_agent_prompts.py, tests/test_agent_workflows.py]
-  Data flow: receives user prompt: str from Agent.input() → creates/extends current_session with messages → calls llm.complete() with tool schemas → receives LLMResponse with tool_calls → executes tools via tool_executor.execute_and_record_tools() → appends tool results to messages → repeats loop until no tool_calls or max_iterations → logger logs to .co/logs/{name}.log and .co/
-  State/Effects: modifies self.current_session['messages', 'trace', 'turn', 'iteration'] | writes to .co/logs/{name}.log and .co/
-  Integration: exposes Agent(name, tools, system_prompt, model, log, quiet), .input(prompt), .execute_tool(name, args), .add_tool(func), .remove_tool(name), .list_tools(), .reset_conversation() | tools stored in ToolRegistry with attribute access (agent.tools.tool_name) and instance storage (agent.tools.gmail) | tool execution delegates to tool_executor module | log defaults to .co/logs/ (None), can be True (current dir), False (disabled), or custom path | quiet=True suppresses console but keeps
+  Data flow: receives user prompt: str from Agent.input() → creates/extends current_session with messages → calls llm.complete() with tool schemas → receives LLMResponse with tool_calls → executes tools via tool_executor.execute_and_record_tools() → appends tool results to messages → repeats loop until no tool_calls or max_iterations → logger logs to .co/logs/{name}.log and .co/evals/{name}.yaml → returns final response: str
+  State/Effects: modifies self.current_session['messages', 'trace', 'turn', 'iteration'] | writes to .co/logs/{name}.log and .co/evals/ via logger.py
+  Integration: exposes Agent(name, tools, system_prompt, model, log, quiet), .input(prompt), .execute_tool(name, args), .add_tool(func), .remove_tool(name), .list_tools(), .reset_conversation() | tools stored in ToolRegistry with attribute access (agent.tools.tool_name) and instance storage (agent.tools.gmail) | tool execution delegates to tool_executor module | log defaults to .co/logs/ (None), can be True (current dir), False (disabled), or custom path | quiet=True suppresses console but keeps eval logging | trust enforcement moved to host() for network access control
   Performance: max_iterations=10 default (configurable per-input) | session state persists across turns for multi-turn conversations | ToolRegistry provides O(1) tool lookup via .get() or attribute access
   Errors: LLM errors bubble up | tool execution errors captured in trace and returned to LLM for retry
 """
@@ -51,11 +51,14 @@ class Agent:
         # Current session context (runtime only)
         self.current_session = None
 
+        # Connection to client (None locally, injected by host() for WebSocket)
+        self.connection = None
+
         # Token usage tracking
         self.total_cost: float = 0.0  # Cumulative cost in USD
         self.last_usage: Optional[TokenUsage] = None  # From most recent LLM call
 
-        # Initialize logger (unified: terminal + file + YAML
+        # Initialize logger (unified: terminal + file + YAML evals)
         # Environment variable override (highest priority)
         effective_log = log
         if os.getenv('CONNECTONION_LOG'):
@@ -250,16 +253,16 @@ class Agent:
 
         self.current_session['result'] = result
 
-        # Print completion summary
-        if self.logger.console:
-            session_path = f".co/sessions/{self.name}.yaml" if self.logger.enable_sessions else None
-            self.logger.console.print_completion(duration, self.current_session, session_path)
-
         self._invoke_events('on_complete')
 
-        # Log turn to YAML
+        # Log turn to YAML eval (after on_complete so handlers can modify state)
         self.logger.log_turn(prompt, result, duration * 1000, self.current_session, self.llm.model)
 
+        # Print completion summary (after log_turn so we have the eval path)
+        if self.logger.console:
+            eval_path = self.logger.get_eval_path()
+            self.logger.console.print_completion(duration, self.current_session, eval_path)
+
         return result
 
     def reset_conversation(self):