connectonion 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- connectonion/__init__.py +3 -2
- connectonion/cli/browser_agent/browser.py +433 -147
- connectonion/cli/browser_agent/element_finder.py +139 -0
- connectonion/cli/browser_agent/highlight_screenshot.py +174 -0
- connectonion/cli/browser_agent/prompt.md +188 -105
- connectonion/cli/browser_agent/prompts/element_matcher.md +59 -0
- connectonion/cli/browser_agent/prompts/form_filler.md +19 -0
- connectonion/cli/browser_agent/prompts/scroll_strategy.md +36 -0
- connectonion/cli/browser_agent/scripts/extract_elements.js +126 -0
- connectonion/cli/browser_agent/scroll.py +137 -0
- connectonion/cli/commands/eval_commands.py +286 -0
- connectonion/cli/main.py +11 -0
- connectonion/console.py +5 -5
- connectonion/core/agent.py +13 -10
- connectonion/core/llm.py +9 -19
- connectonion/logger.py +305 -135
- connectonion/network/__init__.py +3 -0
- connectonion/network/asgi.py +122 -2
- connectonion/network/connection.py +123 -0
- connectonion/network/host.py +7 -5
- connectonion/useful_plugins/__init__.py +4 -3
- connectonion/useful_plugins/ui_stream.py +164 -0
- {connectonion-0.6.0.dist-info → connectonion-0.6.2.dist-info}/METADATA +1 -1
- {connectonion-0.6.0.dist-info → connectonion-0.6.2.dist-info}/RECORD +27 -17
- /connectonion/{static → network/static}/docs.html +0 -0
- {connectonion-0.6.0.dist-info → connectonion-0.6.2.dist-info}/WHEEL +0 -0
- {connectonion-0.6.0.dist-info → connectonion-0.6.2.dist-info}/entry_points.txt +0 -0
connectonion/core/agent.py
CHANGED
@@ -2,9 +2,9 @@
 Purpose: Orchestrate AI agent execution with LLM calls, tool execution, and automatic logging
 LLM-Note:
 Dependencies: imports from [llm.py, tool_factory.py, prompts.py, decorators.py, logger.py, tool_executor.py, tool_registry.py] | imported by [__init__.py, debug_agent/__init__.py] | tested by [tests/test_agent.py, tests/test_agent_prompts.py, tests/test_agent_workflows.py]
-Data flow: receives user prompt: str from Agent.input() → creates/extends current_session with messages → calls llm.complete() with tool schemas → receives LLMResponse with tool_calls → executes tools via tool_executor.execute_and_record_tools() → appends tool results to messages → repeats loop until no tool_calls or max_iterations → logger logs to .co/logs/{name}.log and .co/
-State/Effects: modifies self.current_session['messages', 'trace', 'turn', 'iteration'] | writes to .co/logs/{name}.log and .co/
-Integration: exposes Agent(name, tools, system_prompt, model, log, quiet), .input(prompt), .execute_tool(name, args), .add_tool(func), .remove_tool(name), .list_tools(), .reset_conversation() | tools stored in ToolRegistry with attribute access (agent.tools.tool_name) and instance storage (agent.tools.gmail) | tool execution delegates to tool_executor module | log defaults to .co/logs/ (None), can be True (current dir), False (disabled), or custom path | quiet=True suppresses console but keeps
+Data flow: receives user prompt: str from Agent.input() → creates/extends current_session with messages → calls llm.complete() with tool schemas → receives LLMResponse with tool_calls → executes tools via tool_executor.execute_and_record_tools() → appends tool results to messages → repeats loop until no tool_calls or max_iterations → logger logs to .co/logs/{name}.log and .co/evals/{name}.yaml → returns final response: str
+State/Effects: modifies self.current_session['messages', 'trace', 'turn', 'iteration'] | writes to .co/logs/{name}.log and .co/evals/ via logger.py
+Integration: exposes Agent(name, tools, system_prompt, model, log, quiet), .input(prompt), .execute_tool(name, args), .add_tool(func), .remove_tool(name), .list_tools(), .reset_conversation() | tools stored in ToolRegistry with attribute access (agent.tools.tool_name) and instance storage (agent.tools.gmail) | tool execution delegates to tool_executor module | log defaults to .co/logs/ (None), can be True (current dir), False (disabled), or custom path | quiet=True suppresses console but keeps eval logging | trust enforcement moved to host() for network access control
 Performance: max_iterations=10 default (configurable per-input) | session state persists across turns for multi-turn conversations | ToolRegistry provides O(1) tool lookup via .get() or attribute access
 Errors: LLM errors bubble up | tool execution errors captured in trace and returned to LLM for retry
 """
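The Integration line above pins down the constructor's log and quiet semantics. A minimal sketch of those options at the call site (agent name and tool are illustrative, not from this diff):

    agent = Agent("support", tools=[search])              # log=None → .co/logs/support.log + .co/evals/
    agent = Agent("support", tools=[search], log=True)    # log files in the current directory
    agent = Agent("support", tools=[search], log=False)   # benchmark mode: no file output at all
    agent = Agent("support", tools=[search], quiet=True)  # console suppressed, eval logging kept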
@@ -51,11 +51,14 @@ class Agent:
         # Current session context (runtime only)
         self.current_session = None
 
+        # Connection to client (None locally, injected by host() for WebSocket)
+        self.connection = None
+
         # Token usage tracking
         self.total_cost: float = 0.0  # Cumulative cost in USD
         self.last_usage: Optional[TokenUsage] = None  # From most recent LLM call
 
-        # Initialize logger (unified: terminal + file + YAML
+        # Initialize logger (unified: terminal + file + YAML evals)
         # Environment variable override (highest priority)
         effective_log = log
         if os.getenv('CONNECTONION_LOG'):
@@ -250,16 +253,16 @@ class Agent:
 
         self.current_session['result'] = result
 
-        # Print completion summary
-        if self.logger.console:
-            session_path = f".co/sessions/{self.name}.yaml" if self.logger.enable_sessions else None
-            self.logger.console.print_completion(duration, self.current_session, session_path)
-
         self._invoke_events('on_complete')
 
-        # Log turn to YAML
+        # Log turn to YAML eval (after on_complete so handlers can modify state)
         self.logger.log_turn(prompt, result, duration * 1000, self.current_session, self.llm.model)
 
+        # Print completion summary (after log_turn so we have the eval path)
+        if self.logger.console:
+            eval_path = self.logger.get_eval_path()
+            self.logger.console.print_completion(duration, self.current_session, eval_path)
+
         return result
 
     def reset_conversation(self):
connectonion/core/llm.py
CHANGED
@@ -734,28 +734,18 @@ class OpenOnionLLM(LLM):
         )
 
     def structured_complete(self, messages: List[Dict], output_schema: Type[BaseModel], **kwargs) -> BaseModel:
-        """Get structured Pydantic output using OpenAI-compatible API.
-
+        """Get structured Pydantic output using OpenAI-compatible chat completions API.
+
+        Uses beta.chat.completions.parse() which routes through /v1/chat/completions,
+        allowing proper provider routing for Gemini, OpenAI, and other models.
+        """
+        completion = self.client.beta.chat.completions.parse(
             model=self.model,
-
-
+            messages=messages,
+            response_format=output_schema,
             **kwargs
         )
-
-        # Handle edge cases
-        if response.status == "incomplete":
-            if response.incomplete_details.reason == "max_output_tokens":
-                raise ValueError("Response incomplete: maximum output tokens reached")
-            elif response.incomplete_details.reason == "content_filter":
-                raise ValueError("Response incomplete: content filtered")
-
-        # Check for refusal
-        if response.output and len(response.output) > 0:
-            first_content = response.output[0].content[0] if response.output[0].content else None
-            if first_content and hasattr(first_content, 'type') and first_content.type == "refusal":
-                raise ValueError(f"Model refused to respond: {first_content.refusal}")
-
-        return response.output_parsed
+        return completion.choices[0].message.parsed
 
 
 def create_llm(model: str, api_key: Optional[str] = None, **kwargs) -> LLM:
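The removed branch checked Responses-API-style fields (status, incomplete_details, output_parsed); the replacement leans on the chat-completions parse helper instead. A minimal sketch of that pattern against the OpenAI SDK directly, with an illustrative model name and schema:

    from openai import OpenAI
    from pydantic import BaseModel

    class Answer(BaseModel):
        value: int
        explanation: str

    client = OpenAI()
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",          # any chat-completions model
        messages=[{"role": "user", "content": "What is 25 x 4?"}],
        response_format=Answer,       # Pydantic schema, like output_schema above
    )
    answer = completion.choices[0].message.parsed  # Answer instance (None on refusal)

Note that the refusal and truncation handling the old code did by hand is now left to the SDK: parsed is None when the model refuses, so callers that need a hard failure should check for that.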
connectonion/logger.py
CHANGED
@@ -1,52 +1,74 @@
 """
-Purpose: Unified logging interface for agents - terminal output + plain text + YAML
+Purpose: Unified logging interface for agents - terminal output + plain text + YAML evals
 LLM-Note:
-Dependencies: imports from [datetime, pathlib, typing, yaml, console.py] | imported by [agent.py, tool_executor.py] | tested by [tests/unit/test_logger.py]
-Data flow: receives from Agent/tool_executor → delegates to Console for terminal/file → writes YAML
-State/Effects: writes to .co/
+Dependencies: imports from [datetime, pathlib, typing, json, re, yaml, os, console.py] | imported by [agent.py, tool_executor.py] | tested by [tests/unit/test_logger.py]
+Data flow: receives from Agent/tool_executor → delegates to Console for terminal/file → writes YAML evals to .co/evals/
+State/Effects: writes to .co/evals/{input_slug}.yaml (one file per unique first input) | run data stored in .co/evals/{input_slug}/run_{n}.yaml | eval data persisted after each turn
 Integration: exposes Logger(agent_name, quiet, log), .print(), .log_tool_call(name, args), .log_tool_result(result, timing), .log_llm_response(), .start_session(), .log_turn()
-
-Performance: YAML written after each turn (incremental) |
+Eval format: eval.yaml (metadata + turns) | run_N.yaml (system_prompt, model, cwd, tokens, cost, duration_ms, timestamp, messages as multi-line JSON)
+Performance: YAML written after each turn (incremental) | Console delegation is direct passthrough
 Errors: let I/O errors bubble up (no try-except)
 """
 
+import json
+import re
 from datetime import datetime
 from pathlib import Path
-from typing import Optional, Union, Dict, Any
+from typing import Optional, Union, Dict, Any, List
+
 import yaml
 
 from .console import Console
 
 
+def _slugify(text: str, max_length: int = 50) -> str:
+    """Convert text to URL-friendly slug for filenames.
+
+    Args:
+        text: Input text to slugify
+        max_length: Maximum length of slug
+
+    Returns:
+        Lowercase slug with words separated by underscores
+    """
+    # Lowercase and replace spaces/special chars with underscores
+    slug = re.sub(r'[^a-zA-Z0-9]+', '_', text.lower())
+    # Remove leading/trailing underscores
+    slug = slug.strip('_')
+    # Truncate to max length at word boundary
+    if len(slug) > max_length:
+        slug = slug[:max_length].rsplit('_', 1)[0]
+    return slug or 'default'
+
+
 class Logger:
-    """Unified logging: terminal output + plain text + YAML
+    """Unified logging: terminal output + plain text + YAML evals.
 
-    Facade pattern: wraps Console for terminal/file logging, adds YAML
+    Facade pattern: wraps Console for terminal/file logging, adds YAML evals.
 
-
-
+    Eval files are named from the first input (slugified). Same input sequence
+    = same file with multiple runs. Each run stored as YAML with messages as JSON.
+    Log = Eval (same format, add expect field for tests).
 
     Args:
-        agent_name: Name of the agent (used in filenames)
+        agent_name: Name of the agent (used in log filenames)
         quiet: Suppress console output (default False)
         log: Enable file logging (default True, or path string for custom location)
 
     Files created:
     - .co/logs/{agent_name}.log: Plain text log with session markers
-    - .co/
+    - .co/evals/{input_slug}.yaml: Structured YAML with turns and history
+    - .co/evals/{input_slug}/run_{n}.yaml: Run metadata + messages as multi-line JSON
 
     Examples:
         # Development (default) - see output + save everything
         logger = Logger("my-agent")
 
-        # Eval mode - quiet but record
+        # Eval mode - quiet but record evals
         logger = Logger("my-agent", quiet=True)
 
         # Benchmark - completely off
         logger = Logger("my-agent", log=False)
-
-        # Custom log path
-        logger = Logger("my-agent", log="custom/path.log")
     """
 
     def __init__(
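Since every eval filename hangs off _slugify, a few illustrative inputs and the slugs the function above would produce:

    _slugify("What is 25 x 4?")   # -> 'what_is_25_x_4'
    _slugify("Hello, World!")     # -> 'hello_world'
    _slugify("!!!")               # -> 'default' (empty slug falls back)

Inputs longer than max_length are cut back to the last underscore inside the first 50 characters, so filenames stay readable rather than ending mid-word.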
@@ -59,7 +81,7 @@ class Logger:
 
         # Determine what to enable
         self.enable_console = not quiet
-        self.enable_sessions = True  #
+        self.enable_sessions = True  # Evals on unless log=False
         self.enable_file = True
         self.log_file_path = Path(f".co/logs/{agent_name}.log")
 
@@ -73,7 +95,7 @@ class Logger:
             self.log_file_path = Path(log)
         # else: log=True or log=None → defaults
 
-        # If quiet=True, also disable file (only keep
+        # If quiet=True, also disable file (only keep evals)
         if quiet:
             self.enable_file = False
 
@@ -83,9 +105,12 @@ class Logger:
         file_path = self.log_file_path if self.enable_file else None
         self.console = Console(log_file=file_path)
 
-        #
-        self.
-        self.
+        # Eval state
+        self.eval_file: Optional[Path] = None
+        self.eval_dir: Optional[Path] = None
+        self.eval_data: Optional[Dict[str, Any]] = None
+        self.current_run: int = 0
+        self._first_input: Optional[str] = None  # Track first input for file naming
 
     # Delegate to Console
     def print(self, message: str, style: str = None):
@@ -129,68 +154,76 @@ class Logger:
             parts.append(f"{k}={v_str}")
         return f"{tool_name}({', '.join(parts)})"
 
-    #
+    # Eval logging (YAML + JSONL) - Log = Eval, same format
     def start_session(self, system_prompt: str = "", session_id: Optional[str] = None):
-        """Initialize session
+        """Initialize eval session state.
 
-
+        Note: The actual file is created lazily in log_turn() when we have
+        the first input to generate the filename from.
+        System prompt is stored in messages JSONL, not in eval YAML.
 
         Args:
-            system_prompt:
-            session_id: Optional session identifier
-                .co/sessions/{session_id}.yaml for thread-safe HTTP API.
-                If None, uses agent name for interactive mode.
+            system_prompt: Unused (kept for backward compatibility)
+            session_id: Optional session identifier (used for HTTP API thread safety)
         """
         if not self.enable_sessions:
             return
 
-
-        self.
-
-        self.
+        self._first_input = None
+        self.eval_file = None
+        self.eval_dir = None
+        self.eval_data = None
+        self.current_run = 0
+
+    def _init_eval_file(self, first_input: str):
+        """Initialize or load eval file based on first input.
+
+        Args:
+            first_input: The first user input (used to name the file)
+        """
+        evals_dir = Path(".co/evals")
+        evals_dir.mkdir(parents=True, exist_ok=True)
+
+        # Generate filename from first input
+        slug = _slugify(first_input)
+        self.eval_file = evals_dir / f"{slug}.yaml"
+        self.eval_dir = evals_dir / slug
+        self._first_input = first_input
+
+        # Load existing or create new
+        if self.eval_file.exists():
+            with open(self.eval_file, 'r') as f:
+                self.eval_data = yaml.safe_load(f) or {}
+
+            # Check if this is the same conversation (same first input)
+            existing_turns = self.eval_data.get('turns', [])
+            if existing_turns and existing_turns[0].get('input') == first_input:
+                # Same conversation - new run
+                self.current_run = self.eval_data.get('runs', 0) + 1
+                self.eval_data['runs'] = self.current_run
+            else:
+                # Different first input but same slug (collision) - treat as new
+                self.current_run = 1
+                self.eval_data = self._create_new_eval_data(first_input)
         else:
-            self.
-
+            self.current_run = 1
+            self.eval_data = self._create_new_eval_data(first_input)
+
+        # Create messages directory
+        self.eval_dir.mkdir(parents=True, exist_ok=True)
+
+    def _create_new_eval_data(self, first_input: str) -> Dict[str, Any]:
+        """Create new eval data structure."""
+        return {
+            "name": _slugify(first_input),
+            "created": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            "runs": 1,
+            "model": "",
+            "turns": []
+        }
 
     def log_turn(self, user_input: str, result: str, duration_ms: float, session: dict, model: str):
-        """Log turn
+        """Log turn to YAML file and messages to JSONL.
 
         Args:
             user_input: The user's input prompt
@@ -199,9 +232,13 @@ class Logger:
             session: Agent's current_session dict (contains messages, trace)
             model: Model name string
         """
-        if not self.enable_sessions
+        if not self.enable_sessions:
            return
 
+        # Initialize file on first turn (lazy initialization)
+        if self.eval_data is None:
+            self._init_eval_file(user_input)
+
         # Aggregate from trace
         trace = session.get('trace', [])
         llm_calls = [t for t in trace if t.get('type') == 'llm_call']
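A worked sequence for the lazy initialization and run counting above (paths follow the code; the input is made up):

    # First process run:
    #   log_turn("What is 25 x 4?", ...) → _init_eval_file("What is 25 x 4?")
    #   → creates .co/evals/what_is_25_x_4.yaml (runs: 1)
    #   → messages for the run go to .co/evals/what_is_25_x_4/run_1.yaml
    # Second process run with the same first input:
    #   turns[0]['input'] matches → current_run = 2, runs: 2, run_2.yaml written
    # Different first input that happens to slugify identically:
    #   treated as a collision → eval data recreated, current_run reset to 1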
@@ -216,85 +253,218 @@ class Logger:
             for t in llm_calls if t.get('usage')
         )
 
-
+        # Build metadata as compact JSON string
+        meta = json.dumps({
+            "tokens": total_tokens,
+            "cost": round(total_cost, 4),
+            "duration_ms": int(duration_ms),
+            "ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        })
+
+        # Build turn data for this run
+        run_data = {
+            'run': self.current_run,
+            'output': result,
             'tools_called': [self._format_tool_call(t) for t in tool_calls],
-            '
-            'evaluation': session.get('evaluation', '')
+            'expected': session.get('expected', ''),
+            'evaluation': session.get('evaluation', ''),
+            'meta': meta
         }
 
-        #
-        self.
-        )
-
+        # Find or create turn entry
+        turn_index = session.get('turn', 1) - 1  # 0-indexed
+        turns = self.eval_data['turns']
+
+        if turn_index < len(turns):
+            # Existing turn - add to history
+            existing_turn = turns[turn_index]
+            if existing_turn.get('input') == user_input:
+                # Same input - this is a new run
+                history = existing_turn.get('history', [])
+                # Move current run to history (metadata only)
+                if existing_turn.get('run'):
+                    history.insert(0, {
+                        'run': existing_turn.get('run', self.current_run - 1),
+                        'status': existing_turn.get('evaluation', ''),
+                        'meta': existing_turn.get('meta', '')
+                    })
+                # Update with new run data
+                existing_turn.update({
+                    'run': run_data['run'],
+                    'output': run_data['output'],
+                    'tools_called': run_data['tools_called'],
+                    'expected': run_data['expected'],
+                    'evaluation': run_data['evaluation'],
+                    'meta': run_data['meta'],
+                    'history': history
+                })
+            else:
+                # Different input at same position - shouldn't happen normally
+                turns.append({
+                    'input': user_input,
+                    **run_data,
+                    'history': []
+                })
+        else:
+            # New turn
+            turns.append({
+                'input': user_input,
+                **run_data,
+                'history': []
+            })
+
+        # Update metadata
+        self.eval_data['updated'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        self.eval_data['model'] = model
+
+        # Write run YAML with messages
+        self._write_run_yaml(
+            messages=session.get('messages', []),
+            model=model,
+            tokens=total_tokens,
+            cost=total_cost,
+            duration_ms=duration_ms
         )
 
-        #
-        turn_data['turn'] = turn_num
-        turn_data['timestamp'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        self.session_data['turns'].append(turn_data)
+        # Write YAML
+        self._write_eval()
 
-        saved_count = sum(len(msgs) for msgs in self.session_data['messages'].values())
-        turn_messages = all_messages[saved_count + 1:]  # +1 to skip system message
-        self.session_data['messages'][turn_num] = turn_messages
+    def _write_run_yaml(self, messages: List[Dict], model: str, tokens: int, cost: float, duration_ms: float):
+        """Write run metadata and messages to YAML file.
 
-
-
+        Args:
+            messages: List of message dicts
+            model: Model name
+            tokens: Total tokens used
+            cost: Total cost
+            duration_ms: Duration in milliseconds
+        """
+        if not self.eval_dir:
+            return
+
+        import os
+        import sys
+
+        # Extract system prompt from messages
+        system_prompt = ""
+        for msg in messages:
+            if msg.get('role') == 'system':
+                system_prompt = msg.get('content', '')
+                break
+
+        # Get agent file path (the script being run)
+        agent_file = sys.argv[0] if sys.argv else ""
+        # Make it relative to cwd if possible
+        cwd = os.getcwd()
+        if agent_file and os.path.isabs(agent_file):
+            try:
+                agent_file = os.path.relpath(agent_file, cwd)
+            except ValueError:
+                pass  # Keep absolute if on different drive (Windows)
+
+        # Format messages as pretty JSON (one message per line)
+        messages_json_lines = []
+        for msg in messages:
+            messages_json_lines.append("  " + json.dumps(msg, ensure_ascii=False))
+        messages_formatted = "[\n" + ",\n".join(messages_json_lines) + "\n]"
+
+        # Build run data
+        run_data = {
+            'agent': agent_file,
+            'system_prompt': system_prompt,
+            'model': model,
+            'cwd': cwd,
+            'tokens': tokens,
+            'cost': round(cost, 4),
+            'duration_ms': int(duration_ms),
+            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            'messages': messages_formatted
+        }
 
-
-
+        # Write YAML with messages as literal block
+        run_file = self.eval_dir / f"run_{self.current_run}.yaml"
+        with open(run_file, 'w', encoding='utf-8') as f:
+            # Write metadata fields normally
+            for key in ['agent', 'system_prompt', 'model', 'cwd', 'tokens', 'cost', 'duration_ms', 'timestamp']:
+                value = run_data[key]
+                if isinstance(value, str) and '\n' in value:
+                    f.write(f"{key}: |\n")
+                    for line in value.split('\n'):
+                        f.write(f"  {line}\n")
+                elif isinstance(value, str):
+                    # Quote strings that might have special chars
+                    f.write(f"{key}: {json.dumps(value)}\n")
+                else:
+                    f.write(f"{key}: {value}\n")
+            # Write messages as literal block
+            f.write("messages: |\n")
+            for line in messages_formatted.split('\n'):
+                f.write(f"  {line}\n")
+
+    def _write_eval(self):
+        """Write eval data to YAML file."""
+        if not self.eval_file or not self.eval_data:
+            return
+
+        # Build ordered output
         ordered = {
-            'name': self.
-            '
-            '
-            '
-            '
-            '
-            'turns': self.session_data['turns'],
-            # Detail section (scroll down)
-            'system_prompt': self.session_data.get('system_prompt', ''),
-            'messages': self.session_data['messages']
+            'name': self.eval_data['name'],
+            'created': self.eval_data['created'],
+            'updated': self.eval_data.get('updated', ''),
+            'runs': self.eval_data['runs'],
+            'model': self.eval_data['model'],
+            'turns': self.eval_data['turns']
         }
-
+
+        with open(self.eval_file, 'w', encoding='utf-8') as f:
             yaml.dump(ordered, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
 
-    def
-        """
+    def get_eval_path(self) -> Optional[str]:
+        """Get the path to the current eval file.
+
+        Returns:
+            Path string like '.co/evals/what_is_25_x_4.yaml' or None
+        """
+        if self.eval_file:
+            return str(self.eval_file)
+        return None
+
+    def load_messages(self, run: Optional[int] = None) -> list:
+        """Load messages from run YAML file.
+
+        Args:
+            run: Run number to load (default: current run)
 
         Returns:
-
+            List of message dicts
         """
-        if not self.
+        if not self.eval_dir:
             return []
-        with open(self.session_file, 'r') as f:
-            data = yaml.safe_load(f) or {}
 
-
+        run_num = run or self.current_run
+        run_file = self.eval_dir / f"run_{run_num}.yaml"
+
+        if not run_file.exists():
+            # Try legacy JSONL format
+            jsonl_file = self.eval_dir / f"run_{run_num}.jsonl"
+            if jsonl_file.exists():
+                messages = []
+                with open(jsonl_file, 'r', encoding='utf-8') as f:
+                    for line in f:
+                        if line.strip():
+                            messages.append(json.loads(line))
+                return messages
+            return []
 
-        messages.extend(turn_messages[turn_num])
+        with open(run_file, 'r', encoding='utf-8') as f:
+            data = yaml.safe_load(f)
 
-
+        messages_str = data.get('messages', '[]')
+        return json.loads(messages_str)
 
     def load_session(self) -> dict:
-        """Load
-        if not self.
-            return {'
-        with open(self.
-            return yaml.safe_load(f) or {'
+        """Load eval data from file."""
+        if not self.eval_file or not self.eval_file.exists():
+            return {'turns': [], 'runs': 0}
+        with open(self.eval_file, 'r') as f:
+            return yaml.safe_load(f) or {'turns': [], 'runs': 0}
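Putting the writer and the reader together, the on-disk layout these methods produce looks roughly like this (all values illustrative):

    .co/evals/what_is_25_x_4.yaml    # summary: name, created, updated, runs, model, turns
    .co/evals/what_is_25_x_4/
        run_1.yaml                   # agent, system_prompt, model, cwd, tokens, cost,
        run_2.yaml                   # duration_ms, timestamp, messages (JSON in a YAML literal block)

And a sketch of reading a transcript back, e.g. to replay or inspect a run:

    logger = Logger("my-agent", quiet=True)
    logger.start_session()
    # ... at least one log_turn() later ...
    current = logger.load_messages()      # messages from the current run
    first = logger.load_messages(run=1)   # a specific run (falls back to legacy .jsonl)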