connectonion 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,9 +2,9 @@
2
2
  Purpose: Orchestrate AI agent execution with LLM calls, tool execution, and automatic logging
3
3
  LLM-Note:
4
4
  Dependencies: imports from [llm.py, tool_factory.py, prompts.py, decorators.py, logger.py, tool_executor.py, tool_registry.py] | imported by [__init__.py, debug_agent/__init__.py] | tested by [tests/test_agent.py, tests/test_agent_prompts.py, tests/test_agent_workflows.py]
5
- Data flow: receives user prompt: str from Agent.input() → creates/extends current_session with messages → calls llm.complete() with tool schemas → receives LLMResponse with tool_calls → executes tools via tool_executor.execute_and_record_tools() → appends tool results to messages → repeats loop until no tool_calls or max_iterations → logger logs to .co/logs/{name}.log and .co/sessions/{name}_{timestamp}.yaml → returns final response: str
6
- State/Effects: modifies self.current_session['messages', 'trace', 'turn', 'iteration'] | writes to .co/logs/{name}.log and .co/sessions/ via logger.py
7
- Integration: exposes Agent(name, tools, system_prompt, model, log, quiet), .input(prompt), .execute_tool(name, args), .add_tool(func), .remove_tool(name), .list_tools(), .reset_conversation() | tools stored in ToolRegistry with attribute access (agent.tools.tool_name) and instance storage (agent.tools.gmail) | tool execution delegates to tool_executor module | log defaults to .co/logs/ (None), can be True (current dir), False (disabled), or custom path | quiet=True suppresses console but keeps session logging | trust enforcement moved to host() for network access control
5
+ Data flow: receives user prompt: str from Agent.input() → creates/extends current_session with messages → calls llm.complete() with tool schemas → receives LLMResponse with tool_calls → executes tools via tool_executor.execute_and_record_tools() → appends tool results to messages → repeats loop until no tool_calls or max_iterations → logger logs to .co/logs/{name}.log and .co/evals/{name}.yaml → returns final response: str
6
+ State/Effects: modifies self.current_session['messages', 'trace', 'turn', 'iteration'] | writes to .co/logs/{name}.log and .co/evals/ via logger.py
7
+ Integration: exposes Agent(name, tools, system_prompt, model, log, quiet), .input(prompt), .execute_tool(name, args), .add_tool(func), .remove_tool(name), .list_tools(), .reset_conversation() | tools stored in ToolRegistry with attribute access (agent.tools.tool_name) and instance storage (agent.tools.gmail) | tool execution delegates to tool_executor module | log defaults to .co/logs/ (None), can be True (current dir), False (disabled), or custom path | quiet=True suppresses console but keeps eval logging | trust enforcement moved to host() for network access control
8
8
  Performance: max_iterations=10 default (configurable per-input) | session state persists across turns for multi-turn conversations | ToolRegistry provides O(1) tool lookup via .get() or attribute access
9
9
  Errors: LLM errors bubble up | tool execution errors captured in trace and returned to LLM for retry
10
10
  """
@@ -51,11 +51,14 @@ class Agent:
51
51
  # Current session context (runtime only)
52
52
  self.current_session = None
53
53
 
54
+ # Connection to client (None locally, injected by host() for WebSocket)
55
+ self.connection = None
56
+
54
57
  # Token usage tracking
55
58
  self.total_cost: float = 0.0 # Cumulative cost in USD
56
59
  self.last_usage: Optional[TokenUsage] = None # From most recent LLM call
57
60
 
58
- # Initialize logger (unified: terminal + file + YAML sessions)
61
+ # Initialize logger (unified: terminal + file + YAML evals)
59
62
  # Environment variable override (highest priority)
60
63
  effective_log = log
61
64
  if os.getenv('CONNECTONION_LOG'):
@@ -250,16 +253,16 @@ class Agent:
250
253
 
251
254
  self.current_session['result'] = result
252
255
 
253
- # Print completion summary
254
- if self.logger.console:
255
- session_path = f".co/sessions/{self.name}.yaml" if self.logger.enable_sessions else None
256
- self.logger.console.print_completion(duration, self.current_session, session_path)
257
-
258
256
  self._invoke_events('on_complete')
259
257
 
260
- # Log turn to YAML session (after on_complete so handlers can modify state)
258
+ # Log turn to YAML eval (after on_complete so handlers can modify state)
261
259
  self.logger.log_turn(prompt, result, duration * 1000, self.current_session, self.llm.model)
262
260
 
261
+ # Print completion summary (after log_turn so we have the eval path)
262
+ if self.logger.console:
263
+ eval_path = self.logger.get_eval_path()
264
+ self.logger.console.print_completion(duration, self.current_session, eval_path)
265
+
263
266
  return result
264
267
 
265
268
  def reset_conversation(self):
connectonion/core/llm.py CHANGED
@@ -734,28 +734,18 @@ class OpenOnionLLM(LLM):
734
734
  )
735
735
 
736
736
  def structured_complete(self, messages: List[Dict], output_schema: Type[BaseModel], **kwargs) -> BaseModel:
737
- """Get structured Pydantic output using OpenAI-compatible API."""
738
- response = self.client.responses.parse(
737
+ """Get structured Pydantic output using OpenAI-compatible chat completions API.
738
+
739
+ Uses beta.chat.completions.parse() which routes through /v1/chat/completions,
740
+ allowing proper provider routing for Gemini, OpenAI, and other models.
741
+ """
742
+ completion = self.client.beta.chat.completions.parse(
739
743
  model=self.model,
740
- input=messages,
741
- text_format=output_schema,
744
+ messages=messages,
745
+ response_format=output_schema,
742
746
  **kwargs
743
747
  )
744
-
745
- # Handle edge cases
746
- if response.status == "incomplete":
747
- if response.incomplete_details.reason == "max_output_tokens":
748
- raise ValueError("Response incomplete: maximum output tokens reached")
749
- elif response.incomplete_details.reason == "content_filter":
750
- raise ValueError("Response incomplete: content filtered")
751
-
752
- # Check for refusal
753
- if response.output and len(response.output) > 0:
754
- first_content = response.output[0].content[0] if response.output[0].content else None
755
- if first_content and hasattr(first_content, 'type') and first_content.type == "refusal":
756
- raise ValueError(f"Model refused to respond: {first_content.refusal}")
757
-
758
- return response.output_parsed
748
+ return completion.choices[0].message.parsed
759
749
 
760
750
 
761
751
  def create_llm(model: str, api_key: Optional[str] = None, **kwargs) -> LLM:
connectonion/logger.py CHANGED
@@ -1,52 +1,74 @@
1
1
  """
2
- Purpose: Unified logging interface for agents - terminal output + plain text + YAML sessions
2
+ Purpose: Unified logging interface for agents - terminal output + plain text + YAML evals
3
3
  LLM-Note:
4
- Dependencies: imports from [datetime, pathlib, typing, yaml, console.py] | imported by [agent.py, tool_executor.py] | tested by [tests/unit/test_logger.py]
5
- Data flow: receives from Agent/tool_executor → delegates to Console for terminal/file → writes YAML sessions to .co/sessions/
6
- State/Effects: writes to .co/sessions/{agent_name}.yaml (one file per agent, appends turns) | delegates file logging to Console | session data persisted after each turn
4
+ Dependencies: imports from [datetime, pathlib, typing, json, re, yaml, os, sys, console.py] | imported by [agent.py, tool_executor.py] | tested by [tests/unit/test_logger.py]
5
+ Data flow: receives from Agent/tool_executor → delegates to Console for terminal/file → writes YAML evals to .co/evals/
6
+ State/Effects: writes to .co/evals/{input_slug}.yaml (one file per unique first input) | run data stored in .co/evals/{input_slug}/run_{n}.yaml | eval data persisted after each turn
7
7
  Integration: exposes Logger(agent_name, quiet, log), .print(), .log_tool_call(name, args), .log_tool_result(result, timing), .log_llm_response(), .start_session(), .log_turn()
8
- Session format: metadata at top turns summary (with tools_called as function-call style) system_prompt + messages at end (see docs/session-yaml-format.md)
9
- Performance: YAML written after each turn (incremental) | loads existing session file on start | Console delegation is direct passthrough
8
+ Eval format: eval.yaml (metadata + turns) | run_N.yaml (agent, system_prompt, model, cwd, tokens, cost, duration_ms, timestamp, messages as multi-line JSON)
9
+ Performance: YAML written after each turn (incremental) | Console delegation is direct passthrough
10
10
  Errors: let I/O errors bubble up (no try-except)
11
11
  """
12
12
 
13
+ import json
14
+ import re
13
15
  from datetime import datetime
14
16
  from pathlib import Path
15
- from typing import Optional, Union, Dict, Any
17
+ from typing import Optional, Union, Dict, Any, List
18
+
16
19
  import yaml
17
20
 
18
21
  from .console import Console
19
22
 
20
23
 
24
+ def _slugify(text: str, max_length: int = 50) -> str:
25
+ """Convert text to URL-friendly slug for filenames.
26
+
27
+ Args:
28
+ text: Input text to slugify
29
+ max_length: Maximum length of slug
30
+
31
+ Returns:
32
+ Lowercase slug with words separated by underscores
33
+ """
34
+ # Lowercase and replace spaces/special chars with underscores
35
+ slug = re.sub(r'[^a-zA-Z0-9]+', '_', text.lower())
36
+ # Remove leading/trailing underscores
37
+ slug = slug.strip('_')
38
+ # Truncate to max length at word boundary
39
+ if len(slug) > max_length:
40
+ slug = slug[:max_length].rsplit('_', 1)[0]
41
+ return slug or 'default'
42
+
43
+
21
44
  class Logger:
22
- """Unified logging: terminal output + plain text + YAML sessions.
45
+ """Unified logging: terminal output + plain text + YAML evals.
23
46
 
24
- Facade pattern: wraps Console for terminal/file logging, adds YAML sessions.
47
+ Facade pattern: wraps Console for terminal/file logging, adds YAML evals.
25
48
 
26
- Session files use one file per agent (.co/sessions/{agent_name}.yaml) to
27
- reduce file clutter. New turns are appended to the same file.
49
+ Eval files are named from the first input (slugified). Same input sequence
50
+ = same file with multiple runs. Each run stored as YAML with messages as JSON.
51
+ Log = Eval (same format; add an 'expected' field for tests).
28
52
 
29
53
  Args:
30
- agent_name: Name of the agent (used in filenames)
54
+ agent_name: Name of the agent (used in log filenames)
31
55
  quiet: Suppress console output (default False)
32
56
  log: Enable file logging (default True, or path string for custom location)
33
57
 
34
58
  Files created:
35
59
  - .co/logs/{agent_name}.log: Plain text log with session markers
36
- - .co/sessions/{agent_name}.yaml: Structured YAML with all turns
60
+ - .co/evals/{input_slug}.yaml: Structured YAML with turns and history
61
+ - .co/evals/{input_slug}/run_{n}.yaml: Run metadata + messages as multi-line JSON
37
62
 
38
63
  Examples:
39
64
  # Development (default) - see output + save everything
40
65
  logger = Logger("my-agent")
41
66
 
42
- # Eval mode - quiet but record sessions
67
+ # Eval mode - quiet but record evals
43
68
  logger = Logger("my-agent", quiet=True)
44
69
 
45
70
  # Benchmark - completely off
46
71
  logger = Logger("my-agent", log=False)
47
-
48
- # Custom log path
49
- logger = Logger("my-agent", log="custom/path.log")
50
72
  """
51
73
 
52
74
  def __init__(
@@ -59,7 +81,7 @@ class Logger:
59
81
 
60
82
  # Determine what to enable
61
83
  self.enable_console = not quiet
62
- self.enable_sessions = True # Sessions on unless log=False
84
+ self.enable_sessions = True # Evals on unless log=False
63
85
  self.enable_file = True
64
86
  self.log_file_path = Path(f".co/logs/{agent_name}.log")
65
87
 
@@ -73,7 +95,7 @@ class Logger:
73
95
  self.log_file_path = Path(log)
74
96
  # else: log=True or log=None → defaults
75
97
 
76
- # If quiet=True, also disable file (only keep sessions)
98
+ # If quiet=True, also disable file (only keep evals)
77
99
  if quiet:
78
100
  self.enable_file = False
79
101
 
@@ -83,9 +105,12 @@ class Logger:
83
105
  file_path = self.log_file_path if self.enable_file else None
84
106
  self.console = Console(log_file=file_path)
85
107
 
86
- # Session state (YAML)
87
- self.session_file: Optional[Path] = None
88
- self.session_data: Optional[Dict[str, Any]] = None
108
+ # Eval state
109
+ self.eval_file: Optional[Path] = None
110
+ self.eval_dir: Optional[Path] = None
111
+ self.eval_data: Optional[Dict[str, Any]] = None
112
+ self.current_run: int = 0
113
+ self._first_input: Optional[str] = None # Track first input for file naming
89
114
 
90
115
  # Delegate to Console
91
116
  def print(self, message: str, style: str = None):
@@ -129,68 +154,76 @@ class Logger:
129
154
  parts.append(f"{k}={v_str}")
130
155
  return f"{tool_name}({', '.join(parts)})"
131
156
 
132
- # Session logging (YAML)
157
+ # Eval logging (eval YAML + per-run YAML) - Log = Eval, same format
133
158
  def start_session(self, system_prompt: str = "", session_id: Optional[str] = None):
134
- """Initialize session YAML file.
159
+ """Initialize eval session state.
135
160
 
136
- Uses one file per session_id (for HTTP API) or per agent (for interactive).
137
- Loads existing session data if file exists, appends new turns.
161
+ Note: The actual file is created lazily in log_turn() when we have
162
+ the first input to generate the filename from.
163
+ System prompt is stored in the per-run YAML (run_N.yaml), not in the eval YAML.
138
164
 
139
165
  Args:
140
- system_prompt: The system prompt for this session
141
- session_id: Optional session identifier. If provided, logs to
142
- .co/sessions/{session_id}.yaml for thread-safe HTTP API.
143
- If None, uses agent name for interactive mode.
166
+ system_prompt: Unused (kept for backward compatibility)
167
+ session_id: Unused (kept for backward compatibility)
144
168
  """
145
169
  if not self.enable_sessions:
146
170
  return
147
171
 
148
- sessions_dir = Path(".co/sessions")
149
- sessions_dir.mkdir(parents=True, exist_ok=True)
150
-
151
- # Use session_id if provided (HTTP API), otherwise use agent_name (interactive)
152
- filename = session_id if session_id else self.agent_name
153
- # Sanitize: keep only safe characters (alphanumeric, dash, underscore)
154
- import re
155
- filename = re.sub(r'[^a-zA-Z0-9_-]', '_', filename)[:255] or 'default'
156
- self.session_file = sessions_dir / f"{filename}.yaml"
157
-
158
- # Load existing session or create new
159
- if self.session_file.exists():
160
- with open(self.session_file, 'r') as f:
161
- self.session_data = yaml.safe_load(f) or {}
162
- # Ensure ALL required fields exist (handles empty/corrupted files)
163
- if 'name' not in self.session_data:
164
- self.session_data['name'] = self.agent_name
165
- if 'session_id' not in self.session_data and session_id:
166
- self.session_data['session_id'] = session_id
167
- if 'created' not in self.session_data:
168
- self.session_data['created'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
169
- if 'total_cost' not in self.session_data:
170
- self.session_data['total_cost'] = 0.0
171
- if 'total_tokens' not in self.session_data:
172
- self.session_data['total_tokens'] = 0
173
- if 'turns' not in self.session_data:
174
- self.session_data['turns'] = []
175
- if 'messages' not in self.session_data:
176
- self.session_data['messages'] = {}
177
- # Update system_prompt if provided
178
- if system_prompt:
179
- self.session_data['system_prompt'] = system_prompt
172
+ self._first_input = None
173
+ self.eval_file = None
174
+ self.eval_dir = None
175
+ self.eval_data = None
176
+ self.current_run = 0
177
+
178
+ def _init_eval_file(self, first_input: str):
179
+ """Initialize or load eval file based on first input.
180
+
181
+ Args:
182
+ first_input: The first user input (used to name the file)
183
+ """
184
+ evals_dir = Path(".co/evals")
185
+ evals_dir.mkdir(parents=True, exist_ok=True)
186
+
187
+ # Generate filename from first input
188
+ slug = _slugify(first_input)
189
+ self.eval_file = evals_dir / f"{slug}.yaml"
190
+ self.eval_dir = evals_dir / slug
191
+ self._first_input = first_input
192
+
193
+ # Load existing or create new
194
+ if self.eval_file.exists():
195
+ with open(self.eval_file, 'r') as f:
196
+ self.eval_data = yaml.safe_load(f) or {}
197
+
198
+ # Check if this is the same conversation (same first input)
199
+ existing_turns = self.eval_data.get('turns', [])
200
+ if existing_turns and existing_turns[0].get('input') == first_input:
201
+ # Same conversation - new run
202
+ self.current_run = self.eval_data.get('runs', 0) + 1
203
+ self.eval_data['runs'] = self.current_run
204
+ else:
205
+ # Different first input but same slug (collision) - treat as new
206
+ self.current_run = 1
207
+ self.eval_data = self._create_new_eval_data(first_input)
180
208
  else:
181
- self.session_data = {
182
- "name": self.agent_name,
183
- "session_id": session_id,
184
- "created": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
185
- "total_cost": 0.0,
186
- "total_tokens": 0,
187
- "system_prompt": system_prompt,
188
- "turns": [],
189
- "messages": {} # Dict keyed by turn number
190
- }
209
+ self.current_run = 1
210
+ self.eval_data = self._create_new_eval_data(first_input)
211
+
212
+ # Create messages directory
213
+ self.eval_dir.mkdir(parents=True, exist_ok=True)
214
+
215
+ def _create_new_eval_data(self, first_input: str) -> Dict[str, Any]:
216
+ """Create new eval data structure."""
217
+ return {
218
+ "name": _slugify(first_input),
219
+ "created": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
220
+ "runs": 1,
221
+ "model": "",
222
+ "turns": []
223
+ }
191
224
 
192
225
  def log_turn(self, user_input: str, result: str, duration_ms: float, session: dict, model: str):
193
- """Log turn summary + messages to YAML file.
226
+ """Log turn to the eval YAML file and write messages to the run YAML file.
194
227
 
195
228
  Args:
196
229
  user_input: The user's input prompt
@@ -199,9 +232,13 @@ class Logger:
199
232
  session: Agent's current_session dict (contains messages, trace)
200
233
  model: Model name string
201
234
  """
202
- if not self.enable_sessions or not self.session_data:
235
+ if not self.enable_sessions:
203
236
  return
204
237
 
238
+ # Initialize file on first turn (lazy initialization)
239
+ if self.eval_data is None:
240
+ self._init_eval_file(user_input)
241
+
205
242
  # Aggregate from trace
206
243
  trace = session.get('trace', [])
207
244
  llm_calls = [t for t in trace if t.get('type') == 'llm_call']
@@ -216,85 +253,218 @@ class Logger:
216
253
  for t in llm_calls if t.get('usage')
217
254
  )
218
255
 
219
- turn_data = {
220
- 'input': user_input,
221
- 'expected': session.get('expected', ''),
222
- 'model': model,
223
- 'duration_ms': int(duration_ms),
224
- 'tokens': total_tokens,
225
- 'cost': round(total_cost, 4),
256
+ # Build metadata as compact JSON string
257
+ meta = json.dumps({
258
+ "tokens": total_tokens,
259
+ "cost": round(total_cost, 4),
260
+ "duration_ms": int(duration_ms),
261
+ "ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
262
+ })
263
+
264
+ # Build turn data for this run
265
+ run_data = {
266
+ 'run': self.current_run,
267
+ 'output': result,
226
268
  'tools_called': [self._format_tool_call(t) for t in tool_calls],
227
- 'result': result,
228
- 'evaluation': session.get('evaluation', '')
269
+ 'expected': session.get('expected', ''),
270
+ 'evaluation': session.get('evaluation', ''),
271
+ 'meta': meta
229
272
  }
230
273
 
231
- # Update session aggregates
232
- self.session_data['updated'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
233
- self.session_data['total_cost'] = round(
234
- self.session_data.get('total_cost', 0) + turn_data['cost'], 4
235
- )
236
- self.session_data['total_tokens'] = (
237
- self.session_data.get('total_tokens', 0) + turn_data['tokens']
274
+ # Find or create turn entry
275
+ turn_index = session.get('turn', 1) - 1 # 0-indexed
276
+ turns = self.eval_data['turns']
277
+
278
+ if turn_index < len(turns):
279
+ # Existing turn - add to history
280
+ existing_turn = turns[turn_index]
281
+ if existing_turn.get('input') == user_input:
282
+ # Same input - this is a new run
283
+ history = existing_turn.get('history', [])
284
+ # Move current run to history (metadata only)
285
+ if existing_turn.get('run'):
286
+ history.insert(0, {
287
+ 'run': existing_turn.get('run', self.current_run - 1),
288
+ 'status': existing_turn.get('evaluation', ''),
289
+ 'meta': existing_turn.get('meta', '')
290
+ })
291
+ # Update with new run data
292
+ existing_turn.update({
293
+ 'run': run_data['run'],
294
+ 'output': run_data['output'],
295
+ 'tools_called': run_data['tools_called'],
296
+ 'expected': run_data['expected'],
297
+ 'evaluation': run_data['evaluation'],
298
+ 'meta': run_data['meta'],
299
+ 'history': history
300
+ })
301
+ else:
302
+ # Different input at same position - shouldn't happen normally
303
+ turns.append({
304
+ 'input': user_input,
305
+ **run_data,
306
+ 'history': []
307
+ })
308
+ else:
309
+ # New turn
310
+ turns.append({
311
+ 'input': user_input,
312
+ **run_data,
313
+ 'history': []
314
+ })
315
+
316
+ # Update metadata
317
+ self.eval_data['updated'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
318
+ self.eval_data['model'] = model
319
+
320
+ # Write run YAML with messages
321
+ self._write_run_yaml(
322
+ messages=session.get('messages', []),
323
+ model=model,
324
+ tokens=total_tokens,
325
+ cost=total_cost,
326
+ duration_ms=duration_ms
238
327
  )
239
328
 
240
- # Add turn number and timestamp
241
- turn_num = len(self.session_data['turns']) + 1
242
- turn_data['turn'] = turn_num
243
- turn_data['timestamp'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
244
- self.session_data['turns'].append(turn_data)
329
+ # Write YAML
330
+ self._write_eval()
245
331
 
246
- # Extract this turn's messages (everything after what we've already saved)
247
- all_messages = session.get('messages', [])
248
- saved_count = sum(len(msgs) for msgs in self.session_data['messages'].values())
249
- turn_messages = all_messages[saved_count + 1:] # +1 to skip system message
250
- self.session_data['messages'][turn_num] = turn_messages
332
+ def _write_run_yaml(self, messages: List[Dict], model: str, tokens: int, cost: float, duration_ms: float):
333
+ """Write run metadata and messages to YAML file.
251
334
 
252
- # Write YAML
253
- self._write_session()
335
+ Args:
336
+ messages: List of message dicts
337
+ model: Model name
338
+ tokens: Total tokens used
339
+ cost: Total cost
340
+ duration_ms: Duration in milliseconds
341
+ """
342
+ if not self.eval_dir:
343
+ return
344
+
345
+ import os
346
+ import sys
347
+
348
+ # Extract system prompt from messages
349
+ system_prompt = ""
350
+ for msg in messages:
351
+ if msg.get('role') == 'system':
352
+ system_prompt = msg.get('content', '')
353
+ break
354
+
355
+ # Get agent file path (the script being run)
356
+ agent_file = sys.argv[0] if sys.argv else ""
357
+ # Make it relative to cwd if possible
358
+ cwd = os.getcwd()
359
+ if agent_file and os.path.isabs(agent_file):
360
+ try:
361
+ agent_file = os.path.relpath(agent_file, cwd)
362
+ except ValueError:
363
+ pass # Keep absolute if on different drive (Windows)
364
+
365
+ # Format messages as pretty JSON (one message per line)
366
+ messages_json_lines = []
367
+ for msg in messages:
368
+ messages_json_lines.append(" " + json.dumps(msg, ensure_ascii=False))
369
+ messages_formatted = "[\n" + ",\n".join(messages_json_lines) + "\n]"
370
+
371
+ # Build run data
372
+ run_data = {
373
+ 'agent': agent_file,
374
+ 'system_prompt': system_prompt,
375
+ 'model': model,
376
+ 'cwd': cwd,
377
+ 'tokens': tokens,
378
+ 'cost': round(cost, 4),
379
+ 'duration_ms': int(duration_ms),
380
+ 'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
381
+ 'messages': messages_formatted
382
+ }
254
383
 
255
- def _write_session(self):
256
- """Write session data with turns summary first, detail at end."""
257
- # Build ordered dict: compact metadata → turns → detail (system_prompt + messages)
384
+ # Write YAML with messages as literal block
385
+ run_file = self.eval_dir / f"run_{self.current_run}.yaml"
386
+ with open(run_file, 'w', encoding='utf-8') as f:
387
+ # Write metadata fields normally
388
+ for key in ['agent', 'system_prompt', 'model', 'cwd', 'tokens', 'cost', 'duration_ms', 'timestamp']:
389
+ value = run_data[key]
390
+ if isinstance(value, str) and '\n' in value:
391
+ f.write(f"{key}: |\n")
392
+ for line in value.split('\n'):
393
+ f.write(f" {line}\n")
394
+ elif isinstance(value, str):
395
+ # Quote strings that might have special chars
396
+ f.write(f"{key}: {json.dumps(value)}\n")
397
+ else:
398
+ f.write(f"{key}: {value}\n")
399
+ # Write messages as literal block
400
+ f.write("messages: |\n")
401
+ for line in messages_formatted.split('\n'):
402
+ f.write(f" {line}\n")
403
+
404
+ def _write_eval(self):
405
+ """Write eval data to YAML file."""
406
+ if not self.eval_file or not self.eval_data:
407
+ return
408
+
409
+ # Build ordered output
258
410
  ordered = {
259
- 'name': self.session_data['name'],
260
- 'session_id': self.session_data.get('session_id'),
261
- 'created': self.session_data['created'],
262
- 'updated': self.session_data.get('updated', ''),
263
- 'total_cost': self.session_data.get('total_cost', 0),
264
- 'total_tokens': self.session_data.get('total_tokens', 0),
265
- 'turns': self.session_data['turns'],
266
- # Detail section (scroll down)
267
- 'system_prompt': self.session_data.get('system_prompt', ''),
268
- 'messages': self.session_data['messages']
411
+ 'name': self.eval_data['name'],
412
+ 'created': self.eval_data['created'],
413
+ 'updated': self.eval_data.get('updated', ''),
414
+ 'runs': self.eval_data['runs'],
415
+ 'model': self.eval_data['model'],
416
+ 'turns': self.eval_data['turns']
269
417
  }
270
- with open(self.session_file, 'w') as f:
418
+
419
+ with open(self.eval_file, 'w', encoding='utf-8') as f:
271
420
  yaml.dump(ordered, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
272
421
 
273
- def load_messages(self) -> list:
274
- """Load and reconstruct full message list from session file.
422
+ def get_eval_path(self) -> Optional[str]:
423
+ """Get the path to the current eval file.
424
+
425
+ Returns:
426
+ Path string like '.co/evals/what_is_25_x_4.yaml' or None
427
+ """
428
+ if self.eval_file:
429
+ return str(self.eval_file)
430
+ return None
431
+
432
+ def load_messages(self, run: Optional[int] = None) -> list:
433
+ """Load messages from run YAML file.
434
+
435
+ Args:
436
+ run: Run number to load (default: current run)
275
437
 
276
438
  Returns:
277
- Full message list: [system_message] + all turn messages in order
439
+ List of message dicts
278
440
  """
279
- if not self.session_file or not self.session_file.exists():
441
+ if not self.eval_dir:
280
442
  return []
281
- with open(self.session_file, 'r') as f:
282
- data = yaml.safe_load(f) or {}
283
443
 
284
- # Reconstruct: system prompt + all turn messages in order
285
- messages = []
286
- if data.get('system_prompt'):
287
- messages.append({"role": "system", "content": data['system_prompt']})
444
+ run_num = run or self.current_run
445
+ run_file = self.eval_dir / f"run_{run_num}.yaml"
446
+
447
+ if not run_file.exists():
448
+ # Try legacy JSONL format
449
+ jsonl_file = self.eval_dir / f"run_{run_num}.jsonl"
450
+ if jsonl_file.exists():
451
+ messages = []
452
+ with open(jsonl_file, 'r', encoding='utf-8') as f:
453
+ for line in f:
454
+ if line.strip():
455
+ messages.append(json.loads(line))
456
+ return messages
457
+ return []
288
458
 
289
- turn_messages = data.get('messages', {})
290
- for turn_num in sorted(turn_messages.keys()):
291
- messages.extend(turn_messages[turn_num])
459
+ with open(run_file, 'r', encoding='utf-8') as f:
460
+ data = yaml.safe_load(f)
292
461
 
293
- return messages
462
+ messages_str = data.get('messages', '[]')
463
+ return json.loads(messages_str)
294
464
 
295
465
  def load_session(self) -> dict:
296
- """Load session data from file."""
297
- if not self.session_file or not self.session_file.exists():
298
- return {'system_prompt': '', 'turns': [], 'messages': {}}
299
- with open(self.session_file, 'r') as f:
300
- return yaml.safe_load(f) or {'system_prompt': '', 'turns': [], 'messages': {}}
466
+ """Load eval data from file."""
467
+ if not self.eval_file or not self.eval_file.exists():
468
+ return {'turns': [], 'runs': 0}
469
+ with open(self.eval_file, 'r') as f:
470
+ return yaml.safe_load(f) or {'turns': [], 'runs': 0}