thoughtflow 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thoughtflow/__init__.py +97 -5
- thoughtflow/_util.py +752 -0
- thoughtflow/action.py +357 -0
- thoughtflow/agent.py +66 -0
- thoughtflow/eval/__init__.py +34 -0
- thoughtflow/eval/harness.py +200 -0
- thoughtflow/eval/replay.py +137 -0
- thoughtflow/llm.py +250 -0
- thoughtflow/memory/__init__.py +32 -0
- thoughtflow/memory/base.py +1658 -0
- thoughtflow/message.py +140 -0
- thoughtflow/py.typed +2 -0
- thoughtflow/thought.py +1102 -0
- thoughtflow/thoughtflow6.py +4180 -0
- thoughtflow/tools/__init__.py +27 -0
- thoughtflow/tools/base.py +145 -0
- thoughtflow/tools/registry.py +122 -0
- thoughtflow/trace/__init__.py +34 -0
- thoughtflow/trace/events.py +183 -0
- thoughtflow/trace/schema.py +111 -0
- thoughtflow/trace/session.py +141 -0
- thoughtflow-0.0.3.dist-info/METADATA +215 -0
- thoughtflow-0.0.3.dist-info/RECORD +25 -0
- {thoughtflow-0.0.1.dist-info → thoughtflow-0.0.3.dist-info}/WHEEL +1 -2
- {thoughtflow-0.0.1.dist-info → thoughtflow-0.0.3.dist-info/licenses}/LICENSE +1 -1
- thoughtflow/jtools1.py +0 -25
- thoughtflow/jtools2.py +0 -27
- thoughtflow-0.0.1.dist-info/METADATA +0 -17
- thoughtflow-0.0.1.dist-info/RECORD +0 -8
- thoughtflow-0.0.1.dist-info/top_level.txt +0 -1
thoughtflow/action.py
ADDED
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ACTION class for ThoughtFlow.
|
|
3
|
+
|
|
4
|
+
The ACTION class encapsulates an external or internal operation that can be invoked
|
|
5
|
+
within a Thoughtflow agent workflow.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
|
|
12
|
+
from thoughtflow._util import event_stamp
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ACTION:
    """
    A single, named operation that can be invoked within a Thoughtflow agent workflow.

    An ACTION wraps a callable (tool call, API request, plain function) that is
    defined once and executed many times with different parameters. Each
    execution is timed, logged to the memory object, recorded in
    ``execution_history``, and its result (or error description) is stored in
    memory under ``result_key``.

    Attributes:
        name (str): Identifier for this action, used for logging and storing results.
        id (str): Unique identifier for this action instance (event_stamp).
        fn (callable): The function to execute when this action is called.
        config (dict): Default keyword arguments passed to the function.
        result_key (str): Key where results are stored in memory (defaults to "{name}_result").
        description (str): Human-readable description of what this action does.
        last_result (Any): The most recent successful result from executing this action.
        last_error (Exception): The error from the most recent execution, or None if it succeeded.
        execution_count (int): Number of times this action has been executed.
        execution_history (list): One dict per execution with stamp, memory_id,
            duration_ms, success and error.

    Example Usage:
        # Define a web search action
        def search_web(memory, query, max_results=3):
            # Implementation of web search
            results = web_api.search(query, limit=max_results)
            return {"status": "success", "hits": results}

        search_action = ACTION(
            name="web_search",
            fn=search_web,
            config={"max_results": 5},
            description="Searches the web for information"
        )

        # Execute the action
        memory = MEMORY()
        memory = search_action(memory, query="thoughtflow framework")

        # Access results
        result = memory.get_var("web_search_result")

        # Check execution history
        print(search_action.execution_history[-1]['duration_ms'])  # Execution time
        print(search_action.execution_history[-1]['success'])  # True/False

    Design Principles:
        1. Explicit and inspectable operations with consistent logging
        2. Predictable result storage via memory.set_var
        3. Error handling that doesn't interrupt workflow execution
        4. Composability with other Thoughtflow components (MEMORY, THOUGHT)
        5. Serialization support for reproducibility
        6. Full execution history with timing for debugging and optimization
    """

    def __init__(self, name, fn, config=None, result_key=None, description=None):
        """
        Initialize an ACTION with a name, function, and optional configuration.

        Args:
            name (str): Identifier for this action, used for logging and result storage.
            fn (callable): The function to execute when this action is called.
            config (dict, optional): Default configuration parameters passed to the function.
            result_key (str, optional): Key where results are stored in memory (defaults to "{name}_result").
            description (str, optional): Human-readable description of what this action does.
        """
        self.name = name
        self.id = event_stamp()  # Unique identifier for this action instance
        self.fn = fn
        self.config = config or {}
        self.result_key = result_key or "{}_result".format(name)
        self.description = description or "Action: {}".format(name)
        self.last_result = None
        self.last_error = None
        self.execution_count = 0
        self.execution_history = []  # Full execution tracking with timing

    def __call__(self, memory, **kwargs):
        """
        Execute the action function with the memory object and any override parameters.

        Args:
            memory (MEMORY): The memory object to update with results. Its
                ``set_var`` and ``add_log`` methods are used only if present.
            **kwargs: Parameters that override the default config for this execution.

        Returns:
            MEMORY: The same memory object, with the result (or an error event
            dict on failure) stored via ``memory.set_var(result_key, ...)``.

        Note:
            The function receives (memory, **merged_kwargs) where merged_kwargs combines
            self.config with any call-specific kwargs.

            Exceptions during execution are logged rather than raised, allowing the
            workflow to continue and decide how to handle failures.
        """
        import time as time_module

        # perf_counter is monotonic: a wall-clock adjustment during the call
        # cannot produce a negative or wildly wrong duration.
        start_time = time_module.perf_counter()

        # Merge default config with call-specific kwargs (call-specific wins)
        merged_kwargs = {**self.config, **kwargs}
        self.execution_count += 1

        try:
            # Execute the function
            result = self.fn(memory, **merged_kwargs)
            self.last_result = result
            self.last_error = None

            # Calculate execution duration
            duration_ms = (time_module.perf_counter() - start_time) * 1000

            # Store result in memory using set_var
            if hasattr(memory, "set_var") and callable(getattr(memory, "set_var", None)):
                memory.set_var(self.result_key, result, desc="Result of action: {}".format(self.name))

            # Build execution event for logging (JSON format)
            execution_event = {
                'action_name': self.name,
                'action_id': self.id,
                'status': 'success',
                'duration_ms': round(duration_ms, 2),
                'result_key': self.result_key
            }

            # Log successful execution as a single message with a JSON payload
            if hasattr(memory, "add_log") and callable(getattr(memory, "add_log", None)):
                memory.add_log("Action execution complete: " + json.dumps(execution_event))

            # Track execution history
            self.execution_history.append({
                'stamp': event_stamp(),
                'memory_id': getattr(memory, 'id', None),
                'duration_ms': duration_ms,
                'success': True,
                'error': None
            })

        except Exception as e:
            # Deliberate best-effort boundary: record the failure and keep the
            # workflow running instead of propagating the exception.
            self.last_error = e

            # Calculate execution duration
            duration_ms = (time_module.perf_counter() - start_time) * 1000

            # Build error event for logging
            error_event = {
                'action_name': self.name,
                'action_id': self.id,
                'status': 'error',
                'error': str(e),
                'duration_ms': round(duration_ms, 2),
                'result_key': self.result_key
            }

            # Log failed execution as a single message with a JSON payload
            if hasattr(memory, "add_log") and callable(getattr(memory, "add_log", None)):
                memory.add_log("Action execution failed: " + json.dumps(error_event))

            # Store error info in memory so downstream steps can branch on it
            if hasattr(memory, "set_var") and callable(getattr(memory, "set_var", None)):
                memory.set_var(self.result_key, error_event, desc="Error in action: {}".format(self.name))

            # Track execution history
            self.execution_history.append({
                'stamp': event_stamp(),
                'memory_id': getattr(memory, 'id', None),
                'duration_ms': duration_ms,
                'success': False,
                'error': str(e)
            })

        return memory

    def _fn_name(self):
        """
        Return a printable name for ``self.fn``.

        Falls back gracefully for callables that have no ``__name__`` (such as
        ``functools.partial`` objects or callable instances) instead of raising
        AttributeError.

        Returns:
            str: ``fn.__name__`` when available, else the name of the callable
            wrapped by a partial, else the callable's type name.
        """
        direct_name = getattr(self.fn, "__name__", None)
        if direct_name is not None:
            return direct_name
        # functools.partial exposes the wrapped callable as ``.func``
        wrapped = getattr(self.fn, "func", None)
        return getattr(wrapped, "__name__", None) or type(self.fn).__name__

    def get_last_result(self):
        """
        Returns the most recent result from executing this action.

        Returns:
            Any: The last result or None if the action hasn't been executed.
        """
        return self.last_result

    def was_successful(self):
        """
        Returns True if the last execution was successful, False otherwise.

        Returns:
            bool: True if the action has run at least once and the last run
            completed without errors, False otherwise.
        """
        return self.last_error is None and self.execution_count > 0

    def reset_stats(self):
        """
        Resets execution statistics (count, last_result, last_error, execution_history).

        Returns:
            ACTION: Self for method chaining.
        """
        self.execution_count = 0
        self.last_result = None
        self.last_error = None
        self.execution_history = []
        return self

    def copy(self):
        """
        Return a copy of this ACTION with a new ID.

        The function reference is shared (same callable), but config is
        shallow-copied. Execution statistics are reset in the copy.

        Returns:
            ACTION: A new ACTION instance with copied attributes and new ID.
        """
        new_action = ACTION(
            name=self.name,
            fn=self.fn,  # Same function reference
            config=self.config.copy() if self.config else None,
            result_key=self.result_key,
            description=self.description
        )
        # New ID is already assigned in __init__, no need to set it
        return new_action

    def to_dict(self):
        """
        Returns a serializable dictionary representation of this action.

        Note: The function itself cannot be serialized, so it's represented by name.
        When deserializing, a function registry must be provided.

        The ``config`` dict and the ``execution_history`` entries are
        shallow-copied so that mutating the returned dictionary does not alter
        this action.

        Returns:
            dict: Serializable representation of this action.
        """
        return {
            "name": self.name,
            "id": self.id,
            "fn_name": self._fn_name(),
            "config": dict(self.config),
            "result_key": self.result_key,
            "description": self.description,
            "execution_count": self.execution_count,
            "execution_history": [dict(entry) for entry in self.execution_history]
        }

    @classmethod
    def from_dict(cls, data, fn_registry):
        """
        Reconstruct an ACTION from a dictionary representation.

        Args:
            data (dict): Dictionary representation of an ACTION.
            fn_registry (dict): Dictionary mapping function names to function objects.

        Returns:
            ACTION: Reconstructed ACTION object. Its config and history are
            copies, so later executions do not mutate ``data``.

        Raises:
            KeyError: If the function name is not found in the registry.
        """
        fn_name = data["fn_name"]
        if fn_name not in fn_registry:
            raise KeyError("Function '{}' not found in registry".format(fn_name))

        action = cls(
            name=data["name"],
            fn=fn_registry[fn_name],
            config=dict(data["config"] or {}),
            result_key=data["result_key"],
            description=data["description"]
        )
        # Restore ID if provided, otherwise keep the new one from __init__
        if data.get("id"):
            action.id = data["id"]
        action.execution_count = data.get("execution_count", 0)
        action.execution_history = [dict(entry) for entry in (data.get("execution_history") or [])]
        return action

    def __str__(self):
        """
        Returns a string representation of this action.

        Returns:
            str: String representation.
        """
        return "ACTION({}, desc='{}', executions={})".format(self.name, self.description, self.execution_count)

    def __repr__(self):
        """
        Returns a detailed string representation of this action.

        Returns:
            str: Detailed string representation.
        """
        return ("ACTION(name='{}', fn={}, "
                "config={}, result_key='{}', "
                "description='{}', execution_count={})".format(
                    self.name, self._fn_name(), self.config,
                    self.result_key, self.description, self.execution_count))
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
### ACTION CLASS TESTS
|
|
352
|
+
|
|
353
|
+
ActionClassTests = """
|
|
354
|
+
# --- ACTION Class Tests ---
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
"""
|
thoughtflow/agent.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DEPRECATED: Use THOUGHT class instead.
|
|
3
|
+
|
|
4
|
+
The Agent class has been replaced by the THOUGHT class which provides
|
|
5
|
+
a more powerful and flexible interface for LLM interactions.
|
|
6
|
+
|
|
7
|
+
Example migration:
|
|
8
|
+
# Old (deprecated):
|
|
9
|
+
agent = Agent(adapter)
|
|
10
|
+
response = agent.call(messages)
|
|
11
|
+
|
|
12
|
+
# New:
|
|
13
|
+
from thoughtflow import THOUGHT, MEMORY, LLM
|
|
14
|
+
|
|
15
|
+
llm = LLM("openai:gpt-4o", key="your-api-key")
|
|
16
|
+
thought = THOUGHT(name="my_thought", llm=llm, prompt="...")
|
|
17
|
+
memory = MEMORY()
|
|
18
|
+
memory = thought(memory)
|
|
19
|
+
result = memory.get_var("my_thought_result")
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import warnings
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class Agent:
    """
    DEPRECATED placeholder kept for backward-compatible imports; use THOUGHT instead.

    Instantiation always fails: the constructor emits a DeprecationWarning
    and then raises NotImplementedError.
    """

    def __init__(self, *args, **kwargs):
        """
        Warn about the deprecation, then refuse construction.

        Args:
            *args: Accepted so legacy call sites reach the error message; ignored.
            **kwargs: Accepted so legacy call sites reach the error message; ignored.

        Raises:
            NotImplementedError: Always.
        """
        deprecation_notice = (
            "Agent is deprecated. Use THOUGHT instead. "
            "See the migration guide in the module docstring."
        )
        # stacklevel=2 attributes the warning to the code that called Agent(...)
        warnings.warn(deprecation_notice, category=DeprecationWarning, stacklevel=2)

        failure_message = (
            "Agent is deprecated. Use THOUGHT instead. "
            "Example: thought = THOUGHT(name='my_thought', llm=llm, prompt='...')"
        )
        raise NotImplementedError(failure_message)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class TracedAgent:
    """
    DEPRECATED placeholder kept for backward-compatible imports; use THOUGHT instead.

    Instantiation always fails: the constructor emits a DeprecationWarning
    and then raises NotImplementedError. The messages point users at
    THOUGHT's execution history tracking as the replacement.
    """

    def __init__(self, *args, **kwargs):
        """
        Warn about the deprecation, then refuse construction.

        Args:
            *args: Accepted so legacy call sites reach the error message; ignored.
            **kwargs: Accepted so legacy call sites reach the error message; ignored.

        Raises:
            NotImplementedError: Always.
        """
        deprecation_notice = (
            "TracedAgent is deprecated. Use THOUGHT instead. "
            "THOUGHT provides built-in execution history tracking."
        )
        # stacklevel=2 attributes the warning to the code that called TracedAgent(...)
        warnings.warn(deprecation_notice, category=DeprecationWarning, stacklevel=2)

        failure_message = (
            "TracedAgent is deprecated. Use THOUGHT instead. "
            "THOUGHT provides built-in execution history tracking via execution_history."
        )
        raise NotImplementedError(failure_message)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Evaluation utilities for ThoughtFlow.
|
|
3
|
+
|
|
4
|
+
Deterministic evaluation is a first-class constraint in ThoughtFlow.
|
|
5
|
+
This module provides utilities for:
|
|
6
|
+
- Record/replay workflows
|
|
7
|
+
- Golden tests (expected response shape/constraints)
|
|
8
|
+
- Prompt/version pinning
|
|
9
|
+
- Stable metrics extraction from traces
|
|
10
|
+
|
|
11
|
+
Example:
|
|
12
|
+
>>> from thoughtflow.eval import Replay, Harness
|
|
13
|
+
>>>
|
|
14
|
+
>>> # Record a session
|
|
15
|
+
>>> session = agent.call(messages, record=True)
|
|
16
|
+
>>> session.save("golden.json")
|
|
17
|
+
>>>
|
|
18
|
+
>>> # Replay and compare
|
|
19
|
+
>>> replay = Replay.load("golden.json")
|
|
20
|
+
>>> results = replay.run(agent)
|
|
21
|
+
>>> assert results.matches_expected()
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
from thoughtflow.eval.replay import Replay
from thoughtflow.eval.harness import Harness, HarnessResults, TestCase, TestResult

# Public API of the eval package. HarnessResults is exported because it is the
# declared return type of Harness.run().
__all__ = [
    "Replay",
    "Harness",
    "HarnessResults",
    "TestCase",
    "TestResult",
]
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Test harness for ThoughtFlow evaluations.
|
|
3
|
+
|
|
4
|
+
Provides structured test cases and evaluation harnesses for
|
|
5
|
+
systematic agent testing.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from typing import TYPE_CHECKING, Any, Callable
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from thoughtflow.agent import Agent
|
|
15
|
+
from thoughtflow.message import MessageList
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class TestCase:
    """A single test case for agent evaluation.

    Attributes:
        name: Human-readable name for the test.
        messages: Input messages for the test.
        params: Optional call parameters.
        expected: Expected response (exact match or callable validator).
        tags: Tags for filtering/grouping tests.
        metadata: Additional test metadata.
    """

    # The class name starts with "Test", so pytest would try to collect it
    # (and warn about its __init__) wherever it is imported into a test module.
    # Unannotated, so the dataclass machinery does not treat it as a field.
    __test__ = False

    name: str
    messages: MessageList
    params: dict[str, Any] | None = None
    expected: str | Callable[[str], bool] | None = None
    tags: list[str] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    def validate(self, response: str) -> bool:
        """Validate a response against expectations.

        Args:
            response: The agent's response.

        Returns:
            True if valid, False otherwise. With no expectation set, every
            response is valid. A callable expectation's result is coerced to
            bool; any other expectation is compared for equality.
        """
        if self.expected is None:
            return True
        if callable(self.expected):
            # Validators may return any truthy/falsy object (e.g. re.Match or
            # None); normalise so the declared bool contract holds.
            return bool(self.expected(response))
        return response == self.expected
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class TestResult:
    """Result of running a test case.

    Attributes:
        test_case: The test case that was run.
        passed: Whether the test passed.
        response: The agent's response.
        error: Error message if the test failed.
        duration_ms: How long the test took.
        metadata: Additional result metadata.
    """

    # The class name starts with "Test", so pytest would try to collect it
    # (and warn about its __init__) wherever it is imported into a test module.
    # Unannotated, so the dataclass machinery does not treat it as a field.
    __test__ = False

    test_case: TestCase
    passed: bool
    response: str | None = None
    error: str | None = None
    duration_ms: int | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class Harness:
    """Container for an evaluation suite of test cases.

    The harness stores test cases in insertion order, can select them by
    tag, and declares a ``run`` entry point that is still a placeholder.

    Example:
        >>> harness = Harness()
        >>>
        >>> # Add test cases
        >>> harness.add(TestCase(
        ...     name="greeting",
        ...     messages=[{"role": "user", "content": "Hello!"}],
        ...     expected=lambda r: "hello" in r.lower()
        ... ))
        >>>
        >>> # Run all tests
        >>> results = harness.run(agent)
        >>>
        >>> # Check results
        >>> print(f"Passed: {results.passed_count}/{results.total_count}")
    """

    def __init__(self) -> None:
        """Create a harness with no registered test cases."""
        self.test_cases: list[TestCase] = []

    def add(self, test_case: TestCase) -> None:
        """Register one test case at the end of the suite.

        Args:
            test_case: The test case to add.
        """
        self.test_cases.append(test_case)

    def add_many(self, test_cases: list[TestCase]) -> None:
        """Register several test cases, preserving their order.

        Args:
            test_cases: List of test cases to add.
        """
        # In-place concatenation on a list is the same operation as extend().
        self.test_cases += test_cases

    def filter_by_tags(self, tags: list[str]) -> list[TestCase]:
        """Select the registered test cases carrying at least one given tag.

        Args:
            tags: Tags to filter by.

        Returns:
            Test cases matching any of the specified tags, in registration order.
        """
        selected = []
        for candidate in self.test_cases:
            for wanted in tags:
                if wanted in candidate.tags:
                    selected.append(candidate)
                    break  # one matching tag is enough
        return selected

    def run(
        self,
        agent: Agent,
        filter_tags: list[str] | None = None,
    ) -> HarnessResults:
        """Run all test cases against an agent (not implemented yet).

        Args:
            agent: The agent to test.
            filter_tags: Optional tags to filter which tests to run.

        Returns:
            HarnessResults with all test results.

        Raises:
            NotImplementedError: Always; this is a placeholder implementation.
        """
        # TODO: Implement test execution
        placeholder_message = (
            "Harness.run() is not yet implemented. "
            "This is a placeholder for the ThoughtFlow alpha release."
        )
        raise NotImplementedError(placeholder_message)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
@dataclass
class HarnessResults:
    """Aggregate outcome of a harness run.

    Attributes:
        results: Individual test results.
        metadata: Harness-level metadata.
    """

    results: list[TestResult] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def total_count(self) -> int:
        """How many test results were recorded."""
        return len(self.results)

    @property
    def passed_count(self) -> int:
        """How many recorded results have a truthy ``passed`` flag."""
        passing = [outcome for outcome in self.results if outcome.passed]
        return len(passing)

    @property
    def failed_count(self) -> int:
        """How many recorded results did not pass."""
        return self.total_count - self.passed_count

    @property
    def pass_rate(self) -> float:
        """Fraction of tests that passed, from 0.0 to 1.0 (0.0 when empty)."""
        total = self.total_count
        # Guard the empty suite so the division below cannot fail.
        return self.passed_count / total if total != 0 else 0.0

    def summary(self) -> dict[str, Any]:
        """Collect the headline statistics into one dictionary.

        Returns:
            Dict with the keys "total", "passed", "failed" and "pass_rate".
        """
        stats = {}
        stats["total"] = self.total_count
        stats["passed"] = self.passed_count
        stats["failed"] = self.failed_count
        stats["pass_rate"] = self.pass_rate
        return stats
|