ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
- wxo_agentic_evaluation/analyze_run.py +1025 -220
- wxo_agentic_evaluation/annotate.py +2 -2
- wxo_agentic_evaluation/arg_configs.py +60 -2
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +19 -2
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +25 -7
- wxo_agentic_evaluation/description_quality_checker.py +29 -6
- wxo_agentic_evaluation/evaluation.py +16 -8
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +414 -69
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +5 -4
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +112 -343
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
- wxo_agentic_evaluation/metrics/metrics.py +276 -8
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
- wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +103 -4
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +33 -17
- wxo_agentic_evaluation/record_chat.py +38 -32
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
- wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +26 -17
- wxo_agentic_evaluation/service_provider/__init__.py +145 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/type.py +185 -16
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +313 -9
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
|
|
3
|
+
|
|
4
|
+
def parse_observations(observation_tree, dfs_observations, dfs_callable: callable):
    """Extract chat messages from a Langflow trace.

    Looks through the flattened observations for the first node whose
    observation is named ``AgentExecutor`` and parses that node's children.

    Args:
        observation_tree: Root nodes of the observation forest (not used here).
        dfs_observations: Observation nodes in depth-first order.
        dfs_callable: Function that flattens a list of nodes depth-first; it is
            applied to the executor's children.

    Returns:
        The messages parsed from the executor subtree, or an empty list when
        no ``AgentExecutor`` observation is present.
    """
    # The trace is assumed to contain a single AgentExecutor, so the first hit wins.
    executor = next(
        (candidate for candidate in dfs_observations if candidate.obs.name == 'AgentExecutor'),
        None,
    )
    if executor is None:
        return []
    subtree = executor.children
    return _parse_agent_executor(subtree, dfs_callable(subtree))
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _parse_agent_executor(observation_tree, dfs_observations):
|
|
13
|
+
messages = []
|
|
14
|
+
for node in dfs_observations:
|
|
15
|
+
if node.obs.type == 'GENERATION':
|
|
16
|
+
print(node.obs.id)
|
|
17
|
+
messages.extend(_get_messages(node.obs.input))
|
|
18
|
+
# get intemediate steps from parent
|
|
19
|
+
messages.extend(_get_intermediate_steps(node.parent))
|
|
20
|
+
messages.extend(_get_messages([node.obs.output]))
|
|
21
|
+
return messages
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _get_messages(data):
|
|
25
|
+
messages = []
|
|
26
|
+
for msg in data:
|
|
27
|
+
if msg['role'] == 'system': messages.append(OTelParserMessage(role='system', content=msg['content'], type=ContentType.text))
|
|
28
|
+
elif msg['role'] == 'user':
|
|
29
|
+
content = ''
|
|
30
|
+
if isinstance(msg['content'], list):
|
|
31
|
+
content = []
|
|
32
|
+
for item in msg['content']:
|
|
33
|
+
if item['type'] == ['text']: content.append(item['text'])
|
|
34
|
+
content = ' '.join(content)
|
|
35
|
+
elif isinstance(msg['content'], str):
|
|
36
|
+
content = msg['content']
|
|
37
|
+
|
|
38
|
+
messages.append(OTelParserMessage(role='user', content=content, type=ContentType.text))
|
|
39
|
+
elif msg['role'] == 'assistant':
|
|
40
|
+
content = msg['content'] or ''
|
|
41
|
+
additional_kwargs = msg.get('additional_kwargs', {})
|
|
42
|
+
tool_calls = None
|
|
43
|
+
if 'tool_calls' in additional_kwargs:
|
|
44
|
+
tool_calls = []
|
|
45
|
+
for tc in additional_kwargs['tool_calls']:
|
|
46
|
+
id_ = tc['id']
|
|
47
|
+
function = OTelParserFunction(name=tc['function']['name'], arguments=tc['function']['arguments'])
|
|
48
|
+
tool_calls.append(OTelParserToolCall(id=id_, function=function))
|
|
49
|
+
messages.append(OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls, type=ContentType.tool_call))
|
|
50
|
+
return messages
|
|
51
|
+
|
|
52
|
+
def _get_intermediate_steps(node):
|
|
53
|
+
messages = []
|
|
54
|
+
tool_calls_n_responses = node.obs.input['intermediate_steps']
|
|
55
|
+
for tc, tr in tool_calls_n_responses:
|
|
56
|
+
if 'tool' in tc and 'tool_input' in tc and 'tool_call_id' in tc:
|
|
57
|
+
tool_call_id = tc['tool_call_id']
|
|
58
|
+
if isinstance(tr, str):
|
|
59
|
+
messages.append(OTelParserMessage(role='tool', content=tr, tool_call_id=tool_call_id, type=ContentType.tool_response))
|
|
60
|
+
continue
|
|
61
|
+
elif (isinstance(tr, dict) and 'content' not in tr):
|
|
62
|
+
messages.append(OTelParserMessage(role='tool', content=json.dumps(tr), tool_call_id=tool_call_id, type=ContentType.tool_response))
|
|
63
|
+
continue
|
|
64
|
+
elif isinstance(tr, dict) and 'content' in tr:
|
|
65
|
+
content = tr['content']
|
|
66
|
+
if isinstance(content, str):
|
|
67
|
+
messages.append(OTelParserMessage(role='tool', content=content, tool_call_id=tool_call_id, type=ContentType.tool_response))
|
|
68
|
+
continue
|
|
69
|
+
elif isinstance(content, list):
|
|
70
|
+
for part in content:
|
|
71
|
+
if isinstance(part, dict) and part['type'] == 'text':
|
|
72
|
+
text = part['text']
|
|
73
|
+
if isinstance(text, dict): text = json.dumps(text)
|
|
74
|
+
messages.append(OTelParserMessage(role='tool', content=text, tool_call_id=tool_call_id, type=ContentType.tool_response))
|
|
75
|
+
continue
|
|
76
|
+
else:
|
|
77
|
+
raise ValueError(f"Unexpected part type: {type(part)} or part[type] '{part['type']}' != 'text'")
|
|
78
|
+
else:
|
|
79
|
+
raise ValueError(f"Unexpected content type: {type(content)}")
|
|
80
|
+
else:
|
|
81
|
+
raise ValueError(f"Unexpected tool response: Type: {type(tr)}, Value: {tr}")
|
|
82
|
+
|
|
83
|
+
else:
|
|
84
|
+
print('Tool Call:', tc)
|
|
85
|
+
print('Tool Response:', tr)
|
|
86
|
+
return messages
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
|
|
3
|
+
|
|
4
|
+
def parse_observations(observation_tree, dfs_observations):
    """Extract chat messages from a LangGraph trace.

    The first ``GENERATION`` observation contributes the conversation input
    (via ``_get_input_message``).  After that, every ``GENERATION`` whose
    parent is a ``CHAIN`` contributes the first message of the parent's output
    as an assistant message, and every ``TOOL`` whose parent is a ``CHAIN``
    contributes the parent's output messages as tool responses.

    Args:
        observation_tree: Root nodes of the observation forest (not used here).
        dfs_observations: Observation nodes in depth-first order.

    Returns:
        A list of ``OTelParserMessage`` in traversal order.
    """
    messages = []
    is_first_generation = True
    for node in dfs_observations:
        obs_type = node.obs.type
        if obs_type not in ('GENERATION', 'TOOL'):
            continue

        # Root observations have no parent; skip them instead of raising
        # AttributeError on ``None.obs``.
        parent = node.parent
        if obs_type == 'GENERATION' and is_first_generation:
            is_first_generation = False
            if parent is not None:
                messages.extend(_get_input_message(node))
        if parent is None or parent.obs.type != 'CHAIN':
            continue

        output_messages = parent.obs.output['messages']
        if obs_type == 'GENERATION':
            # TODO: messages is a list. confirm, we will only see one message in the list.
            msg = output_messages[0]
            content = msg['content'] or ''
            msg_type = ContentType.text
            # .get(): a message without a 'tool_calls' key is a plain reply.
            tool_calls = msg.get('tool_calls') or None
            if tool_calls is not None:
                msg_type = ContentType.tool_call
                tool_calls = [_to_tool_call(tc) for tc in tool_calls]
            messages.append(OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls, type=msg_type))
        else:
            for tool_response in output_messages:
                messages.append(OTelParserMessage(role='tool', content=tool_response['content'], tool_call_id=tool_response['tool_call_id'], type=ContentType.tool_response))
    return messages
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _get_input_message(obs_node):
|
|
32
|
+
ret = []
|
|
33
|
+
parent = obs_node.parent
|
|
34
|
+
if parent.obs.type == 'CHAIN':
|
|
35
|
+
for msg in parent.obs.input['messages']:
|
|
36
|
+
if msg['type'] == 'system': ret.append(OTelParserMessage(role='system', content=msg['content'], type=ContentType.text))
|
|
37
|
+
elif msg['type'] == 'human': ret.append(OTelParserMessage(role='user', content=msg['content'], type=ContentType.text))
|
|
38
|
+
elif msg['type'] == 'tool': ret.append(OTelParserMessage(role='tool', content=msg['content'], tool_call_id=msg['tool_call_id'], type=ContentType.tool_response))
|
|
39
|
+
elif msg['type'] == 'ai':
|
|
40
|
+
content = msg['content'] or ''
|
|
41
|
+
tool_calls = msg['tool_calls'] or None
|
|
42
|
+
msg_type = ContentType.text
|
|
43
|
+
if tool_calls is not None:
|
|
44
|
+
msg_type = ContentType.tool_call
|
|
45
|
+
tool_calls = [_to_tool_call(tc) for tc in tool_calls]
|
|
46
|
+
ret.append(OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls, type=msg_type))
|
|
47
|
+
return ret
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _to_tool_call(tool_call):
    """Wrap a tool-call dict (``id``, ``name``, ``args``) in an ``OTelParserToolCall``."""
    call_id = tool_call['id']
    function = _to_function(tool_call)
    # OTelParserToolCall expects literal 'function'
    return OTelParserToolCall(id=call_id, type='function', function=function)
|
|
56
|
+
|
|
57
|
+
def _to_function(func):
    """Build an ``OTelParserFunction`` from a dict with ``name`` and ``args``.

    The ``args`` value is serialised to a JSON string.
    """
    name = func['name']
    arguments = json.dumps(func['args'])
    return OTelParserFunction(name=name, arguments=arguments)
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
from dotenv import load_dotenv
|
|
2
|
+
load_dotenv()
|
|
3
|
+
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from functools import partial
|
|
6
|
+
from typing import Dict, List, Any, Optional
|
|
7
|
+
|
|
8
|
+
from langfuse import get_client
|
|
9
|
+
langfuse_client = get_client()
|
|
10
|
+
|
|
11
|
+
from wxo_agentic_evaluation.type import OTelParserMessage
|
|
12
|
+
from . import langgraph_parser, pydantic_parser, langflow_parser, wxo_parser
|
|
13
|
+
|
|
14
|
+
def parse_session(session_id):
    """Parse every trace of a session into one de-duplicated message list.

    Args:
        session_id: Identifier passed to ``get_traces``.

    Returns:
        The messages of all traces, merged trace by trace with ``add_messages``.
    """
    merged: list[OTelParserMessage] = []
    for trace in get_traces(session_id).data:
        merged = add_messages(merged, parse_trace(trace))
    return merged
|
|
21
|
+
|
|
22
|
+
def parse_trace(trace):
    """Parse a single trace into a list of messages.

    The agent framework reported by ``get_agent_framework`` selects the parser.
    For an unrecognised framework the Langflow and WXO parsers are tried in
    turn: the first one that returns a non-empty result is used, and any
    exception raised by an attempt is printed and the next parser is tried.

    Args:
        trace: Trace object exposing ``observations`` (observation ids).

    Returns:
        The de-duplicated messages parsed from the trace.
    """
    messages: list[OTelParserMessage] = []
    framework = get_agent_framework(trace)
    langflow_parse = partial(langflow_parser.parse_observations, dfs_callable=dfs_)

    chosen = None
    fallbacks = []
    if framework in ('langfuse.langgraph_agent', 'langgraph_agent'):
        chosen = langgraph_parser.parse_observations
    elif framework == 'langfuse.langflow':
        chosen = langflow_parse
    elif framework == 'pydantic_ai':
        chosen = pydantic_parser.parse_observations
        sys_message = pydantic_parser.get_system_message(trace)
        if sys_message:
            messages.append(sys_message)
    else:
        fallbacks = [langflow_parse, wxo_parser.parse_observations]

    observations = get_observations(trace.observations)
    dfs_observations = dfs_(observations)

    if chosen:
        return add_messages(messages, chosen(observations, dfs_observations))

    for attempt in fallbacks:
        try:
            parsed_messages = attempt(observations, dfs_observations)
            if not parsed_messages:
                continue
            messages = add_messages(messages, parsed_messages)
            break
        except Exception as e:
            print(e)
    return messages
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def add_messages(messages: list[OTelParserMessage], parsed_messages: list[OTelParserMessage]):
    """Concatenate two message lists, dropping duplicates.

    Two messages are duplicates when their ``hash()`` values are equal; the
    first occurrence is kept and input order is preserved.

    Args:
        messages: Messages collected so far.
        parsed_messages: Newly parsed messages to append.

    Returns:
        A new list with the unique messages of both inputs.
    """
    unique: list[OTelParserMessage] = []
    known_hashes: set[str] = set()
    for batch in (messages, parsed_messages):
        for candidate in batch:
            digest = candidate.hash()
            if digest not in known_hashes:
                known_hashes.add(digest)
                unique.append(candidate)
    return unique
|
|
69
|
+
|
|
70
|
+
def get_agent_framework(trace):
    """Identify which agent framework produced a trace.

    Detection order:
        1. Langfuse SDK scope with trace name ``LangGraph``
           -> ``'langfuse.langgraph_agent'``
        2. ``langflow.project.name`` attribute present -> ``'langflow'``
        3. ``pydantic-ai`` in the scope name -> ``'pydantic_ai'``
        4. ``openinference.instrumentation.langchain`` in the scope name
           -> ``'langgraph_agent'``
        5. Any observation whose metadata has ``from_langflow_component``
           -> ``'langfuse.langflow'``

    Args:
        trace: Trace object exposing ``metadata``, ``name`` and ``observations``.

    Returns:
        One of the framework identifiers above, or ``"UNKNOWN"``.
    """
    # Metadata (and its nested mappings) may be missing or None.
    metadata = trace.metadata or {}
    md_attrs = metadata.get('attributes') or {}
    scope_name = (metadata.get('scope') or {}).get('name') or ''

    if scope_name == 'langfuse-sdk' and trace.name == 'LangGraph':
        return 'langfuse.langgraph_agent'

    # NOTE(review): 'langflow' differs from the 'langfuse.langflow' identifier
    # returned below -- confirm callers handle both spellings.
    if 'langflow.project.name' in md_attrs:
        return 'langflow'
    if 'pydantic-ai' in scope_name:
        return 'pydantic_ai'
    if 'openinference.instrumentation.langchain' in scope_name:
        # TODO: need to find a better way to detect Langgraph Agent
        return 'langgraph_agent'

    # Last resort: look for Langflow markers on the trace's observations.
    for node in dfs_(get_observations(trace.observations)):
        if 'from_langflow_component' in (node.obs.metadata or {}):
            return 'langfuse.langflow'
    return "UNKNOWN"
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def get_traces(session_id, limit=100):
    """Fetch the traces of a session, sorted by timestamp.

    Args:
        session_id: Session whose traces are listed.
        limit: Maximum number of traces requested from the API.  Defaults to
            100, the value that was previously hard-coded.

    Returns:
        The API response object; its ``data`` list is sorted in place by
        ``timestamp``.
    """
    # NOTE(review): a single request is made; presumably sessions with more
    # than ``limit`` traces are truncated -- confirm whether paging is needed.
    traces = langfuse_client.api.trace.list(
        session_id=session_id,
        limit=limit,
    )
    # sort by timestamp
    traces.data.sort(key=lambda x: x.timestamp)
    return traces
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def get_observations(observations_ids):
    """Fetch observations by id and arrange them into a forest.

    Args:
        observations_ids: Iterable of observation ids to fetch one by one.

    Returns:
        The root nodes built by ``build_observation_forest`` from the
        observations sorted by ``start_time``.
    """
    fetched = []
    for obs_id in observations_ids:
        fetched.append(langfuse_client.api.observations.get(obs_id))
    fetched.sort(key=lambda item: item.start_time)
    return build_observation_forest(fetched)
|
|
117
|
+
|
|
118
|
+
class ObsNode:
    """Tree node that wraps one observation.

    ``children`` starts empty and ``parent`` starts as ``None``; both are
    filled in by whoever assembles the tree.
    """

    def __init__(self, obs: Any):
        self.obs = obs
        self.children: List["ObsNode"] = []
        self.parent: Optional["ObsNode"] = None

    def __repr__(self):
        wrapped = self.obs
        return "ObsNode(id={}, type={}, name={})".format(wrapped.id, wrapped.type, wrapped.name)
|
|
126
|
+
|
|
127
|
+
def build_observation_forest(observations: List[Any]) -> List[ObsNode]:
    """Return list of root nodes; each has .children forming a tree.

    A node is a root when its observation's ``parent_observation_id`` is
    ``None`` (or absent).  A node whose parent id is not among the given
    observations is neither attached nor returned as a root.
    """
    # One node per observation, in input order; on duplicate ids the last wins
    # in the lookup table.
    wrapped: List[ObsNode] = [ObsNode(item) for item in observations]
    by_id: Dict[str, ObsNode] = {node.obs.id: node for node in wrapped}

    roots = []
    for node in wrapped:
        parent_id = getattr(node.obs, "parent_observation_id", None)
        if parent_id is None:
            roots.append(node)
            continue
        owner = by_id.get(parent_id)
        if owner:
            owner.children.append(node)
            node.parent = owner
    return roots
|
|
151
|
+
|
|
152
|
+
def dfs_(observation_tree: List[ObsNode]):
    """Flatten a forest of nodes into a pre-order (parent before children) list."""
    ordered = []
    # Explicit stack; items are pushed reversed so they pop in original order.
    pending = list(observation_tree)[::-1]
    while pending:
        current = pending.pop()
        ordered.append(current)
        pending.extend(list(current.children)[::-1])
    return ordered
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
if __name__ == "__main__":
    import sys

    # Manual smoke test: parse the session given as the first command-line
    # argument, falling back to the sample session id used previously.
    session_id = sys.argv[1] if len(sys.argv) > 1 else "93a24957-dd7a-425d-b821-88de49940a6e"
    messages = parse_session(session_id=session_id)

    print(messages)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
from typing import Literal
|
|
3
|
+
from hashlib import md5
|
|
4
|
+
|
|
5
|
+
# TODO: This is not a full chat-completions message.
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclasses.dataclass
class Function:
    """Name and serialised arguments of a called function."""

    name: str
    arguments: str

    def __str__(self):
        # Rendered as "<name>:<arguments>".
        return "{}:{}".format(self.name, self.arguments)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclasses.dataclass
class ToolCall:
    """A single tool invocation: call id, called function and call type."""

    id: str
    function: "Function"
    type: Literal["function"] = "function"

    def __str__(self):
        # Rendered as "<id>:<type>:<function>".
        return "{}:{}:{}".format(self.id, self.type, self.function)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclasses.dataclass
|
|
28
|
+
class Message:
|
|
29
|
+
role: Literal["user", "assistant", "tool", "system"]
|
|
30
|
+
content: str | None = None
|
|
31
|
+
tool_calls: list["ToolCall"] | None = None
|
|
32
|
+
tool_call_id: str | None = None
|
|
33
|
+
|
|
34
|
+
def __str__(self):
|
|
35
|
+
return f'{self.role}:{self.content}:{":".join(map(str, self.tool_calls or []))}:{self.tool_call_id}'
|
|
36
|
+
|
|
37
|
+
def hash(self):
|
|
38
|
+
return md5(self.__str__().encode("utf-8")).hexdigest()
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def get_system_message(trace):
    """Read the system instruction stored in a trace's metadata attributes.

    Args:
        trace: Trace whose ``metadata['attributes']['gen_ai.system_instructions']``
            holds a JSON list; the ``content`` of its first entry is used.

    Returns:
        A system ``OTelParserMessage``, or ``None`` when the attribute is
        missing or empty.
    """
    attributes = trace.metadata.get('attributes', {})
    raw_instructions = attributes.get('gen_ai.system_instructions')
    if not raw_instructions:
        return None
    first_entry = json.loads(raw_instructions)[0]
    return OTelParserMessage(role='system', content=first_entry['content'], type=ContentType.text)
|
|
10
|
+
|
|
11
|
+
def parse_observations(observation_tree, dfs_observations):
    """Extract chat messages from a Pydantic AI trace.

    For every ``GENERATION`` observation, in order, the messages of its input
    are collected followed by the messages of its output.

    Args:
        observation_tree: Root nodes of the observation forest (not used here).
        dfs_observations: Observation nodes in depth-first order.

    Returns:
        A list of parsed messages.
    """
    collected = []
    generations = (entry for entry in dfs_observations if entry.obs.type == 'GENERATION')
    for generation in generations:
        for payload in (generation.obs.input, generation.obs.output):
            collected.extend(_get_messages(payload))
    return collected
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _get_messages(data):
|
|
21
|
+
messages = []
|
|
22
|
+
for msg in data:
|
|
23
|
+
if msg['role'] == 'user':
|
|
24
|
+
parts = msg['parts']
|
|
25
|
+
for part in parts:
|
|
26
|
+
content_type = part['type']
|
|
27
|
+
if content_type == 'text':
|
|
28
|
+
content = part['content']
|
|
29
|
+
messages.append(OTelParserMessage(role='user', content=content, type=ContentType.text))
|
|
30
|
+
elif content_type == 'tool_call_response':
|
|
31
|
+
tool_call_id = part['id']
|
|
32
|
+
result = json.dumps(part['result'])
|
|
33
|
+
messages.append(OTelParserMessage(role='tool', tool_call_id=tool_call_id, content=result, type=ContentType.tool_response))
|
|
34
|
+
elif msg['role'] == 'assistant':
|
|
35
|
+
parts = msg['parts']
|
|
36
|
+
# TODO: assuming that only one message is present in the assistant parts
|
|
37
|
+
content_type = parts[0]['type']
|
|
38
|
+
if content_type == 'text':
|
|
39
|
+
content = parts[0]['content']
|
|
40
|
+
messages.append(OTelParserMessage(role='assistant', content=content, type=ContentType.text))
|
|
41
|
+
elif content_type == 'tool_call':
|
|
42
|
+
tool_calls = []
|
|
43
|
+
for part in parts:
|
|
44
|
+
tool_call_id = part['id']
|
|
45
|
+
tool_call_name = part['name']
|
|
46
|
+
tool_call_arguments = part['arguments']
|
|
47
|
+
tool_call = OTelParserToolCall(id=tool_call_id, function=OTelParserFunction(name=tool_call_name, arguments=tool_call_arguments))
|
|
48
|
+
tool_calls.append(tool_call)
|
|
49
|
+
messages.append(OTelParserMessage(role='assistant', content='', tool_calls=tool_calls, type=ContentType.tool_call))
|
|
50
|
+
return messages
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from .parser_types import ToolCall, Function
|
|
3
|
+
|
|
4
|
+
def to_tool_call(tool_call):
    """Build a ``ToolCall`` from a dict with ``id``, ``type``, ``name`` and ``args``."""
    call_id = tool_call['id']
    call_type = tool_call['type']
    function = to_function(tool_call)
    return ToolCall(id=call_id, type=call_type, function=function)
|
|
10
|
+
|
|
11
|
+
def to_function(func):
    """Build a ``Function`` from a dict with ``name`` and ``args``.

    The ``args`` value is serialised to a JSON string.
    """
    name = func['name']
    arguments = json.dumps(func['args'])
    return Function(name=name, arguments=arguments)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
|
|
3
|
+
|
|
4
|
+
def parse_observations(observation_tree, dfs_observations):
    """Extract chat messages from a WXO trace.

    For every ``GENERATION`` observation whose parent observation is named
    ``invoke_agent``, the parent's input messages and output message are
    collected.

    Args:
        observation_tree: Root nodes of the observation forest (not used here).
        dfs_observations: Observation nodes in depth-first order.

    Returns:
        A list of parsed messages.
    """
    messages = []
    for node in dfs_observations:
        if node.obs.type != 'GENERATION':
            continue
        # Root observations have no parent; skip them instead of raising
        # AttributeError on ``None.obs``.
        parent = node.parent
        if parent is None or parent.obs.name != 'invoke_agent':
            continue
        messages.extend(_get_input_messages(parent.obs.input))
        messages.extend(_get_output_message(parent.obs.output))
    return messages
|
|
12
|
+
|
|
13
|
+
def _get_input_messages(data):
|
|
14
|
+
messages = []
|
|
15
|
+
for msg in data['messages']:
|
|
16
|
+
if msg['type'] == 'system': messages.append(OTelParserMessage(role='system', content=msg['content'], type=ContentType.text))
|
|
17
|
+
elif msg['type'] == 'human': messages.append(OTelParserMessage(role='user', content=msg['content'], type=ContentType.text))
|
|
18
|
+
elif msg['type'] == 'tool': messages.append(OTelParserMessage(role='tool', content=msg['content'], tool_call_id=msg['tool_call_id']))
|
|
19
|
+
elif msg['type'] == 'ai':
|
|
20
|
+
if msg.get('additional_kwargs', {}).get('tool_calls', None) is not None:
|
|
21
|
+
msg_tool_calls = msg['additional_kwargs']['tool_calls']
|
|
22
|
+
tool_calls = []
|
|
23
|
+
for tc in msg_tool_calls:
|
|
24
|
+
tool_calls.append(OTelParserToolCall(id=tc['id'], function=OTelParserFunction(name=tc['function']['name'], arguments=tc['function']['arguments'])))
|
|
25
|
+
else:
|
|
26
|
+
tool_calls = None
|
|
27
|
+
messages.append(OTelParserMessage(role='assistant', content=msg['content'], tool_calls=tool_calls, type=ContentType.tool_call))
|
|
28
|
+
return messages
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _get_output_message(data):
    """Wrap an agent output dict in a single assistant ``OTelParserMessage``.

    Args:
        data: Dict with optional ``content`` and ``tool_calls``; each tool call
            has ``id``, ``name`` and optionally ``args``.

    Returns:
        A one-element list holding the assistant message.  ``tool_calls`` is
        ``None`` when the output has no (or empty) ``tool_calls``.
    """
    role = 'assistant'
    content = data.get('content', '')
    tool_calls = None
    if data.get('tool_calls', None):
        tool_calls = [
            OTelParserToolCall(
                id=tc['id'],
                function=OTelParserFunction(name=tc['name'], arguments=json.dumps(tc.get('args', {}))),
            )
            for tc in data['tool_calls']
        ]
    return [OTelParserMessage(role=role, content=content, tool_calls=tool_calls)]
|
|
@@ -1,8 +1,17 @@
|
|
|
1
1
|
import json
|
|
2
|
+
|
|
2
3
|
from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
|
|
3
|
-
from wxo_agentic_evaluation.type import
|
|
4
|
+
from wxo_agentic_evaluation.type import (
|
|
5
|
+
ContentType,
|
|
6
|
+
EvaluationData,
|
|
7
|
+
EventTypes,
|
|
8
|
+
Message,
|
|
9
|
+
)
|
|
4
10
|
|
|
5
|
-
with open(
|
|
11
|
+
with open(
|
|
12
|
+
"/Users/haodeqi/git/tau-bench/historical_trajectories/gpt-4o-airline.json",
|
|
13
|
+
"r",
|
|
14
|
+
) as f:
|
|
6
15
|
test_data = json.load(f)
|
|
7
16
|
|
|
8
17
|
|
|
@@ -14,14 +23,19 @@ goal_details = []
|
|
|
14
23
|
i = 0
|
|
15
24
|
for action in test_data[0]["info"]["task"]["actions"]:
|
|
16
25
|
goal_temp.append(action["name"] + f"_{i}")
|
|
17
|
-
goal_detail = {
|
|
26
|
+
goal_detail = {
|
|
27
|
+
"type": "tool_call",
|
|
28
|
+
"name": action["name"] + f"_{i}",
|
|
29
|
+
"tool_name": action["name"],
|
|
30
|
+
"args": {k: str(v) for k, v in action["kwargs"].items()},
|
|
31
|
+
}
|
|
18
32
|
goal_details.append(goal_detail)
|
|
19
33
|
|
|
20
34
|
if len(goal_temp) == 1:
|
|
21
35
|
goals[goal_temp[0]] = []
|
|
22
36
|
else:
|
|
23
|
-
for i in range(len(goal_temp)-1):
|
|
24
|
-
goals.update({goal_temp[i]: goal_temp[i+1]})
|
|
37
|
+
for i in range(len(goal_temp) - 1):
|
|
38
|
+
goals.update({goal_temp[i]: goal_temp[i + 1]})
|
|
25
39
|
|
|
26
40
|
gt_data = {
|
|
27
41
|
"agent": "airline_agent",
|
|
@@ -41,10 +55,30 @@ history = []
|
|
|
41
55
|
for msg in test_data[0]["traj"]:
|
|
42
56
|
if msg["role"] == "tool":
|
|
43
57
|
print(msg["content"])
|
|
44
|
-
history.append(
|
|
45
|
-
|
|
58
|
+
history.append(
|
|
59
|
+
Message(
|
|
60
|
+
role=msg["role"],
|
|
61
|
+
content=json.dumps(
|
|
62
|
+
{
|
|
63
|
+
"type": "tool_call",
|
|
64
|
+
"args": json.loads(msg["content"]),
|
|
65
|
+
"name": msg["name"],
|
|
66
|
+
"tool_call_id": msg["tool_call_id"],
|
|
67
|
+
}
|
|
68
|
+
),
|
|
69
|
+
type=ContentType.tool_call,
|
|
70
|
+
event=EventTypes.message_created,
|
|
71
|
+
)
|
|
72
|
+
)
|
|
46
73
|
else:
|
|
47
|
-
history.append(
|
|
74
|
+
history.append(
|
|
75
|
+
Message(
|
|
76
|
+
role=msg["role"],
|
|
77
|
+
content=str(msg["content"]),
|
|
78
|
+
type=ContentType.text,
|
|
79
|
+
event=EventTypes.message_created,
|
|
80
|
+
)
|
|
81
|
+
)
|
|
48
82
|
|
|
49
83
|
print(f"length of history {history}")
|
|
50
84
|
|
|
@@ -53,7 +87,7 @@ evaluation_package = EvaluationPackage(
|
|
|
53
87
|
messages=history,
|
|
54
88
|
ground_truth=gt_data,
|
|
55
89
|
conversational_search_data=None,
|
|
56
|
-
resource_map=None
|
|
90
|
+
resource_map=None,
|
|
57
91
|
)
|
|
58
92
|
print("1")
|
|
59
93
|
(
|
|
@@ -64,4 +98,4 @@ print("1")
|
|
|
64
98
|
) = evaluation_package.generate_summary()
|
|
65
99
|
|
|
66
100
|
|
|
67
|
-
print(metrics)
|
|
101
|
+
print(metrics)
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
from typing import Any, Dict, List, Union, Optional
|
|
2
|
-
from wxo_agentic_evaluation.type import Message, ContentType, EventTypes
|
|
3
1
|
import json
|
|
2
|
+
from typing import Any, Dict, List, Optional, Union
|
|
3
|
+
|
|
4
|
+
from wxo_agentic_evaluation.type import ContentType, EventTypes, Message
|
|
4
5
|
|
|
5
6
|
# with open("src/wxo_agentic_evaluation/otel_support/collie_example.json", "r") as f:
|
|
6
7
|
# data = json.load(f)
|
|
@@ -16,6 +17,13 @@ def convert_otel_to_message(otel_traces):
|
|
|
16
17
|
print(row.keys())
|
|
17
18
|
role = row.get("role", "assistant")
|
|
18
19
|
|
|
19
|
-
history.append(
|
|
20
|
+
history.append(
|
|
21
|
+
Message(
|
|
22
|
+
role=role,
|
|
23
|
+
content=content,
|
|
24
|
+
type=ContentType.text,
|
|
25
|
+
event=EventTypes.message_created,
|
|
26
|
+
)
|
|
27
|
+
)
|
|
20
28
|
|
|
21
|
-
return history
|
|
29
|
+
return history
|