ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
from dotenv import load_dotenv
|
|
2
|
+
load_dotenv()
|
|
3
|
+
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from functools import partial
|
|
6
|
+
from typing import Dict, List, Any, Optional
|
|
7
|
+
|
|
8
|
+
from langfuse import get_client
|
|
9
|
+
langfuse_client = get_client()
|
|
10
|
+
|
|
11
|
+
from wxo_agentic_evaluation.type import OTelParserMessage
|
|
12
|
+
from . import langgraph_parser, pydantic_parser, langflow_parser, wxo_parser
|
|
13
|
+
|
|
14
|
+
def parse_session(session_id):
    """Collect the de-duplicated messages of every trace in a session.

    The traces come from ``get_traces`` and are processed in the order it
    returns them. Each trace is parsed with ``parse_trace`` and merged into
    the running result through ``add_messages``, which drops messages whose
    hash was already seen.
    """
    session_messages: list[OTelParserMessage] = []
    for trace in get_traces(session_id).data:
        session_messages = add_messages(session_messages, parse_trace(trace))
    return session_messages
|
|
21
|
+
|
|
22
|
+
def parse_trace(trace):
    """Turn one trace into a list of parser messages.

    The framework label from ``get_agent_framework`` selects a dedicated
    parser. For the ``pydantic_ai`` label the system message (if any) is put
    first. When the label has no dedicated parser, the langflow parser and
    then the wxo parser are tried in turn; the first one that yields a
    non-empty result wins. An exception raised by a fallback parser is
    printed and the next parser is tried.
    """
    collected: list[OTelParserMessage] = []
    framework = get_agent_framework(trace)

    # Framework label -> parser taking (observation forest, flattened forest).
    dedicated_parsers = {
        'langfuse.langgraph_agent': langgraph_parser.parse_observations,
        'langfuse.langflow': partial(langflow_parser.parse_observations, dfs_callable=dfs_),
        'langgraph_agent': langgraph_parser.parse_observations,
        'pydantic_ai': pydantic_parser.parse_observations,
    }
    selected_parser = dedicated_parsers.get(framework)

    if framework == 'pydantic_ai':
        system_message = pydantic_parser.get_system_message(trace)
        if system_message:
            collected.append(system_message)

    forest = get_observations(trace.observations)
    flattened = dfs_(forest)

    if selected_parser:
        return add_messages(collected, selected_parser(forest, flattened))

    fallback_parsers = (
        partial(langflow_parser.parse_observations, dfs_callable=dfs_),
        wxo_parser.parse_observations,
    )
    for candidate in fallback_parsers:
        try:
            produced = candidate(forest, flattened)
            if not produced:
                continue
            collected = add_messages(collected, produced)
            break
        except Exception as e:
            # Best effort: report the failure and move on to the next parser.
            print(e)
    return collected
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def add_messages(messages: list["OTelParserMessage"], parsed_messages: list["OTelParserMessage"]):
    """Concatenate two message lists, keeping the first message per hash.

    Messages are visited in order (``messages`` first, then
    ``parsed_messages``). A message is kept only if the value returned by its
    ``hash()`` method has not been seen earlier in that walk. A new list is
    returned; neither input list is modified.
    """
    merged = []
    known_hashes = set()
    for candidate in (*messages, *parsed_messages):
        digest = candidate.hash()
        if digest not in known_hashes:
            known_hashes.add(digest)
            merged.append(candidate)
    return merged
|
|
69
|
+
|
|
70
|
+
def get_agent_framework(trace):
    """Return a label naming the agent framework that produced ``trace``.

    Possible return values, checked in this order:

    - ``'langfuse.langgraph_agent'``: scope name is ``langfuse-sdk`` and the
      trace is named ``LangGraph``.
    - ``'langflow'``: the metadata attributes contain
      ``langflow.project.name``.
    - ``'pydantic_ai'``: the scope name contains ``pydantic-ai``.
    - ``'langgraph_agent'``: the scope name contains
      ``openinference.instrumentation.langchain``.
    - ``'langfuse.langflow'``: some observation of the trace carries a
      ``from_langflow_component`` metadata key.
    - ``'UNKNOWN'``: none of the above matched.

    Missing (``None``) metadata, attributes, scope, scope name or observation
    metadata are treated as empty instead of raising.
    """
    metadata = trace.metadata or {}
    md_attrs = metadata.get('attributes') or {}
    scope = metadata.get('scope') or {}
    scope_name = scope.get('name') or ''

    if scope_name == 'langfuse-sdk' and trace.name == 'LangGraph':
        return 'langfuse.langgraph_agent'

    if 'langflow.project.name' in md_attrs:
        return 'langflow'
    if 'pydantic-ai' in scope_name:
        return 'pydantic_ai'
    if 'openinference.instrumentation.langchain' in scope_name:
        # TODO: need to find a better way to detect Langgraph Agent
        return 'langgraph_agent'

    # Last resort: look through the trace's observations for a langflow marker.
    observations = dfs_(get_observations(trace.observations))
    for obs in observations:
        if 'from_langflow_component' in (obs.obs.metadata or {}):
            return 'langfuse.langflow'
    return "UNKNOWN"
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def get_traces(session_id):
    """Fetch the traces of a session, ordered by their ``timestamp``.

    The response object of the trace listing call is returned as-is, with its
    ``data`` list sorted in place.

    NOTE(review): only one request with ``limit=100`` is made, so a session
    with more traces than that would presumably come back truncated. Confirm
    against the Langfuse API pagination behaviour.
    """
    def by_timestamp(trace):
        return trace.timestamp

    listing = langfuse_client.api.trace.list(session_id=session_id, limit=100)
    listing.data.sort(key=by_timestamp)
    return listing
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def get_observations(observations_ids):
    """Fetch observations by id and return them as a forest of ``ObsNode``.

    One API call is issued per id. The fetched observations are sorted by
    ``start_time`` before being handed to ``build_observation_forest``.
    """
    fetched = []
    for observation_id in observations_ids:
        fetched.append(langfuse_client.api.observations.get(observation_id))
    fetched.sort(key=lambda item: item.start_time)
    return build_observation_forest(fetched)
|
|
117
|
+
|
|
118
|
+
class ObsNode:
    """Tree node wrapping a single observation.

    ``obs`` is the wrapped observation, ``children`` the nodes attached below
    this one and ``parent`` the node above it (``None`` until one is set).
    """

    def __init__(self, obs: Any):
        self.parent: Optional["ObsNode"] = None
        self.children: List["ObsNode"] = []
        self.obs = obs

    def __repr__(self):
        wrapped = self.obs
        return "ObsNode(id={}, type={}, name={})".format(
            wrapped.id, wrapped.type, wrapped.name
        )
|
|
126
|
+
|
|
127
|
+
def build_observation_forest(observations: List[Any]) -> List[ObsNode]:
    """Return list of root nodes; each has .children forming a tree.

    Roots are the observations whose ``parent_observation_id`` is ``None``
    (or absent). Roots and children keep the order of ``observations``. An
    observation that names a parent id which is not in ``observations`` is
    neither a root nor attached anywhere, so it does not appear in the result.
    """
    # First pass: wrap every observation so parents can be looked up by id
    # regardless of where they sit in the input.
    ordered_nodes = [ObsNode(observation) for observation in observations]
    node_by_id: Dict[str, ObsNode] = {}
    for wrapped in ordered_nodes:
        node_by_id[wrapped.obs.id] = wrapped

    # Second pass: link each node to its parent, or record it as a root.
    roots = []
    for wrapped in ordered_nodes:
        parent_id = getattr(wrapped.obs, "parent_observation_id", None)
        if parent_id is None:
            roots.append(wrapped)
            continue
        parent_node = node_by_id.get(parent_id)
        if parent_node:
            parent_node.children.append(wrapped)
            wrapped.parent = parent_node
    return roots
|
|
151
|
+
|
|
152
|
+
def dfs_(observation_tree: List["ObsNode"]):
    """Flatten a forest into a list in depth-first, pre-order sequence.

    Every node is emitted before its children, and siblings keep their
    original order.
    """
    ordered = []
    # Explicit stack; entries are pushed reversed so they pop in input order.
    pending = list(observation_tree)[::-1]
    while pending:
        current = pending.pop()
        ordered.append(current)
        pending.extend(list(current.children)[::-1])
    return ordered
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
if __name__ == "__main__":
    # Manual smoke run: parses one hard-coded session id and prints the
    # resulting messages. Runs only when the module is executed as a script.
    messages = parse_session(session_id="93a24957-dd7a-425d-b821-88de49940a6e")

    print(messages)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
from typing import Literal
|
|
3
|
+
from hashlib import md5
|
|
4
|
+
|
|
5
|
+
# TODO: This is not a full chat completions message.
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclasses.dataclass
class Function:
    """Name and argument string of a called function."""

    name: str
    arguments: str

    def __str__(self):
        # Rendered as "<name>:<arguments>".
        return "{}:{}".format(self.name, self.arguments)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclasses.dataclass
class ToolCall:
    """A tool call: its id, the called function and a type tag.

    The type tag defaults to ``"function"``.
    """

    id: str
    function: "Function"
    type: Literal["function"] = "function"

    def __str__(self):
        # Rendered as "<id>:<type>:<function>".
        pieces = (self.id, self.type, self.function)
        return ":".join(format(piece) for piece in pieces)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclasses.dataclass
|
|
28
|
+
class Message:
|
|
29
|
+
role: Literal["user", "assistant", "tool", "system"]
|
|
30
|
+
content: str | None = None
|
|
31
|
+
tool_calls: list["ToolCall"] | None = None
|
|
32
|
+
tool_call_id: str | None = None
|
|
33
|
+
|
|
34
|
+
def __str__(self):
|
|
35
|
+
return f'{self.role}:{self.content}:{":".join(map(str, self.tool_calls or []))}:{self.tool_call_id}'
|
|
36
|
+
|
|
37
|
+
def hash(self):
|
|
38
|
+
return md5(self.__str__().encode("utf-8")).hexdigest()
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def get_system_message(trace):
    """Build the system message from the trace's recorded instructions.

    Reads the JSON string stored under the ``gen_ai.system_instructions``
    key of the trace metadata attributes and uses the ``content`` of its
    first entry.

    Returns:
        An ``OTelParserMessage`` with role ``system``, or ``None`` when the
        metadata, the attributes, the instruction string or the decoded
        instruction list is missing or empty.
    """
    attributes = (trace.metadata or {}).get('attributes') or {}
    sys_instruction_json = attributes.get('gen_ai.system_instructions')
    if not sys_instruction_json:
        return None
    instructions = json.loads(sys_instruction_json)
    if not instructions:
        # Nothing to index into; avoid IndexError on an empty list.
        return None
    instruction = instructions[0]['content']
    return OTelParserMessage(role='system', content=instruction, type=ContentType.text)
|
|
10
|
+
|
|
11
|
+
def parse_observations(observation_tree, dfs_observations):
    """Collect messages from every ``GENERATION`` observation.

    For each node of ``dfs_observations`` whose observation type is
    ``GENERATION``, the messages of its input are appended first, then those
    of its output. ``observation_tree`` is accepted but not used here.
    """
    collected = []
    generations = (
        node.obs for node in dfs_observations if node.obs.type == 'GENERATION'
    )
    for generation in generations:
        for payload in (generation.input, generation.output):
            collected.extend(_get_messages(payload))
    return collected
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _get_messages(data):
|
|
21
|
+
messages = []
|
|
22
|
+
for msg in data:
|
|
23
|
+
if msg['role'] == 'user':
|
|
24
|
+
parts = msg['parts']
|
|
25
|
+
for part in parts:
|
|
26
|
+
content_type = part['type']
|
|
27
|
+
if content_type == 'text':
|
|
28
|
+
content = part['content']
|
|
29
|
+
messages.append(OTelParserMessage(role='user', content=content, type=ContentType.text))
|
|
30
|
+
elif content_type == 'tool_call_response':
|
|
31
|
+
tool_call_id = part['id']
|
|
32
|
+
result = json.dumps(part['result'])
|
|
33
|
+
messages.append(OTelParserMessage(role='tool', tool_call_id=tool_call_id, content=result, type=ContentType.tool_response))
|
|
34
|
+
elif msg['role'] == 'assistant':
|
|
35
|
+
parts = msg['parts']
|
|
36
|
+
# TODO: assuming that only one message is present in the assistant parts
|
|
37
|
+
content_type = parts[0]['type']
|
|
38
|
+
if content_type == 'text':
|
|
39
|
+
content = parts[0]['content']
|
|
40
|
+
messages.append(OTelParserMessage(role='assistant', content=content, type=ContentType.text))
|
|
41
|
+
elif content_type == 'tool_call':
|
|
42
|
+
tool_calls = []
|
|
43
|
+
for part in parts:
|
|
44
|
+
tool_call_id = part['id']
|
|
45
|
+
tool_call_name = part['name']
|
|
46
|
+
tool_call_arguments = part['arguments']
|
|
47
|
+
tool_call = OTelParserToolCall(id=tool_call_id, function=OTelParserFunction(name=tool_call_name, arguments=tool_call_arguments))
|
|
48
|
+
tool_calls.append(tool_call)
|
|
49
|
+
messages.append(OTelParserMessage(role='assistant', content='', tool_calls=tool_calls, type=ContentType.tool_call))
|
|
50
|
+
return messages
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from .parser_types import ToolCall, Function
|
|
3
|
+
|
|
4
|
+
def to_tool_call(tool_call):
    """Build a ``ToolCall`` from a tool-call mapping.

    Reads the ``id`` and ``type`` keys directly. The same mapping is passed
    to ``to_function``, which reads its ``name`` and ``args`` keys.
    """
    call_id = tool_call['id']
    call_type = tool_call['type']
    return ToolCall(id=call_id, type=call_type, function=to_function(tool_call))
|
|
10
|
+
|
|
11
|
+
def to_function(func):
    """Build a ``Function`` from a mapping with ``name`` and ``args`` keys.

    The ``args`` value is stored as a JSON string.
    """
    function_name = func['name']
    serialized_args = json.dumps(func['args'])
    return Function(name=function_name, arguments=serialized_args)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
|
|
3
|
+
|
|
4
|
+
def parse_observations(observation_tree, dfs_observations):
    """Collect messages from ``invoke_agent`` parents of generations.

    For every ``GENERATION`` node whose parent observation is named
    ``invoke_agent``, the parent's input messages are appended, followed by
    the parent's output message. ``GENERATION`` nodes with no parent (roots)
    are skipped instead of raising. ``observation_tree`` is accepted but not
    used here.
    """
    messages = []
    for node in dfs_observations:
        if node.obs.type != 'GENERATION':
            continue
        parent = node.parent
        # A root node has parent None; there is no enclosing invoke_agent.
        if parent is None or parent.obs.name != 'invoke_agent':
            continue
        messages.extend(_get_input_messages(parent.obs.input))
        messages.extend(_get_output_message(parent.obs.output))
    return messages
|
|
12
|
+
|
|
13
|
+
def _get_input_messages(data):
|
|
14
|
+
messages = []
|
|
15
|
+
for msg in data['messages']:
|
|
16
|
+
if msg['type'] == 'system': messages.append(OTelParserMessage(role='system', content=msg['content'], type=ContentType.text))
|
|
17
|
+
elif msg['type'] == 'human': messages.append(OTelParserMessage(role='user', content=msg['content'], type=ContentType.text))
|
|
18
|
+
elif msg['type'] == 'tool': messages.append(OTelParserMessage(role='tool', content=msg['content'], tool_call_id=msg['tool_call_id']))
|
|
19
|
+
elif msg['type'] == 'ai':
|
|
20
|
+
if msg.get('additional_kwargs', {}).get('tool_calls', None) is not None:
|
|
21
|
+
msg_tool_calls = msg['additional_kwargs']['tool_calls']
|
|
22
|
+
tool_calls = []
|
|
23
|
+
for tc in msg_tool_calls:
|
|
24
|
+
tool_calls.append(OTelParserToolCall(id=tc['id'], function=OTelParserFunction(name=tc['function']['name'], arguments=tc['function']['arguments'])))
|
|
25
|
+
else:
|
|
26
|
+
tool_calls = None
|
|
27
|
+
messages.append(OTelParserMessage(role='assistant', content=msg['content'], tool_calls=tool_calls, type=ContentType.tool_call))
|
|
28
|
+
return messages
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _get_output_message(data):
|
|
32
|
+
role = 'assistant'
|
|
33
|
+
content = data.get('content', '')
|
|
34
|
+
if data.get('tool_calls', None):
|
|
35
|
+
tool_calls = []
|
|
36
|
+
for tc in data['tool_calls']:
|
|
37
|
+
tool_calls.append(OTelParserToolCall(id=tc['id'], function=OTelParserFunction(name=tc['name'], arguments=json.dumps(tc.get('args', {})))))
|
|
38
|
+
else: tool_calls = None
|
|
39
|
+
return [OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls)]
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
|
|
4
|
+
from wxo_agentic_evaluation.type import (
|
|
5
|
+
ContentType,
|
|
6
|
+
EvaluationData,
|
|
7
|
+
EventTypes,
|
|
8
|
+
Message,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
# Script: evaluates the first tau-bench trajectory of a JSON dump with
# EvaluationPackage and prints the resulting metrics.
# NOTE: the input path below is a developer-local absolute path; point it at
# your own trajectory file before running.
with open(
    "/Users/haodeqi/git/tau-bench/historical_trajectories/gpt-4o-airline.json",
    "r",
    encoding="utf-8",
) as f:
    test_data = json.load(f)


goal_temp = []

goals = {}
goal_details = []

# One goal per expected action. The running index makes each goal name
# unique even when the same tool is expected more than once.
for i, action in enumerate(test_data[0]["info"]["task"]["actions"]):
    goal_name = action["name"] + f"_{i}"
    goal_temp.append(goal_name)
    goal_detail = {
        "type": "tool_call",
        "name": goal_name,
        "tool_name": action["name"],
        "args": {k: str(v) for k, v in action["kwargs"].items()},
    }
    goal_details.append(goal_detail)

# Chain the goals in order: each goal maps to a list holding the next goal,
# and the last goal maps to an empty list (same shape as the single-goal case).
for current_goal, next_goal in zip(goal_temp, goal_temp[1:]):
    goals[current_goal] = [next_goal]
if goal_temp:
    goals[goal_temp[-1]] = []

gt_data = {
    "agent": "airline_agent",
    "goals": goals,
    "goal_details": goal_details,
    "story": test_data[0]["info"]["task"]["instruction"],
    "starting_sentence": "",
}
print("2")
gt_data = EvaluationData.model_validate(gt_data)

tc_name = "airline_1"

print(test_data[0]["traj"][0])

# Convert the recorded trajectory into framework Message objects.
history = []
for msg in test_data[0]["traj"]:
    if msg["role"] == "tool":
        print(msg["content"])
        history.append(
            Message(
                role=msg["role"],
                content=json.dumps(
                    {
                        "type": "tool_call",
                        "args": json.loads(msg["content"]),
                        "name": msg["name"],
                        "tool_call_id": msg["tool_call_id"],
                    }
                ),
                type=ContentType.tool_call,
                event=EventTypes.message_created,
            )
        )
    else:
        history.append(
            Message(
                role=msg["role"],
                content=str(msg["content"]),
                type=ContentType.text,
                event=EventTypes.message_created,
            )
        )

print(f"length of history {len(history)}")

evaluation_package = EvaluationPackage(
    test_case_name=tc_name,
    messages=history,
    ground_truth=gt_data,
    conversational_search_data=None,
    resource_map=None,
)
print("1")
(
    keyword_semantic_matches,
    knowledge_base_metrics,
    messages_with_reason,
    metrics,
) = evaluation_package.generate_summary()


print(metrics)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Any, Dict, List, Optional, Union
|
|
3
|
+
|
|
4
|
+
from wxo_agentic_evaluation.type import ContentType, EventTypes, Message
|
|
5
|
+
|
|
6
|
+
# with open("src/wxo_agentic_evaluation/otel_support/collie_example.json", "r") as f:
|
|
7
|
+
# data = json.load(f)
|
|
8
|
+
#
|
|
9
|
+
# otel_traces = data["calls"][-1]["messages"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def convert_otel_to_message(otel_traces):
    """Convert OTel message rows into a list of text ``Message`` objects.

    Each row must have a ``content`` key; ``role`` defaults to ``assistant``
    when absent. Every message is created with ``ContentType.text`` and
    ``EventTypes.message_created``.

    Row keys are reported through the module logger at DEBUG level rather
    than printed, so message content is no longer written to stdout.
    """
    import logging

    logger = logging.getLogger(__name__)

    history = []
    for row in otel_traces:
        logger.debug("converting OTel row with keys %s", list(row.keys()))
        content = row["content"]
        role = row.get("role", "assistant")

        history.append(
            Message(
                role=role,
                content=content,
                type=ContentType.text,
                event=EventTypes.message_created,
            )
        )

    return history
|