ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,86 @@
1
+ import json
2
+ from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
3
+
4
def parse_observations(observation_tree, dfs_observations, dfs_callable: callable):
    """Parse a Langflow trace by delegating to its ``AgentExecutor`` observation.

    Args:
        observation_tree: Not read by this function.
        dfs_observations: Iterable of nodes exposing ``obs.name`` and ``children``.
        dfs_callable: Called with the AgentExecutor node's children; its result is
            passed to ``_parse_agent_executor`` as the flattened subtree.

    Returns:
        The messages parsed from the first node named ``AgentExecutor``, or an
        empty list when no such node exists.
    """
    # Only the first AgentExecutor is used; the trace is assumed to contain one.
    executor = next(
        (candidate for candidate in dfs_observations if candidate.obs.name == 'AgentExecutor'),
        None,
    )
    if executor is None:
        return []
    return _parse_agent_executor(executor.children, dfs_callable(executor.children))
10
+
11
+
12
+ def _parse_agent_executor(observation_tree, dfs_observations):
13
+ messages = []
14
+ for node in dfs_observations:
15
+ if node.obs.type == 'GENERATION':
16
+ print(node.obs.id)
17
+ messages.extend(_get_messages(node.obs.input))
18
+ # get intemediate steps from parent
19
+ messages.extend(_get_intermediate_steps(node.parent))
20
+ messages.extend(_get_messages([node.obs.output]))
21
+ return messages
22
+
23
+
24
+ def _get_messages(data):
25
+ messages = []
26
+ for msg in data:
27
+ if msg['role'] == 'system': messages.append(OTelParserMessage(role='system', content=msg['content'], type=ContentType.text))
28
+ elif msg['role'] == 'user':
29
+ content = ''
30
+ if isinstance(msg['content'], list):
31
+ content = []
32
+ for item in msg['content']:
33
+ if item['type'] == ['text']: content.append(item['text'])
34
+ content = ' '.join(content)
35
+ elif isinstance(msg['content'], str):
36
+ content = msg['content']
37
+
38
+ messages.append(OTelParserMessage(role='user', content=content, type=ContentType.text))
39
+ elif msg['role'] == 'assistant':
40
+ content = msg['content'] or ''
41
+ additional_kwargs = msg.get('additional_kwargs', {})
42
+ tool_calls = None
43
+ if 'tool_calls' in additional_kwargs:
44
+ tool_calls = []
45
+ for tc in additional_kwargs['tool_calls']:
46
+ id_ = tc['id']
47
+ function = OTelParserFunction(name=tc['function']['name'], arguments=tc['function']['arguments'])
48
+ tool_calls.append(OTelParserToolCall(id=id_, function=function))
49
+ messages.append(OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls, type=ContentType.tool_call))
50
+ return messages
51
+
52
+ def _get_intermediate_steps(node):
53
+ messages = []
54
+ tool_calls_n_responses = node.obs.input['intermediate_steps']
55
+ for tc, tr in tool_calls_n_responses:
56
+ if 'tool' in tc and 'tool_input' in tc and 'tool_call_id' in tc:
57
+ tool_call_id = tc['tool_call_id']
58
+ if isinstance(tr, str):
59
+ messages.append(OTelParserMessage(role='tool', content=tr, tool_call_id=tool_call_id, type=ContentType.tool_response))
60
+ continue
61
+ elif (isinstance(tr, dict) and 'content' not in tr):
62
+ messages.append(OTelParserMessage(role='tool', content=json.dumps(tr), tool_call_id=tool_call_id, type=ContentType.tool_response))
63
+ continue
64
+ elif isinstance(tr, dict) and 'content' in tr:
65
+ content = tr['content']
66
+ if isinstance(content, str):
67
+ messages.append(OTelParserMessage(role='tool', content=content, tool_call_id=tool_call_id, type=ContentType.tool_response))
68
+ continue
69
+ elif isinstance(content, list):
70
+ for part in content:
71
+ if isinstance(part, dict) and part['type'] == 'text':
72
+ text = part['text']
73
+ if isinstance(text, dict): text = json.dumps(text)
74
+ messages.append(OTelParserMessage(role='tool', content=text, tool_call_id=tool_call_id, type=ContentType.tool_response))
75
+ continue
76
+ else:
77
+ raise ValueError(f"Unexpected part type: {type(part)} or part[type] '{part['type']}' != 'text'")
78
+ else:
79
+ raise ValueError(f"Unexpected content type: {type(content)}")
80
+ else:
81
+ raise ValueError(f"Unexpected tool response: Type: {type(tr)}, Value: {tr}")
82
+
83
+ else:
84
+ print('Tool Call:', tc)
85
+ print('Tool Response:', tr)
86
+ return messages
@@ -0,0 +1,61 @@
1
+ import json
2
+ from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
3
+
4
def parse_observations(observation_tree, dfs_observations):
    """Parse LangGraph observations into an ordered list of chat messages.

    The first GENERATION that has a parent contributes the conversation's input
    messages. Every GENERATION whose parent is a CHAIN contributes the first
    message of that chain's output as an assistant message, and every TOOL
    whose parent is a CHAIN contributes that chain's output messages as tool
    responses. Observations without a parent are skipped.

    Args:
        observation_tree: Not read by this function.
        dfs_observations: Iterable of nodes exposing ``obs`` and ``parent``.

    Returns:
        A list of ``OTelParserMessage`` objects in traversal order.
    """
    messages = []
    is_first_generation = True
    for obs in dfs_observations:
        obs_type = obs.obs.type
        if obs_type not in ('GENERATION', 'TOOL'):
            continue
        parent = obs.parent
        if parent is None:
            # Root-level observation: there is no enclosing chain to read from.
            continue
        if obs_type == 'GENERATION':
            if is_first_generation:
                messages.extend(_get_input_message(obs))
                is_first_generation = False
            if parent.obs.type == 'CHAIN':
                # TODO: messages is a list; confirm only one message is ever present.
                msg = parent.obs.output['messages'][0]
                content = msg['content'] or ''
                msg_type = ContentType.text
                tool_calls = msg.get('tool_calls') or None
                if tool_calls is not None:
                    msg_type = ContentType.tool_call
                    tool_calls = [_to_tool_call(tc) for tc in tool_calls]
                messages.append(OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls, type=msg_type))
        elif parent.obs.type == 'CHAIN':
            for tool_response in parent.obs.output['messages']:
                messages.append(OTelParserMessage(role='tool', content=tool_response['content'], tool_call_id=tool_response['tool_call_id'], type=ContentType.tool_response))
    return messages
29
+
30
+
31
+ def _get_input_message(obs_node):
32
+ ret = []
33
+ parent = obs_node.parent
34
+ if parent.obs.type == 'CHAIN':
35
+ for msg in parent.obs.input['messages']:
36
+ if msg['type'] == 'system': ret.append(OTelParserMessage(role='system', content=msg['content'], type=ContentType.text))
37
+ elif msg['type'] == 'human': ret.append(OTelParserMessage(role='user', content=msg['content'], type=ContentType.text))
38
+ elif msg['type'] == 'tool': ret.append(OTelParserMessage(role='tool', content=msg['content'], tool_call_id=msg['tool_call_id'], type=ContentType.tool_response))
39
+ elif msg['type'] == 'ai':
40
+ content = msg['content'] or ''
41
+ tool_calls = msg['tool_calls'] or None
42
+ msg_type = ContentType.text
43
+ if tool_calls is not None:
44
+ msg_type = ContentType.tool_call
45
+ tool_calls = [_to_tool_call(tc) for tc in tool_calls]
46
+ ret.append(OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls, type=msg_type))
47
+ return ret
48
+
49
+
50
def _to_tool_call(tool_call):
    """Build an ``OTelParserToolCall`` from a tool-call dict.

    Reads ``tool_call['id']`` here and passes the same dict to
    ``_to_function`` for the function name and arguments.
    """
    call_id = tool_call['id']
    function = _to_function(tool_call)
    # OTelParserToolCall expects literal 'function'
    return OTelParserToolCall(id=call_id, type='function', function=function)
56
+
57
def _to_function(func):
    """Build an ``OTelParserFunction`` from a dict with ``name`` and ``args``.

    ``args`` is serialized with ``json.dumps`` before being stored as the
    function's ``arguments``.
    """
    function_name = func['name']
    serialized_args = json.dumps(func['args'])
    return OTelParserFunction(name=function_name, arguments=serialized_args)
@@ -0,0 +1,163 @@
1
+ from dotenv import load_dotenv
2
+ load_dotenv()
3
+
4
+ from collections import defaultdict
5
+ from functools import partial
6
+ from typing import Dict, List, Any, Optional
7
+
8
+ from langfuse import get_client
9
+ langfuse_client = get_client()
10
+
11
+ from wxo_agentic_evaluation.type import OTelParserMessage
12
+ from . import langgraph_parser, pydantic_parser, langflow_parser, wxo_parser
13
+
14
def parse_session(session_id):
    """Parse every trace of a session into one de-duplicated message list.

    Traces come from ``get_traces``; each is parsed with ``parse_trace`` and
    merged into the running result through ``add_messages``.
    """
    collected: list[OTelParserMessage] = []
    for trace in get_traces(session_id).data:
        collected = add_messages(collected, parse_trace(trace))
    return collected
21
+
22
def parse_trace(trace):
    """Parse a single trace into messages using a framework-specific parser.

    The framework reported by ``get_agent_framework`` selects a parser. When no
    parser is registered for it, the Langflow and WXO parsers are tried in that
    order; the first one that returns a non-empty result wins, and a parser
    that raises has its exception printed before the next one is tried.

    Returns:
        The de-duplicated messages for the trace (possibly empty).
    """
    framework = get_agent_framework(trace)
    langflow_parse = partial(langflow_parser.parse_observations, dfs_callable=dfs_)
    # NOTE(review): get_agent_framework can also return 'langflow'; it has no
    # entry here and therefore goes through the fallback path — confirm intended.
    parsers_by_framework = {
        'langfuse.langgraph_agent': langgraph_parser.parse_observations,
        'langfuse.langflow': langflow_parse,
        'langgraph_agent': langgraph_parser.parse_observations,
        'pydantic_ai': pydantic_parser.parse_observations,
    }
    selected = parsers_by_framework.get(framework)

    collected: list[OTelParserMessage] = []
    if framework == 'pydantic_ai':
        # Pydantic AI keeps the system prompt on the trace, not in observations.
        system_message = pydantic_parser.get_system_message(trace)
        if system_message:
            collected.append(system_message)

    forest = get_observations(trace.observations)
    flattened = dfs_(forest)

    if selected:
        return add_messages(collected, selected(forest, flattened))

    for candidate in (langflow_parse, wxo_parser.parse_observations):
        try:
            parsed = candidate(forest, flattened)
            if not parsed:
                continue
            collected = add_messages(collected, parsed)
            break
        except Exception as e:
            print(e)
    return collected
53
+
54
+
55
def add_messages(messages: list[OTelParserMessage], parsed_messages: list[OTelParserMessage]):
    """Concatenate two message lists, dropping entries whose ``hash()`` repeats.

    Order is preserved: ``messages`` first, then ``parsed_messages``. The first
    occurrence of each hash is kept, including within ``messages`` itself.

    Returns:
        A new list; neither input list is modified.
    """
    merged: list[OTelParserMessage] = []
    fingerprints: set[str] = set()
    for batch in (messages, parsed_messages):
        for candidate in batch:
            fingerprint = candidate.hash()
            if fingerprint not in fingerprints:
                fingerprints.add(fingerprint)
                merged.append(candidate)
    return merged
69
+
70
def get_agent_framework(trace):
    """Identify which agent framework produced a trace.

    Checks, in order:

    1. scope name ``'langfuse-sdk'`` with trace name ``'LangGraph'``
       -> ``'langfuse.langgraph_agent'``
    2. ``'langflow.project.name'`` among the metadata attributes -> ``'langflow'``
    3. scope name containing ``'pydantic-ai'`` -> ``'pydantic_ai'``
    4. scope name containing ``'openinference.instrumentation.langchain'``
       -> ``'langgraph_agent'``
    5. any observation whose metadata has ``'from_langflow_component'``
       -> ``'langfuse.langflow'``

    Missing or ``None`` metadata, scope, scope name and observation metadata
    are treated as empty.

    Args:
        trace: Trace object exposing ``metadata``, ``name`` and ``observations``.

    Returns:
        One of the identifiers above, or ``"UNKNOWN"`` when nothing matches.
    """
    metadata = trace.metadata or {}
    md_attrs = metadata.get('attributes') or {}
    scope = metadata.get('scope') or {}
    scope_name = scope.get('name') or ''

    if scope_name == 'langfuse-sdk' and trace.name == 'LangGraph':
        return 'langfuse.langgraph_agent'

    if 'langflow.project.name' in md_attrs:
        return 'langflow'
    if 'pydantic-ai' in scope_name:
        return 'pydantic_ai'
    if 'openinference.instrumentation.langchain' in scope_name:
        # TODO: need to find a better way to detect Langgraph Agent
        return 'langgraph_agent'

    # Last resort: look for Langflow component markers on the observations.
    for node in dfs_(get_observations(trace.observations)):
        if 'from_langflow_component' in (node.obs.metadata or {}):
            return 'langfuse.langflow'
    return "UNKNOWN"
100
+
101
+
102
def get_traces(session_id):
    """Fetch the traces of a session and order them by ascending timestamp.

    The listing is requested with ``limit=100`` and its ``data`` list is sorted
    in place before the response object is returned.
    """
    # NOTE(review): only a single request with limit=100 is made; sessions with
    # more traces are presumably truncated — confirm against the Langfuse API.
    session_traces = langfuse_client.api.trace.list(session_id=session_id, limit=100)
    session_traces.data.sort(key=lambda trace: trace.timestamp)
    return session_traces
110
+
111
+
112
def get_observations(observations_ids):
    """Fetch observations by id and return them as a forest of ``ObsNode`` roots.

    Each id is fetched individually; the observations are ordered by
    ``start_time`` before being linked by ``build_observation_forest``.
    """
    fetched = []
    for observation_id in observations_ids:
        fetched.append(langfuse_client.api.observations.get(observation_id))
    chronological = sorted(fetched, key=lambda observation: observation.start_time)
    return build_observation_forest(chronological)
117
+
118
class ObsNode:
    """Tree node wrapping one observation together with its parent/child links."""

    def __init__(self, obs: Any):
        # The wrapped observation; children and parent are filled in later
        # by whoever builds the tree.
        self.obs = obs
        self.children: List["ObsNode"] = []
        self.parent: Optional["ObsNode"] = None

    def __repr__(self):
        wrapped = self.obs
        return "ObsNode(id={}, type={}, name={})".format(wrapped.id, wrapped.type, wrapped.name)
126
+
127
def build_observation_forest(observations: List[Any]) -> List[ObsNode]:
    """Return list of root nodes; each has .children forming a tree.

    An observation is a root when its ``parent_observation_id`` is ``None`` (or
    absent). Children are attached in the order the observations are given.
    """
    index: Dict[str, ObsNode] = {}
    pending = []

    # Pass 1: wrap every observation and remember which parent it points to.
    for observation in observations:
        wrapped = ObsNode(observation)
        index[observation.id] = wrapped
        pending.append((getattr(observation, "parent_observation_id", None), wrapped))

    # Pass 2: link each node to its parent, or collect it as a root.
    roots: List[ObsNode] = []
    for parent_id, wrapped in pending:
        if parent_id is None:
            roots.append(wrapped)
            continue
        owner = index.get(parent_id)
        if owner:
            owner.children.append(wrapped)
            wrapped.parent = owner
        # NOTE(review): a node whose parent id is not among the given
        # observations is neither attached nor returned as a root.
    return roots
151
+
152
def dfs_(observation_tree: List[ObsNode]):
    """Flatten a forest into a list in depth-first pre-order.

    Each node appears before its descendants, and siblings keep their order.
    """
    ordered = []
    # Explicit stack; entries are pushed reversed so the leftmost pops first.
    stack = list(reversed(observation_tree))
    while stack:
        current = stack.pop()
        ordered.append(current)
        stack.extend(reversed(current.children))
    return ordered
158
+
159
+
160
if __name__ == "__main__":
    # Ad-hoc manual run: parse one hard-coded session and print its messages.
    messages = parse_session(
        session_id="93a24957-dd7a-425d-b821-88de49940a6e",
    )
    print(messages)
@@ -0,0 +1,38 @@
1
+ import dataclasses
2
+ from typing import Literal
3
+ from hashlib import md5
4
+
5
+ # TODO: This is not a fully chat completions message.
6
+
7
+
8
@dataclasses.dataclass
class Function:
    """Name and argument string of a function referenced by a tool call."""

    name: str
    arguments: str

    def __str__(self):
        # Rendered as "<name>:<arguments>".
        return "{}:{}".format(self.name, self.arguments)
15
+
16
+
17
@dataclasses.dataclass
class ToolCall:
    """A tool call: its id, the function it targets, and the fixed type tag."""

    id: str
    function: "Function"
    type: Literal["function"] = "function"

    def __str__(self):
        # Rendered as "<id>:<type>:<function>".
        return "{}:{}:{}".format(self.id, self.type, self.function)
25
+
26
+
27
+ @dataclasses.dataclass
28
+ class Message:
29
+ role: Literal["user", "assistant", "tool", "system"]
30
+ content: str | None = None
31
+ tool_calls: list["ToolCall"] | None = None
32
+ tool_call_id: str | None = None
33
+
34
+ def __str__(self):
35
+ return f'{self.role}:{self.content}:{":".join(map(str, self.tool_calls or []))}:{self.tool_call_id}'
36
+
37
+ def hash(self):
38
+ return md5(self.__str__().encode("utf-8")).hexdigest()
@@ -0,0 +1,50 @@
1
+ import json
2
+ from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
3
+
4
+
5
def get_system_message(trace):
    """Extract the system prompt stored on a Pydantic AI trace, if any.

    Reads the ``gen_ai.system_instructions`` metadata attribute, decodes it as
    JSON and uses the ``content`` of its first element.

    Returns:
        A system ``OTelParserMessage``, or ``None`` when the attribute is
        missing or empty.
    """
    attributes = trace.metadata.get('attributes', {})
    raw_instructions = attributes.get('gen_ai.system_instructions')
    if raw_instructions:
        first_instruction = json.loads(raw_instructions)[0]
        return OTelParserMessage(role='system', content=first_instruction['content'], type=ContentType.text)
    return None
10
+
11
def parse_observations(observation_tree, dfs_observations):
    """Parse the input and output of every GENERATION observation.

    Args:
        observation_tree: Not read by this function.
        dfs_observations: Iterable of nodes exposing ``obs``.

    Returns:
        Messages from each generation's input followed by its output, in
        traversal order.
    """
    collected = []
    generations = (node.obs for node in dfs_observations if node.obs.type == 'GENERATION')
    for generation in generations:
        collected += _get_messages(generation.input)
        collected += _get_messages(generation.output)
    return collected
18
+
19
+
20
+ def _get_messages(data):
21
+ messages = []
22
+ for msg in data:
23
+ if msg['role'] == 'user':
24
+ parts = msg['parts']
25
+ for part in parts:
26
+ content_type = part['type']
27
+ if content_type == 'text':
28
+ content = part['content']
29
+ messages.append(OTelParserMessage(role='user', content=content, type=ContentType.text))
30
+ elif content_type == 'tool_call_response':
31
+ tool_call_id = part['id']
32
+ result = json.dumps(part['result'])
33
+ messages.append(OTelParserMessage(role='tool', tool_call_id=tool_call_id, content=result, type=ContentType.tool_response))
34
+ elif msg['role'] == 'assistant':
35
+ parts = msg['parts']
36
+ # TODO: assuming that only one message is present in the assistant parts
37
+ content_type = parts[0]['type']
38
+ if content_type == 'text':
39
+ content = parts[0]['content']
40
+ messages.append(OTelParserMessage(role='assistant', content=content, type=ContentType.text))
41
+ elif content_type == 'tool_call':
42
+ tool_calls = []
43
+ for part in parts:
44
+ tool_call_id = part['id']
45
+ tool_call_name = part['name']
46
+ tool_call_arguments = part['arguments']
47
+ tool_call = OTelParserToolCall(id=tool_call_id, function=OTelParserFunction(name=tool_call_name, arguments=tool_call_arguments))
48
+ tool_calls.append(tool_call)
49
+ messages.append(OTelParserMessage(role='assistant', content='', tool_calls=tool_calls, type=ContentType.tool_call))
50
+ return messages
@@ -0,0 +1,15 @@
1
+ import json
2
+ from .parser_types import ToolCall, Function
3
+
4
def to_tool_call(tool_call):
    """Build a ``ToolCall`` from a dict with ``id``, ``type``, ``name`` and ``args``.

    The same dict is passed to ``to_function`` to build the nested function.
    """
    call_id = tool_call['id']
    call_type = tool_call['type']
    function = to_function(tool_call)
    return ToolCall(id=call_id, type=call_type, function=function)
10
+
11
def to_function(func):
    """Build a ``Function`` from a dict with ``name`` and ``args``.

    ``args`` is serialized with ``json.dumps`` to form the argument string.
    """
    function_name = func['name']
    serialized_args = json.dumps(func['args'])
    return Function(name=function_name, arguments=serialized_args)
@@ -0,0 +1,39 @@
1
+ import json
2
+ from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
3
+
4
def parse_observations(observation_tree, dfs_observations):
    """Parse WXO observations into chat messages.

    For every GENERATION observation whose parent is named ``invoke_agent``,
    the parent's input messages and output message are appended. GENERATION
    observations without a parent are skipped.

    Args:
        observation_tree: Not read by this function.
        dfs_observations: Iterable of nodes exposing ``obs`` and ``parent``.

    Returns:
        A list of ``OTelParserMessage`` objects in traversal order.
    """
    messages = []
    for node in dfs_observations:
        if node.obs.type != 'GENERATION':
            continue
        parent = node.parent
        # A root-level generation has no invoke_agent span to read from.
        if parent is None or parent.obs.name != 'invoke_agent':
            continue
        messages.extend(_get_input_messages(parent.obs.input))
        messages.extend(_get_output_message(parent.obs.output))
    return messages
12
+
13
+ def _get_input_messages(data):
14
+ messages = []
15
+ for msg in data['messages']:
16
+ if msg['type'] == 'system': messages.append(OTelParserMessage(role='system', content=msg['content'], type=ContentType.text))
17
+ elif msg['type'] == 'human': messages.append(OTelParserMessage(role='user', content=msg['content'], type=ContentType.text))
18
+ elif msg['type'] == 'tool': messages.append(OTelParserMessage(role='tool', content=msg['content'], tool_call_id=msg['tool_call_id']))
19
+ elif msg['type'] == 'ai':
20
+ if msg.get('additional_kwargs', {}).get('tool_calls', None) is not None:
21
+ msg_tool_calls = msg['additional_kwargs']['tool_calls']
22
+ tool_calls = []
23
+ for tc in msg_tool_calls:
24
+ tool_calls.append(OTelParserToolCall(id=tc['id'], function=OTelParserFunction(name=tc['function']['name'], arguments=tc['function']['arguments'])))
25
+ else:
26
+ tool_calls = None
27
+ messages.append(OTelParserMessage(role='assistant', content=msg['content'], tool_calls=tool_calls, type=ContentType.tool_call))
28
+ return messages
29
+
30
+
31
def _get_output_message(data):
    """Wrap an ``invoke_agent`` output into a single-element assistant message list.

    ``content`` defaults to ``''``. A non-empty ``tool_calls`` list is converted
    to ``OTelParserToolCall`` objects whose arguments are the JSON-encoded
    ``args`` (``{}`` when absent); otherwise ``tool_calls`` is ``None``.
    """
    content = data.get('content', '')
    raw_calls = data.get('tool_calls', None)
    tool_calls = None
    if raw_calls:
        tool_calls = [
            OTelParserToolCall(
                id=tc['id'],
                function=OTelParserFunction(name=tc['name'], arguments=json.dumps(tc.get('args', {}))),
            )
            for tc in raw_calls
        ]
    # NOTE(review): no `type` is passed, so the message keeps whatever default
    # OTelParserMessage defines — confirm that is the intended content type.
    return [OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls)]
@@ -1,8 +1,17 @@
1
1
  import json
2
+
2
3
  from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
3
- from wxo_agentic_evaluation.type import EvaluationData, Message, EventTypes, ContentType
4
+ from wxo_agentic_evaluation.type import (
5
+ ContentType,
6
+ EvaluationData,
7
+ EventTypes,
8
+ Message,
9
+ )
4
10
 
5
- with open("/Users/haodeqi/git/tau-bench/historical_trajectories/gpt-4o-airline.json", "r") as f:
11
+ with open(
12
+ "/Users/haodeqi/git/tau-bench/historical_trajectories/gpt-4o-airline.json",
13
+ "r",
14
+ ) as f:
6
15
  test_data = json.load(f)
7
16
 
8
17
 
@@ -14,14 +23,19 @@ goal_details = []
14
23
  i = 0
15
24
  for action in test_data[0]["info"]["task"]["actions"]:
16
25
  goal_temp.append(action["name"] + f"_{i}")
17
- goal_detail = {"type": "tool_call", "name": action["name"] + f"_{i}", "tool_name": action["name"], "args": {k: str(v) for k,v in action["kwargs"].items()}}
26
+ goal_detail = {
27
+ "type": "tool_call",
28
+ "name": action["name"] + f"_{i}",
29
+ "tool_name": action["name"],
30
+ "args": {k: str(v) for k, v in action["kwargs"].items()},
31
+ }
18
32
  goal_details.append(goal_detail)
19
33
 
20
34
  if len(goal_temp) == 1:
21
35
  goals[goal_temp[0]] = []
22
36
  else:
23
- for i in range(len(goal_temp)-1):
24
- goals.update({goal_temp[i]: goal_temp[i+1]})
37
+ for i in range(len(goal_temp) - 1):
38
+ goals.update({goal_temp[i]: goal_temp[i + 1]})
25
39
 
26
40
  gt_data = {
27
41
  "agent": "airline_agent",
@@ -41,10 +55,30 @@ history = []
41
55
  for msg in test_data[0]["traj"]:
42
56
  if msg["role"] == "tool":
43
57
  print(msg["content"])
44
- history.append(Message(role=msg["role"], content=json.dumps({"type": "tool_call", "args": json.loads(msg["content"]), "name": msg["name"], "tool_call_id": msg["tool_call_id"]}), type=ContentType.tool_call,
45
- event=EventTypes.message_created))
58
+ history.append(
59
+ Message(
60
+ role=msg["role"],
61
+ content=json.dumps(
62
+ {
63
+ "type": "tool_call",
64
+ "args": json.loads(msg["content"]),
65
+ "name": msg["name"],
66
+ "tool_call_id": msg["tool_call_id"],
67
+ }
68
+ ),
69
+ type=ContentType.tool_call,
70
+ event=EventTypes.message_created,
71
+ )
72
+ )
46
73
  else:
47
- history.append(Message(role=msg["role"], content=str(msg["content"]), type=ContentType.text, event=EventTypes.message_created))
74
+ history.append(
75
+ Message(
76
+ role=msg["role"],
77
+ content=str(msg["content"]),
78
+ type=ContentType.text,
79
+ event=EventTypes.message_created,
80
+ )
81
+ )
48
82
 
49
83
  print(f"length of history {history}")
50
84
 
@@ -53,7 +87,7 @@ evaluation_package = EvaluationPackage(
53
87
  messages=history,
54
88
  ground_truth=gt_data,
55
89
  conversational_search_data=None,
56
- resource_map=None
90
+ resource_map=None,
57
91
  )
58
92
  print("1")
59
93
  (
@@ -64,4 +98,4 @@ print("1")
64
98
  ) = evaluation_package.generate_summary()
65
99
 
66
100
 
67
- print(metrics)
101
+ print(metrics)
@@ -1,6 +1,7 @@
1
- from typing import Any, Dict, List, Union, Optional
2
- from wxo_agentic_evaluation.type import Message, ContentType, EventTypes
3
1
  import json
2
+ from typing import Any, Dict, List, Optional, Union
3
+
4
+ from wxo_agentic_evaluation.type import ContentType, EventTypes, Message
4
5
 
5
6
  # with open("src/wxo_agentic_evaluation/otel_support/collie_example.json", "r") as f:
6
7
  # data = json.load(f)
@@ -16,6 +17,13 @@ def convert_otel_to_message(otel_traces):
16
17
  print(row.keys())
17
18
  role = row.get("role", "assistant")
18
19
 
19
- history.append(Message(role = role, content= content, type=ContentType.text, event=EventTypes.message_created))
20
+ history.append(
21
+ Message(
22
+ role=role,
23
+ content=content,
24
+ type=ContentType.text,
25
+ event=EventTypes.message_created,
26
+ )
27
+ )
20
28
 
21
- return history
29
+ return history