ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,163 @@
1
+ from dotenv import load_dotenv
2
+ load_dotenv()
3
+
4
+ from collections import defaultdict
5
+ from functools import partial
6
+ from typing import Dict, List, Any, Optional
7
+
8
+ from langfuse import get_client
9
+ langfuse_client = get_client()
10
+
11
+ from wxo_agentic_evaluation.type import OTelParserMessage
12
+ from . import langgraph_parser, pydantic_parser, langflow_parser, wxo_parser
13
+
14
def parse_session(session_id):
    """Build the de-duplicated message history of one session.

    The session's traces are fetched with ``get_traces``, each trace is
    parsed with ``parse_trace`` and the per-trace results are folded into a
    single list through ``add_messages``.

    Args:
        session_id: Identifier of the session whose traces are parsed.

    Returns:
        The merged list of parsed messages, in trace order.
    """
    session_traces = get_traces(session_id)
    history: list[OTelParserMessage] = []
    for single_trace in session_traces.data:
        history = add_messages(history, parse_trace(single_trace))
    return history
21
+
22
def parse_trace(trace):
    """Parse one trace into a list of messages.

    The framework reported by ``get_agent_framework`` selects a dedicated
    parser.  When no dedicated parser matches, the langflow parser and then
    the wxo parser are tried in turn; the first one that yields a non-empty
    result wins, and an exception raised by a candidate is printed and the
    next candidate is tried.

    Args:
        trace: Trace object exposing ``observations`` (observation ids).

    Returns:
        The de-duplicated messages extracted from the trace.
    """
    collected: list[OTelParserMessage] = []
    framework = get_agent_framework(trace)

    chosen_parser = None
    fallback_parsers = []
    if framework in ('langfuse.langgraph_agent', 'langgraph_agent'):
        chosen_parser = langgraph_parser.parse_observations
    elif framework == 'langfuse.langflow':
        chosen_parser = partial(langflow_parser.parse_observations, dfs_callable=dfs_)
    elif framework == 'pydantic_ai':
        chosen_parser = pydantic_parser.parse_observations
        # The pydantic-ai system prompt lives on the trace, not in the observations.
        system_message = pydantic_parser.get_system_message(trace)
        if system_message:
            collected.append(system_message)
    else:
        fallback_parsers = [
            partial(langflow_parser.parse_observations, dfs_callable=dfs_),
            wxo_parser.parse_observations,
        ]

    forest = get_observations(trace.observations)
    flattened = dfs_(forest)

    if chosen_parser:
        return add_messages(collected, chosen_parser(forest, flattened))

    for candidate in fallback_parsers:
        try:
            produced = candidate(forest, flattened)
            if not produced:
                continue
            collected = add_messages(collected, produced)
            break
        except Exception as e:
            print(e)
    return collected
53
+
54
+
55
def add_messages(messages: list[OTelParserMessage], parsed_messages: list[OTelParserMessage]):
    """Merge two message lists, dropping entries whose ``hash()`` repeats.

    Order is preserved: first the entries of ``messages``, then those of
    ``parsed_messages``.  Only the first message with a given hash is kept,
    including duplicates that occur inside ``messages`` itself.

    Returns:
        A new list; neither input list is modified.
    """
    merged: list[OTelParserMessage] = []
    known_hashes: set[str] = set()
    for batch in (messages, parsed_messages):
        for candidate in batch:
            digest = candidate.hash()
            if digest not in known_hashes:
                known_hashes.add(digest)
                merged.append(candidate)
    return merged
69
+
70
def get_agent_framework(trace):
    """Identify which agent framework produced ``trace``.

    Supported frameworks:
    - OpenInference
        - Pydantic AI
        - Langflow
        - Langgraph Agent
    - LangFuse
        - Langflow
        - LangGraph Agent

    Detection order:
    1. scope name ``langfuse-sdk`` and trace name ``LangGraph``
       -> ``'langfuse.langgraph_agent'``
    2. ``langflow.project.name`` present in the metadata attributes
       -> ``'langflow'``
    3. ``pydantic-ai`` contained in the scope name -> ``'pydantic_ai'``
    4. ``openinference.instrumentation.langchain`` contained in the scope
       name -> ``'langgraph_agent'``
    5. any observation whose metadata has ``from_langflow_component``
       -> ``'langfuse.langflow'``

    Args:
        trace: Trace object exposing ``metadata``, ``name`` and
            ``observations``.

    Returns:
        One of the identifiers above, or ``"UNKNOWN"`` when nothing matches.
    """
    # Metadata (and any of its entries) may be absent; treat None as empty
    # instead of failing on ``.get`` / ``.keys()``.
    metadata = trace.metadata or {}
    md_attrs = metadata.get('attributes') or {}
    scope = metadata.get('scope') or {}
    scope_name = scope.get('name') or ''

    if scope_name == 'langfuse-sdk' and trace.name == 'LangGraph':
        return 'langfuse.langgraph_agent'

    if 'langflow.project.name' in md_attrs:
        return 'langflow'
    if 'pydantic-ai' in scope_name:
        return 'pydantic_ai'
    if 'openinference.instrumentation.langchain' in scope_name:
        # TODO: need to find a better way to detect Langgraph Agent
        return 'langgraph_agent'

    # Last resort: look for the langflow marker on the trace's observations.
    for node in dfs_(get_observations(trace.observations)):
        if 'from_langflow_component' in (node.obs.metadata or {}):
            return 'langfuse.langflow'
    return "UNKNOWN"
100
+
101
+
102
def get_traces(session_id):
    """Fetch the traces of a session, sorted by ascending ``timestamp``.

    NOTE(review): a single request with ``limit=100`` is issued and no
    further pages are requested here.

    Args:
        session_id: Identifier of the session to list traces for.

    Returns:
        The response object of ``trace.list`` with its ``data`` sorted in place.
    """
    session_traces = langfuse_client.api.trace.list(
        limit=100,
        session_id=session_id,
    )

    def _by_timestamp(item):
        return item.timestamp

    session_traces.data.sort(key=_by_timestamp)
    return session_traces
110
+
111
+
112
def get_observations(observations_ids):
    """Fetch the given observations and arrange them as a forest.

    Every id is fetched individually, the results are ordered by
    ``start_time`` and then handed to ``build_observation_forest``.

    Args:
        observations_ids: Iterable of observation ids.

    Returns:
        The root nodes produced by ``build_observation_forest``.
    """
    fetched = []
    for observation_id in observations_ids:
        fetched.append(langfuse_client.api.observations.get(observation_id))
    ordered = sorted(fetched, key=lambda item: item.start_time)
    return build_observation_forest(ordered)
117
+
118
class ObsNode:
    """Tree node wrapping a single observation.

    ``children`` starts empty and ``parent`` starts as ``None``; both are
    filled in by whoever links the nodes together.
    """

    def __init__(self, obs: Any):
        self.obs = obs
        self.children: List["ObsNode"] = []
        self.parent: Optional["ObsNode"] = None

    def __repr__(self):
        wrapped = self.obs
        return f"ObsNode(id={wrapped.id}, type={wrapped.type}, name={wrapped.name})"
126
+
127
def build_observation_forest(observations: List[Any]) -> List[ObsNode]:
    """Link observations into trees and return the root nodes.

    An observation is a root when its ``parent_observation_id`` is ``None``
    (or the attribute is missing).  Otherwise it is attached to the node
    whose ``id`` equals that parent id.  An observation whose parent id does
    not match any observation in the input is neither attached nor returned.

    Children keep the order in which they appear in ``observations``.
    """
    wrapped = [ObsNode(observation) for observation in observations]
    # On a repeated id the later observation wins the lookup slot.
    by_id: Dict[str, ObsNode] = {node.obs.id: node for node in wrapped}

    roots: List[ObsNode] = []
    for node in wrapped:
        parent_id = getattr(node.obs, "parent_observation_id", None)
        if parent_id is None:
            roots.append(node)
            continue
        parent_node = by_id.get(parent_id)
        if parent_node:
            parent_node.children.append(node)
            node.parent = parent_node
    return roots
151
+
152
def dfs_(observation_tree: List[ObsNode]):
    """Flatten a forest into a list using pre-order depth-first traversal.

    Each node is emitted before its children; roots and children are visited
    in list order.
    """
    flattened = []

    def _visit(node):
        flattened.append(node)
        for child in node.children:
            _visit(child)

    for root in observation_tree:
        _visit(root)
    return flattened
158
+
159
+
160
if __name__ == "__main__":
    # Manual smoke run: parse one fixed session and dump the resulting messages.
    demo_session_id = "93a24957-dd7a-425d-b821-88de49940a6e"
    messages = parse_session(session_id=demo_session_id)
    print(messages)
@@ -0,0 +1,38 @@
1
+ import dataclasses
2
+ from typing import Literal
3
+ from hashlib import md5
4
+
5
+ # TODO: This is not a fully chat completions message.
6
+
7
+
8
@dataclasses.dataclass
class Function:
    """Function part of a tool call: its name and its arguments string."""

    name: str
    arguments: str

    def __str__(self):
        # Rendered as "<name>:<arguments>".
        return "{}:{}".format(self.name, self.arguments)
15
+
16
+
17
@dataclasses.dataclass
class ToolCall:
    """A tool call: call id, the invoked function and the call type."""

    id: str
    function: "Function"
    type: Literal["function"] = "function"

    def __str__(self):
        # Rendered as "<id>:<type>:<function>".
        return "{}:{}:{}".format(self.id, self.type, self.function)
25
+
26
+
27
+ @dataclasses.dataclass
28
+ class Message:
29
+ role: Literal["user", "assistant", "tool", "system"]
30
+ content: str | None = None
31
+ tool_calls: list["ToolCall"] | None = None
32
+ tool_call_id: str | None = None
33
+
34
+ def __str__(self):
35
+ return f'{self.role}:{self.content}:{":".join(map(str, self.tool_calls or []))}:{self.tool_call_id}'
36
+
37
+ def hash(self):
38
+ return md5(self.__str__().encode("utf-8")).hexdigest()
@@ -0,0 +1,50 @@
1
+ import json
2
+ from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
3
+
4
+
5
def get_system_message(trace):
    """Extract the system prompt stored on a trace, if any.

    Reads ``gen_ai.system_instructions`` from the trace metadata attributes,
    decodes it as JSON and uses the ``content`` of the first entry.

    Returns:
        A system ``OTelParserMessage``, or ``None`` when the attribute is
        missing or empty.
    """
    attributes = trace.metadata.get('attributes', {})
    raw_instructions = attributes.get('gen_ai.system_instructions')
    if raw_instructions:
        first_entry = json.loads(raw_instructions)[0]
        return OTelParserMessage(role='system', content=first_entry['content'], type=ContentType.text)
    return None
10
+
11
def parse_observations(observation_tree, dfs_observations):
    """Collect messages from every ``GENERATION`` observation.

    For each such node the messages of its ``input`` are appended first,
    then those of its ``output``.  ``observation_tree`` is not used.

    Args:
        observation_tree: Root nodes of the observation forest (unused).
        dfs_observations: Flattened nodes, each exposing ``obs``.

    Returns:
        The list of parsed messages.
    """
    collected = []
    for node in dfs_observations:
        observation = node.obs
        if observation.type != 'GENERATION':
            continue
        collected.extend(_get_messages(observation.input))
        collected.extend(_get_messages(observation.output))
    return collected
18
+
19
+
20
+ def _get_messages(data):
21
+ messages = []
22
+ for msg in data:
23
+ if msg['role'] == 'user':
24
+ parts = msg['parts']
25
+ for part in parts:
26
+ content_type = part['type']
27
+ if content_type == 'text':
28
+ content = part['content']
29
+ messages.append(OTelParserMessage(role='user', content=content, type=ContentType.text))
30
+ elif content_type == 'tool_call_response':
31
+ tool_call_id = part['id']
32
+ result = json.dumps(part['result'])
33
+ messages.append(OTelParserMessage(role='tool', tool_call_id=tool_call_id, content=result, type=ContentType.tool_response))
34
+ elif msg['role'] == 'assistant':
35
+ parts = msg['parts']
36
+ # TODO: assuming that only one message is present in the assistant parts
37
+ content_type = parts[0]['type']
38
+ if content_type == 'text':
39
+ content = parts[0]['content']
40
+ messages.append(OTelParserMessage(role='assistant', content=content, type=ContentType.text))
41
+ elif content_type == 'tool_call':
42
+ tool_calls = []
43
+ for part in parts:
44
+ tool_call_id = part['id']
45
+ tool_call_name = part['name']
46
+ tool_call_arguments = part['arguments']
47
+ tool_call = OTelParserToolCall(id=tool_call_id, function=OTelParserFunction(name=tool_call_name, arguments=tool_call_arguments))
48
+ tool_calls.append(tool_call)
49
+ messages.append(OTelParserMessage(role='assistant', content='', tool_calls=tool_calls, type=ContentType.tool_call))
50
+ return messages
@@ -0,0 +1,15 @@
1
+ import json
2
+ from .parser_types import ToolCall, Function
3
+
4
def to_tool_call(tool_call):
    """Build a ``ToolCall`` from a tool-call dict.

    ``id`` and ``type`` are read from the dict; the same dict is passed to
    ``to_function`` to build the function part.
    """
    call_id = tool_call['id']
    call_type = tool_call['type']
    return ToolCall(id=call_id, type=call_type, function=to_function(tool_call))
10
+
11
def to_function(func):
    """Build a ``Function`` from a dict with ``name`` and ``args``.

    ``args`` is JSON-encoded into the ``arguments`` string.
    """
    function_name = func['name']
    serialized_args = json.dumps(func['args'])
    return Function(name=function_name, arguments=serialized_args)
@@ -0,0 +1,39 @@
1
+ import json
2
+ from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
3
+
4
def parse_observations(observation_tree, dfs_observations):
    """Collect messages from ``GENERATION`` nodes under an ``invoke_agent`` parent.

    For every ``GENERATION`` node whose parent observation is named
    ``invoke_agent``, the parent's ``input`` messages are appended followed
    by the message built from the parent's ``output``.  ``GENERATION`` nodes
    without a parent (root nodes) are skipped.  ``observation_tree`` is not
    used.

    Args:
        observation_tree: Root nodes of the observation forest (unused).
        dfs_observations: Flattened nodes exposing ``obs`` and ``parent``.

    Returns:
        The list of parsed messages.
    """
    messages = []
    for node in dfs_observations:
        if node.obs.type != 'GENERATION':
            continue
        parent = node.parent
        # A root-level generation has no parent to inspect.
        if parent is None or parent.obs.name != 'invoke_agent':
            continue
        messages.extend(_get_input_messages(parent.obs.input))
        messages.extend(_get_output_message(parent.obs.output))
    return messages
12
+
13
+ def _get_input_messages(data):
14
+ messages = []
15
+ for msg in data['messages']:
16
+ if msg['type'] == 'system': messages.append(OTelParserMessage(role='system', content=msg['content'], type=ContentType.text))
17
+ elif msg['type'] == 'human': messages.append(OTelParserMessage(role='user', content=msg['content'], type=ContentType.text))
18
+ elif msg['type'] == 'tool': messages.append(OTelParserMessage(role='tool', content=msg['content'], tool_call_id=msg['tool_call_id']))
19
+ elif msg['type'] == 'ai':
20
+ if msg.get('additional_kwargs', {}).get('tool_calls', None) is not None:
21
+ msg_tool_calls = msg['additional_kwargs']['tool_calls']
22
+ tool_calls = []
23
+ for tc in msg_tool_calls:
24
+ tool_calls.append(OTelParserToolCall(id=tc['id'], function=OTelParserFunction(name=tc['function']['name'], arguments=tc['function']['arguments'])))
25
+ else:
26
+ tool_calls = None
27
+ messages.append(OTelParserMessage(role='assistant', content=msg['content'], tool_calls=tool_calls, type=ContentType.tool_call))
28
+ return messages
29
+
30
+
31
def _get_output_message(data):
    """Build the single assistant message described by an agent output dict.

    ``content`` defaults to ``''``.  When ``tool_calls`` is present and
    non-empty, each entry becomes a tool call whose arguments are the
    JSON-encoded ``args`` (``{}`` when missing); otherwise ``tool_calls`` is
    ``None``.

    Returns:
        A one-element list containing the assistant message.
    """
    role = 'assistant'
    content = data.get('content', '')
    raw_calls = data.get('tool_calls', None)
    tool_calls = None
    if raw_calls:
        tool_calls = [
            OTelParserToolCall(
                id=raw_call['id'],
                function=OTelParserFunction(
                    name=raw_call['name'],
                    arguments=json.dumps(raw_call.get('args', {})),
                ),
            )
            for raw_call in raw_calls
        ]
    return [OTelParserMessage(role=role, content=content, tool_calls=tool_calls)]
@@ -0,0 +1,101 @@
1
+ import json
2
+
3
+ from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
4
+ from wxo_agentic_evaluation.type import (
5
+ ContentType,
6
+ EvaluationData,
7
+ EventTypes,
8
+ Message,
9
+ )
10
+
11
# Ad-hoc script: evaluate the first recorded trajectory of a tau-bench run
# with EvaluationPackage and print the resulting metrics.
# NOTE(review): the input path is hard-coded to one developer's machine.
with open(
    "/Users/haodeqi/git/tau-bench/historical_trajectories/gpt-4o-airline.json",
    "r",
) as f:
    test_data = json.load(f)


goal_temp = []

goals = {}
goal_details = []

# Suffix each goal with its position so that repeated calls to the same tool
# get distinct goal names instead of all ending in "_0".
for i, action in enumerate(test_data[0]["info"]["task"]["actions"]):
    goal_name = action["name"] + f"_{i}"
    goal_temp.append(goal_name)
    goal_detail = {
        "type": "tool_call",
        "name": goal_name,
        "tool_name": action["name"],
        "args": {k: str(v) for k, v in action["kwargs"].items()},
    }
    goal_details.append(goal_detail)

if len(goal_temp) == 1:
    goals[goal_temp[0]] = []
else:
    # Chain the goals: each goal points at the one that follows it.
    for current_goal, next_goal in zip(goal_temp, goal_temp[1:]):
        goals[current_goal] = next_goal

gt_data = {
    "agent": "airline_agent",
    "goals": goals,
    "goal_details": goal_details,
    "story": test_data[0]["info"]["task"]["instruction"],
    "starting_sentence": "",
}
print("2")
gt_data = EvaluationData.model_validate(gt_data)

tc_name = "airline_1"

print(test_data[0]["traj"][0])

# Convert the recorded trajectory into framework Message objects.
history = []
for msg in test_data[0]["traj"]:
    if msg["role"] == "tool":
        print(msg["content"])
        history.append(
            Message(
                role=msg["role"],
                content=json.dumps(
                    {
                        "type": "tool_call",
                        "args": json.loads(msg["content"]),
                        "name": msg["name"],
                        "tool_call_id": msg["tool_call_id"],
                    }
                ),
                type=ContentType.tool_call,
                event=EventTypes.message_created,
            )
        )
    else:
        history.append(
            Message(
                role=msg["role"],
                content=str(msg["content"]),
                type=ContentType.text,
                event=EventTypes.message_created,
            )
        )

print(f"length of history {len(history)}")

evaluation_package = EvaluationPackage(
    test_case_name=tc_name,
    messages=history,
    ground_truth=gt_data,
    conversational_search_data=None,
    resource_map=None,
)
print("1")
(
    keyword_semantic_matches,
    knowledge_base_metrics,
    messages_with_reason,
    metrics,
) = evaluation_package.generate_summary()


print(metrics)
@@ -0,0 +1,29 @@
1
+ import json
2
+ from typing import Any, Dict, List, Optional, Union
3
+
4
+ from wxo_agentic_evaluation.type import ContentType, EventTypes, Message
5
+
6
+ # with open("src/wxo_agentic_evaluation/otel_support/collie_example.json", "r") as f:
7
+ # data = json.load(f)
8
+ #
9
+ # otel_traces = data["calls"][-1]["messages"]
10
+
11
+
12
def convert_otel_to_message(otel_traces):
    """Convert rows of OTel message data into text ``Message`` objects.

    Each row must provide ``content``; ``role`` falls back to
    ``"assistant"`` when absent.  Every row and its keys are printed while
    converting.

    Args:
        otel_traces: Iterable of dict rows.

    Returns:
        The list of created messages, in input order.
    """
    converted = []
    for entry in otel_traces:
        print(entry)
        text = entry["content"]
        print(entry.keys())
        speaker = entry.get("role", "assistant")
        message = Message(
            role=speaker,
            content=text,
            type=ContentType.text,
            event=EventTypes.message_created,
        )
        converted.append(message)
    return converted