ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/wxo_agentic_evaluation/llm_user_v2.py
@@ -0,0 +1,114 @@
+ from typing import List
+
+ from wxo_agentic_evaluation.base_user import BaseUserSimulator
+ from wxo_agentic_evaluation.prompt.template_render import UserTemplateRenderer
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+ from wxo_agentic_evaluation.type import ContentType, Message
+
+
+ class LLMUserV2(BaseUserSimulator):
+     def __init__(
+         self,
+         llm_client: Provider,
+         user_prompt_path: str,
+     ):
+         self.llm_client = llm_client
+         self.user_prompt_path = user_prompt_path
+         self.prompt_template = UserTemplateRenderer(
+             template_path=user_prompt_path
+         )
+
+     def _get_system_prompt(
+         self, user_story: str, user_response_style: List[str] = None
+     ) -> Message:
+         # Get the user system prompt
+         prompt_messages = self.prompt_template.render(
+             user_story=user_story,
+             user_response_style=user_response_style,
+         )
+         return Message(**prompt_messages[0], type=ContentType.text)
+
+     def _get_message_dicts(self, messages: List[Message]) -> List[dict]:
+         # Convert messages to dictionary format for the llm client
+         return [message.model_dump() for message in messages]
+
+     def _filter_conversation_history(
+         self, conversation_history: List[Message]
+     ) -> List[Message]:
+         # Filter out the agent system prompt
+         return [
+             message
+             for message in conversation_history
+             if message.role != "system"
+         ]
+
+     def flip_message_roles(self, messages: List[Message]) -> List[Message]:
+         # We flip the roles of messages in conversation history to basically prompt the
+         # user simulator with the assistant message as the user input message
+         # This helps to get the llm to respond as a natural user with the given story.
+         new_messages = []
+         for message in messages:
+             if message.role == "user":
+                 new_messages.append(
+                     Message(
+                         role="assistant",
+                         content=message.content,
+                         type=ContentType.text,
+                     )
+                 )
+             else:
+                 new_messages.append(
+                     Message(
+                         role="user",
+                         content=message.content,
+                         type=ContentType.text,
+                     )
+                 )
+         return new_messages
+
+     def generate_user_input(
+         self,
+         user_story: str,
+         conversation_history: List[Message],
+         user_response_style: List[str] = None,
+         starting_user_input: Message = None,
+         **kwargs,
+     ) -> Message:
+         # Get the user system prompt
+         system_prompt = self._get_system_prompt(user_story, user_response_style)
+
+         conversation_history = self._filter_conversation_history(
+             conversation_history
+         )
+
+         ## Adding dummy message if not provided from the simulation side.
+         if len(conversation_history) == 0:
+             conversation_history.append(
+                 Message(
+                     role="assistant",
+                     content="Hi! How can I help you today?",
+                     type=ContentType.text,
+                 )
+             )
+
+         conversation_history = self.flip_message_roles(conversation_history)
+
+         # build the conversation history with the system prompt
+         messages = [system_prompt] + conversation_history
+
+         if starting_user_input is not None:
+             # If starting user input is provided, return it as is for the initial turn
+             return starting_user_input
+         else:
+
+             # Get response from LLM for simulation
+             response = self.llm_client.chat(
+                 messages=self._get_message_dicts(messages)
+             )
+             response_message = Message(
+                 role="user",
+                 content=response.choices[0].message.content,
+                 type=ContentType.text,
+             )
+
+             return response_message
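Note: the new LLMUserV2 only relies on a chat client that returns an OpenAI-style choices[0].message.content payload, plus a user-prompt template loaded by UserTemplateRenderer. A minimal usage sketch follows; the stub client and the template path are placeholders (not part of the package), and the constructor loads the template immediately, so the path must point at a real file.

from types import SimpleNamespace

from wxo_agentic_evaluation.llm_user_v2 import LLMUserV2
from wxo_agentic_evaluation.type import ContentType, Message


class CannedChatClient:
    """Stand-in for a Provider: returns a fixed OpenAI-style chat response."""

    def chat(self, messages):
        reply = SimpleNamespace(content="I'd like to book time off next Friday.")
        return SimpleNamespace(choices=[SimpleNamespace(message=reply)])


# "user_prompt.jinja2" is a placeholder; point it at an actual user-prompt template.
simulator = LLMUserV2(llm_client=CannedChatClient(), user_prompt_path="user_prompt.jinja2")

history = [
    Message(role="assistant", content="Hi! How can I help you today?", type=ContentType.text),
]
reply = simulator.generate_user_input(
    user_story="An employee wants to request PTO for next Friday.",
    conversation_history=history,
    user_response_style=["concise"],
)
print(reply.role, reply.content)  # -> "user", plus the simulated user's message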
--- a/wxo_agentic_evaluation/main.py
+++ b/wxo_agentic_evaluation/main.py
@@ -1,383 +1,152 @@
- import csv
  import dataclasses
- import glob
  import json
  import os
- import traceback
- from concurrent.futures import ThreadPoolExecutor
+ import pathlib
+ from datetime import datetime
  from pathlib import Path
- from typing import List

- import rich
  import yaml
  from jsonargparse import CLI
- from rich.progress import Progress

  from wxo_agentic_evaluation.arg_configs import TestConfig
- from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
- from wxo_agentic_evaluation.inference_backend import (
-     EvaluationController,
-     WXOInferenceBackend,
-     get_wxo_client,
- )
- from wxo_agentic_evaluation.llm_user import LLMUser
+ from wxo_agentic_evaluation.clients import bootstrap_clients
  from wxo_agentic_evaluation.metrics.metrics import (
-     KnowledgeBaseMetricSummary,
-     TextMatchType,
-     ToolCallAndRoutingMetrics,
+     extract_metrics,
+     format_metrics_for_display,
  )
- from wxo_agentic_evaluation.prompt.template_render import (
-     LlamaUserTemplateRenderer,
+ from wxo_agentic_evaluation.runner import process_test_case
+ from wxo_agentic_evaluation.scheduler import (
+     discover_tests,
+     enumerate_jobs,
+     run_jobs,
  )
- from wxo_agentic_evaluation.resource_map import ResourceMap
- from wxo_agentic_evaluation.service_provider import get_provider
- from wxo_agentic_evaluation.type import EvaluationData
- from wxo_agentic_evaluation.utils import json_dump
  from wxo_agentic_evaluation.utils.utils import (
      SummaryPanel,
      create_table,
-     safe_divide,
+     csv_dump,
  )
-
-
- def process_test_case(
-     task_n, test_case, config, inference_backend, resource_map, llm_user
- ):
-     summary_results_for_path = []
-     tc_name = os.path.basename(test_case).replace(".json", "")
-     with open(test_case, "r") as f:
-         test_case: EvaluationData = EvaluationData.model_validate(json.load(f))
-
-     evaluation_controller = EvaluationController(
-         wxo_inference_backend=inference_backend,
-         llm_user=llm_user,
-         config=config,
-     )
-     rich.print(f"[bold magenta]Running test case: {tc_name}[/bold magenta]")
-     (
-         history,
-         call_tracker,
-         conversational_search_data,
-     ) = evaluation_controller.run(
-         task_n,
-         test_case.story,
-         agent_name=test_case.agent,
-         starting_user_input=test_case.starting_sentence,
-     )
-     result = list()
-     for message in history:
-         result.append(message.model_dump())
-
-     json_dump(
-         os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
-         result,
-     )
-
-     if len(conversational_search_data) > 0:
-         fn = tc_name + ".retrieval_context.json"
-         out_folder = Path(config.output_dir) / "knowledge_base_metrics"
-         out_folder.mkdir(exist_ok=True)
-         rc = [context.model_dump() for context in conversational_search_data]
-         json_dump(out_folder / fn, rc)
-
-     # If data annotation run, skip summary generation
-     if config.data_annotation_run:
-         return summary_results_for_path  # empty result set, skip summary
-
-     evaluation_package = EvaluationPackage(
-         test_case_name=tc_name,
-         messages=history,
-         ground_truth=test_case,
-         conversational_search_data=conversational_search_data,
-         resource_map=resource_map,
-     )
-     (
-         keyword_semantic_matches,
-         knowledge_base_metrics,
-         messages_with_reason,
-         metrics,
-     ) = evaluation_package.generate_summary()
-     temp = []
-     for message in messages_with_reason:
-         temp.append(message.model_dump())
-     json_dump(
-         os.path.join(
-             config.output_dir, "messages", tc_name + ".messages.analyze.json"
-         ),
-         temp,
-     )
-
-     json_dump(
-         os.path.join(config.output_dir, "messages", tc_name + ".metrics.json"),
-         metrics.model_dump(),
-     )
-
-     metrics.dataset_name = tc_name
-     metrics.avg_resp_time = (
-         sum(call_tracker.generic) + sum(call_tracker.tool_call)
-     ) / (len(call_tracker.generic) + len(call_tracker.tool_call))
-     metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
-
-     summary_results_for_path.append((metrics, knowledge_base_metrics))
-
-     return summary_results_for_path
+ from wxo_agentic_evaluation.langfuse_collection import LangfuseCollection
+ from wxo_agentic_evaluation.metrics.journey_success import JourneySuccessMetric
+ from wxo_agentic_evaluation.metrics.tool_calling import ToolCalling
+ from wxo_agentic_evaluation.langfuse_evaluation_package import EvaluationRunner, sample_aggregator


  def main(config: TestConfig):
-     executor = ThreadPoolExecutor(max_workers=config.num_workers)
-     if config.num_workers > 1 and config.enable_manual_user_input:
-         rich.print(
-             "[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]"
+     # setup
+     clients = bootstrap_clients(config)
+     if not getattr(config, "skip_available_results", False):
+         ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+         config.output_dir = os.path.join(config.output_dir, ts)
+
+     if not config.skip_legacy_evaluation:
+         knowledge_base_output_folder = (
+             Path(config.output_dir) / "knowledge_base_metrics"
          )
-         config.enable_manual_user_input = (
-             False  # disable manual user input for parallel execution
+         knowledge_base_output_folder.mkdir(exist_ok=True, parents=True)
+         detailed_rag_output_file = (
+             knowledge_base_output_folder / "knowledge_base_detailed_metrics.json"
          )
-     # reason: threads continue to stream messages while waiting for user input, which is not desired
-     # and the manual input prompt is not labelled properly in the UI
-     wxo_client = get_wxo_client(
-         config.auth_config.url,
-         config.auth_config.tenant_name,
-         config.auth_config.token,
-     )
-     resource_map = ResourceMap(wxo_client)
-     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
-     llm_user = LLMUser(
-         wai_client=get_provider(
-             config=config.provider_config,
-             model_id=config.llm_user_config.model_id,
-         ),
-         template=LlamaUserTemplateRenderer(
-             config.llm_user_config.prompt_config
-         ),
-         user_response_style=config.llm_user_config.user_response_style,
-     )
-
-     print(f"Running evaluation with tenant {config.auth_config.tenant_name}")
-
-     results_list = []
+         summary_rag_output_file = (
+             Path(config.output_dir) / "knowledge_base_summary_metrics.json"
+         )
+         os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)

-     knowledge_base_output_folder = (
-         Path(config.output_dir) / "knowledge_base_metrics"
+     # discover & schedule tests
+     test_cases = discover_tests(
+         config.test_paths, config.enable_recursive_search
      )
-     knowledge_base_output_folder.mkdir(exist_ok=True, parents=True)
-     detailed_rag_output_file = (
-         knowledge_base_output_folder / "knowledge_base_detailed_metrics.json"
+     jobs = enumerate_jobs(
+         test_cases,
+         config.n_runs,
+         config.skip_available_results,
+         config.output_dir,
      )
-     summary_rag_output_file = (
-         Path(config.output_dir) / "knowledge_base_summary_metrics.json"
+     results = run_jobs(
+         jobs, config, clients, process_test_case, config.num_workers
      )

-     os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)
-     available_res = set()
-     if config.skip_available_results:
-         available_res = set(
-             [
-                 os.path.basename(f).replace(".messages", "")
-                 for f in glob.glob(
-                     os.path.join(
-                         config.output_dir, "messages", "*.messages.json"
-                     )
-                 )
-             ]
-         )
-
-     test_cases = []
-     for test_path in config.test_paths:
-         if os.path.isdir(test_path):
-             test_path = os.path.join(test_path, "*.json")
-         test_cases.extend(sorted(glob.glob(test_path)))
-
-     futures = []
-     task_n = 0
-     for test_case in test_cases:
-         if not test_case.endswith(".json") or test_case.endswith("agent.json"):
-             continue
-         if config.skip_available_results:
-             if test_case in available_res:
-                 print(
-                     f"Skipping test case {test_case} as results already exist."
-                 )
-                 continue
-
-         future = executor.submit(
-             process_test_case,
-             task_n,
-             test_case,
-             config,
-             inference_backend,
-             resource_map,
-             llm_user,
-         )
-
-         futures.append((test_case, future))
-         task_n += 1
-
-     if futures:
-         with Progress() as progress:
-             task1 = progress.add_task(
-                 f"[purple]Evaluating {len(futures)} tasks...",
-                 total=len(futures),
-             )
-             for test_case, future in futures:
-                 try:
-                     results_list.extend(future.result())
-                 except Exception as e:
-                     rich.print(f"test case {test_case} fails with {e}")
-                     traceback.print_exc()
-                 finally:
-                     progress.update(task1, advance=1)
-
-     tool_call_metrics = [metric[0] for metric in results_list]
-     knowledge_base_metrics = [metric[1] for metric in results_list]
+     # extract
+     tool_metrics, kb_summary, custom_metrics = extract_metrics(results)

-     rag_metric_summary = KnowledgeBaseMetricSummary(
-         knowledge_base_metrics=knowledge_base_metrics
-     )
-     SummaryPanel(rag_metric_summary).print()
-
-     with open(detailed_rag_output_file, "w+", encoding="utf-8") as f:
-         json.dump(
-             rag_metric_summary.model_dump(by_alias=True)["detailed"],
-             f,
-             indent=4,
+     if not config.skip_legacy_evaluation:
+         # write results
+         csv_dump(
+             pathlib.Path(config.output_dir) / "summary_metrics.csv",
+             rows=[metric.model_dump() for metric in tool_metrics],
          )
-
-     with open(summary_rag_output_file, "w+", encoding="utf-8") as f:
-         json.dump(
-             rag_metric_summary.model_dump(by_alias=True)["summary"], f, indent=4
+         for file_path, key in [
+             (detailed_rag_output_file, "detailed"),
+             (summary_rag_output_file, "summary"),
+         ]:
+             with open(file_path, "w+", encoding="utf-8") as f:
+                 json.dump(kb_summary.model_dump(by_alias=True)[key], f, indent=4)
+
+         # print results
+         SummaryPanel(kb_summary).print()
+         tool_table = create_table(
+             format_metrics_for_display(tool_metrics), title="Agent Metrics"
          )
-
-     if len(tool_call_metrics) > 0:
-         # remove the average row if exist
-         tool_call_metrics = [
-             row
-             for row in tool_call_metrics
-             if row.dataset_name != "Summary (Average)"
-         ]
-
-         def filter_display_only_values(
-             tool_call_metric: ToolCallAndRoutingMetrics,
-         ):
-             row = {
-                 "Dataset": tool_call_metric.dataset_name,
-                 "Total Steps": tool_call_metric.total_steps,
-                 "LLM Steps": tool_call_metric.llm_step,
-                 "Total Tool Calls": tool_call_metric.total_tool_calls,
-                 "Tool Call Precision": tool_call_metric.tool_call_precision,
-                 "Tool Call Recall": tool_call_metric.tool_call_recall,
-                 "Agent Routing Accuracy": tool_call_metric.agent_routing_accuracy,
-                 "Text Match": tool_call_metric.text_match,
-                 "Journey Success": tool_call_metric.is_success,
-                 "Avg Resp Time (sec)": tool_call_metric.avg_resp_time,
-             }
-             return row
-
-         def create_avg_row(metrics: List[dict]):
-             avg_row = {
-                 "Dataset": "Summary (Average)",
-                 "Total Steps": 0,
-                 "LLM Steps": 0,
-                 "Total Tool Calls": 0,
-                 "Tool Call Precision": 0,
-                 "Tool Call Recall": 0,
-                 "Agent Routing Accuracy": 0,
-                 "Text Match": 0,
-                 "Journey Success": 0,
-                 "Avg Resp Time (sec)": 0,
-             }
-             if metrics:
-                 for row in metrics:
-                     avg_row["Total Steps"] += row["Total Steps"]
-                     avg_row["LLM Steps"] += row["LLM Steps"]
-                     avg_row["Total Tool Calls"] += row["Total Tool Calls"]
-                     avg_row["Tool Call Precision"] += row["Tool Call Precision"]
-                     avg_row["Tool Call Recall"] += row["Tool Call Recall"]
-                     avg_row["Agent Routing Accuracy"] += row[
-                         "Agent Routing Accuracy"
-                     ]
-                     avg_row["Text Match"] += (
-                         row["Text Match"] == TextMatchType.text_match.value
-                     )
-                     avg_row["Journey Success"] += row["Journey Success"]
-                     avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
-
-                 avg_row["Total Steps"] = round(
-                     safe_divide(avg_row["Total Steps"], len(metrics)), 2
-                 )
-                 avg_row["LLM Steps"] = round(
-                     safe_divide(avg_row["LLM Steps"], len(metrics)), 2
-                 )
-                 avg_row["Total Tool Calls"] = round(
-                     safe_divide(avg_row["Total Tool Calls"], len(metrics)), 2
-                 )
-                 avg_row["Tool Call Precision"] = round(
-                     safe_divide(avg_row["Tool Call Precision"], len(metrics)), 2
-                 )
-                 avg_row["Tool Call Recall"] = round(
-                     safe_divide(avg_row["Tool Call Recall"], len(metrics)), 2
-                 )
-                 avg_row["Agent Routing Accuracy"] = round(
-                     safe_divide(
-                         avg_row["Agent Routing Accuracy"], len(metrics)
-                     ),
-                     2,
-                 )
-                 avg_row["Text Match"] = round(
-                     safe_divide(
-                         avg_row["Text Match"],
-                         len(
-                             [
-                                 row
-                                 for row in metrics
-                                 if row["Text Match"]
-                                 != TextMatchType.text_match.na
-                             ]
-                         ),
-                     ),
-                     2,
-                 )
-                 avg_row["Journey Success"] = round(
-                     safe_divide(avg_row["Journey Success"], len(metrics)), 2
-                 )
-                 avg_row["Avg Resp Time (sec)"] = round(
-                     safe_divide(avg_row["Avg Resp Time (sec)"], len(metrics)), 2
-                 )
-             return avg_row
-
-         tool_call_metrics_for_display = []
-         for row in tool_call_metrics:
-             tool_call_metrics_for_display.append(
-                 filter_display_only_values(row)
-             )
-         tool_call_metrics_for_display.append(
-             create_avg_row(tool_call_metrics_for_display)
+         if tool_table:
+             tool_table.print()
+         if any(cm.custom_metrics for cm in custom_metrics):
+             rows = []
+             for cm in custom_metrics:
+                 row = {"dataset_name": cm.dataset_name}
+                 for m in cm.custom_metrics:
+                     row[m.eval_name] = str(
+                         m.value
+                     )  # Convert to string to avoid type issues
+                 rows.append(row)
+             custom_metrics_table = create_table(rows, title="Custom Metrics")
+             if custom_metrics_table:
+                 custom_metrics_table.print()
+     else:
+         collection_name = os.path.basename(config.output_dir) + "_collection"
+         collection = LangfuseCollection(
+             name=collection_name,
+             description="",
          )
-         tool_call_table_for_display = create_table(
-             tool_call_metrics_for_display
+         dataset_paths = []
+         session_ids = []
+         for test_case in test_cases:
+             name = os.path.basename(test_case).replace(".json", "")
+             with open(os.path.join(config.output_dir, f"{name}.metadata.json"), "r") as f:
+                 metadata = json.load(f)
+             session_id = metadata["thread_id"]
+             dataset_paths.append(test_case)
+             session_ids.append(session_id)
+
+         collection.upload(paths=dataset_paths)
+
+         langfuse_collection = LangfuseCollection(name=collection_name)
+
+         journey_sucess_metric = JourneySuccessMetric()
+         tool_calling = ToolCalling()
+
+         run = EvaluationRunner(
+             evaluation_name=os.path.basename(config.output_dir) + "_evaluation",
+             run_name=os.path.basename(config.output_dir) + "_run",
+             session_ids=session_ids,
+             collection=langfuse_collection,
+             metrics=[journey_sucess_metric, tool_calling],
+             aggregator=sample_aggregator
          )

-         if tool_call_table_for_display:
-             tool_call_table_for_display.print()
-
-         if len(tool_call_metrics) > 0:
-             tool_call_metrics = [
-                 metric.model_dump() for metric in tool_call_metrics
-             ]
-             output_file = os.path.join(config.output_dir, "summary_metrics.csv")
-             header = list(tool_call_metrics[0].keys())
-
-             with open(output_file, "w") as file:
-                 csv_writer = csv.writer(file)
-                 csv_writer.writerow(header)
-                 for entry in tool_call_metrics:
-                     csv_writer.writerow([entry[name] for name in header])
+         run.evaluate()

+     # persist config
      with open(
-         os.path.join(config.output_dir, "config.yml"), "w", encoding="utf-8"
+         pathlib.Path(config.output_dir) / "config.yml", "w", encoding="utf-8"
      ) as f:
          yaml.safe_dump(dataclasses.asdict(config), f)

-     print(f"Results saved to {config.output_dir}")
+     if not config.skip_legacy_evaluation:
+         print(f"Results saved to {config.output_dir}")
+     else:
+         print(f"Config and metadata saved to {config.output_dir}")
+         print(f"Langfuse Evaluation run completed for collection {collection_name}:")
+         for session_id in session_ids:
+             print(f" - http://localhost:3010/project/orchestrate-lite/sessions/{session_id}")


  if __name__ == "__main__":
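Note: the refactored main above delegates discovery, job enumeration, and execution to wxo_agentic_evaluation.scheduler (discover_tests, enumerate_jobs, run_jobs) and per-case work to runner.process_test_case. The sketch below only illustrates that three-stage shape with simplified stand-ins (note the _sketch suffixes); the real scheduler also honors skip_available_results, recursive search, and the on-disk result layout.

import glob
import os
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, List


def discover_tests_sketch(test_paths: List[str]) -> List[str]:
    # Collect *.json test cases from files or directories; agent.json files are not test cases.
    cases: List[str] = []
    for path in test_paths:
        pattern = os.path.join(path, "*.json") if os.path.isdir(path) else path
        cases.extend(sorted(glob.glob(pattern)))
    return [c for c in cases if c.endswith(".json") and not c.endswith("agent.json")]


def enumerate_jobs_sketch(test_cases: List[str], n_runs: int) -> List[str]:
    # One job per test case per run; the real scheduler can also skip cases with existing results.
    return [case for _ in range(n_runs) for case in test_cases]


def run_jobs_sketch(jobs: List[str], process_fn: Callable[[int, str], list], num_workers: int) -> list:
    # Fan jobs out over a thread pool and flatten the per-test-case results.
    results: list = []
    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        futures = [pool.submit(process_fn, i, job) for i, job in enumerate(jobs)]
        for future in futures:
            results.extend(future.result())
    return results


if __name__ == "__main__":
    jobs = enumerate_jobs_sketch(discover_tests_sketch(["tests/"]), n_runs=1)
    print(run_jobs_sketch(jobs, lambda i, case: [f"processed {case}"], num_workers=4))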
--- /dev/null
+++ b/wxo_agentic_evaluation/metrics/__init__.py
@@ -0,0 +1,15 @@
+ from wxo_agentic_evaluation.metrics.evaluations import Evaluation
+ from wxo_agentic_evaluation.metrics.metrics import (
+     Annotation,
+     FailedSemanticTestCases,
+     FailedStaticTestCases,
+ )
+
+ def argument_matching(expected, actual):
+     if actual is None:
+         return False
+     for field in actual:
+         if field not in expected:
+             return False
+
+     return True
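Note: argument_matching only checks that every argument name present in actual also appears in expected; values are never compared, and actual is None fails outright. Assuming the hunk above is wxo_agentic_evaluation/metrics/__init__.py (the only +15 Python file in the list above), it can be exercised like this:

from wxo_agentic_evaluation.metrics import argument_matching  # requires the 1.1.8b0 wheel

print(argument_matching({"city": "Austin", "unit": "F"}, {"city": "Dallas"}))  # True: keys of actual are a subset of expected
print(argument_matching({"city": "Austin"}, {"city": "Austin", "unit": "F"}))  # False: "unit" is missing from expected
print(argument_matching({"city": "Austin"}, None))                             # False: no actual arguments at all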
--- /dev/null
+++ b/wxo_agentic_evaluation/metrics/dummy_metric.py
@@ -0,0 +1,16 @@
+ from langfuse.api.resources.commons.types.score_data_type import ScoreDataType
+
+ from wxo_agentic_evaluation.metrics import Evaluation
+ from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric
+
+ class DummyMetric(Evaluation):
+     def __init__(self, llm_client = None):
+         super().__init__(llm_client)
+
+     def evaluate(self, messages, ground_truth, extracted_context, metadata = ..., **kwargs):
+         return LangfuseMetric(
+             eval_name="dummy_metric",
+             value=True,
+             metadata=metadata,
+             data_type="BOOLEAN",
+         )
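Note: DummyMetric is a no-op Evaluation that always scores True, which makes it a convenient starting point for custom metrics. A usage sketch, assuming LangfuseMetric exposes its constructor fields as attributes and that langfuse is installed:

from wxo_agentic_evaluation.metrics.dummy_metric import DummyMetric

metric = DummyMetric()  # no LLM client is needed for this no-op metric
score = metric.evaluate(
    messages=[],
    ground_truth=None,
    extracted_context=None,
    metadata={"run": "smoke-test"},
)
print(score.eval_name, score.value, score.data_type)  # dummy_metric True BOOLEAN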