ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/main.py (modified, +128 -246)
@@ -1,270 +1,152 @@
- from wxo_agentic_evaluation.service_provider import get_provider
- from wxo_agentic_evaluation.resource_map import ResourceMap
- from wxo_agentic_evaluation.llm_user import LLMUser
- from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
- from wxo_agentic_evaluation.inference_backend import (
-     EvaluationController,
-     get_wxo_client,
-     WXOInferenceBackend
- )
- from typing import List
- from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
- from wxo_agentic_evaluation.type import EvaluationData
+ import dataclasses
+ import json
+ import os
+ import pathlib
+ from datetime import datetime
+ from pathlib import Path
+
+ import yaml
+ from jsonargparse import CLI

  from wxo_agentic_evaluation.arg_configs import TestConfig
+ from wxo_agentic_evaluation.clients import bootstrap_clients
+ from wxo_agentic_evaluation.metrics.metrics import (
+     extract_metrics,
+     format_metrics_for_display,
+ )
+ from wxo_agentic_evaluation.runner import process_test_case
+ from wxo_agentic_evaluation.scheduler import (
+     discover_tests,
+     enumerate_jobs,
+     run_jobs,
+ )
  from wxo_agentic_evaluation.utils.utils import (
-     create_table,
      SummaryPanel,
-     safe_divide
+     create_table,
+     csv_dump,
  )
- from wxo_agentic_evaluation.utils import json_dump
- from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary, ToolCallAndRoutingMetrics, TextMatchType
- import os
- import json
- import traceback
- import yaml
- import dataclasses
- import glob
- import rich
- import csv
- from rich.progress import Progress
- from pathlib import Path
- from concurrent.futures import ThreadPoolExecutor
- from jsonargparse import CLI
-
+ from wxo_agentic_evaluation.langfuse_collection import LangfuseCollection
+ from wxo_agentic_evaluation.metrics.journey_success import JourneySuccessMetric
+ from wxo_agentic_evaluation.metrics.tool_calling import ToolCalling
+ from wxo_agentic_evaluation.langfuse_evaluation_package import EvaluationRunner, sample_aggregator

- def process_test_case(task_n, test_case, config, inference_backend, resource_map, llm_user):
-     summary_results_for_path = []
-     tc_name = os.path.basename(test_case).replace(".json", "")
-     with open(test_case, "r") as f:
-         test_case: EvaluationData = EvaluationData.model_validate(json.load(f))

-     evaluation_controller = EvaluationController(
-         wxo_inference_backend=inference_backend, llm_user=llm_user, config=config
-     )
-     rich.print(f"[bold magenta]Running test case: {tc_name}[/bold magenta]")
-     history, call_tracker, conversational_search_data = evaluation_controller.run(
-         task_n,
-         test_case.story,
-         agent_name=test_case.agent,
-         starting_user_input=test_case.starting_sentence,
-     )
-     result = list()
-     for message in history:
-         result.append(message.model_dump())
-
-     json_dump(
-         os.path.join(config.output_dir, "messages", tc_name + ".messages.json"), result
-     )
-
-     if len(conversational_search_data) > 0:
-         fn = tc_name + ".retrieval_context.json"
-         out_folder = Path(config.output_dir) / "knowledge_base_metrics"
-         out_folder.mkdir(exist_ok=True)
-         rc = [context.model_dump() for context in conversational_search_data]
-         json_dump(out_folder / fn, rc)
-
-     # If data annotation run, skip summary generation
-     if config.data_annotation_run:
-         return summary_results_for_path  # empty result set, skip summary
-
-     evaluation_package = EvaluationPackage(
-         test_case_name=tc_name,
-         messages=history,
-         ground_truth=test_case,
-         conversational_search_data=conversational_search_data,
-         resource_map=resource_map
-     )
-     (
-         keyword_semantic_matches,
-         knowledge_base_metrics,
-         messages_with_reason,
-         metrics,
-     ) = evaluation_package.generate_summary()
-     temp = []
-     for message in messages_with_reason:
-         temp.append(message.model_dump())
-     json_dump(
-         os.path.join(config.output_dir, "messages", tc_name + ".messages.analyze.json"),
-         temp,
-     )
+ def main(config: TestConfig):
+     # setup
+     clients = bootstrap_clients(config)
+     if not getattr(config, "skip_available_results", False):
+         ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+         config.output_dir = os.path.join(config.output_dir, ts)
+
+     if not config.skip_legacy_evaluation:
+         knowledge_base_output_folder = (
+             Path(config.output_dir) / "knowledge_base_metrics"
+         )
+         knowledge_base_output_folder.mkdir(exist_ok=True, parents=True)
+         detailed_rag_output_file = (
+             knowledge_base_output_folder / "knowledge_base_detailed_metrics.json"
+         )
+         summary_rag_output_file = (
+             Path(config.output_dir) / "knowledge_base_summary_metrics.json"
+         )
+         os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)

-     json_dump(
-         os.path.join(config.output_dir, "messages", tc_name + ".metrics.json"),
-         metrics.model_dump(),
+     # discover & schedule tests
+     test_cases = discover_tests(
+         config.test_paths, config.enable_recursive_search
      )
-
-     metrics.dataset_name = tc_name
-     metrics.avg_resp_time = (
-         sum(call_tracker.generic) + sum(call_tracker.tool_call)
-     ) / (len(call_tracker.generic) + len(call_tracker.tool_call))
-     metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
-
-     summary_results_for_path.append((metrics, knowledge_base_metrics))
-
-     return summary_results_for_path
-
-
- def main(config: TestConfig):
-     executor = ThreadPoolExecutor(max_workers=config.num_workers)
-     wxo_client = get_wxo_client(
-         config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
+     jobs = enumerate_jobs(
+         test_cases,
+         config.n_runs,
+         config.skip_available_results,
+         config.output_dir,
      )
-     resource_map = ResourceMap(wxo_client)
-     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
-     llm_user = LLMUser(
-         wai_client=get_provider(config=config.provider_config, model_id=config.llm_user_config.model_id),
-         template=LlamaUserTemplateRenderer(config.llm_user_config.prompt_config),
-         user_response_style=config.llm_user_config.user_response_style,
+     results = run_jobs(
+         jobs, config, clients, process_test_case, config.num_workers
      )

-     print(f"Running evaluation with tenant {config.auth_config.tenant_name}")
+     # extract
+     tool_metrics, kb_summary, custom_metrics = extract_metrics(results)

-     results_list = []
-
-     knowledge_base_output_folder = Path(config.output_dir) / "knowledge_base_metrics"
-     knowledge_base_output_folder.mkdir(exist_ok=True, parents=True)
-     detailed_rag_output_file = (
-         knowledge_base_output_folder / "knowledge_base_detailed_metrics.json"
-     )
-     summary_rag_output_file = (
-         Path(config.output_dir) / "knowledge_base_summary_metrics.json"
-     )
-
-     os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)
-     available_res = set()
-     if config.skip_available_results:
-         available_res = set(
-             [
-                 os.path.basename(f).replace(".messages", "")
-                 for f in glob.glob(
-                     os.path.join(config.output_dir, "messages", "*.messages.json")
-                 )
-             ]
+     if not config.skip_legacy_evaluation:
+         # write results
+         csv_dump(
+             pathlib.Path(config.output_dir) / "summary_metrics.csv",
+             rows=[metric.model_dump() for metric in tool_metrics],
          )
-
-     test_cases = []
-     for test_path in config.test_paths:
-         if os.path.isdir(test_path):
-             test_path = os.path.join(test_path, "*.json")
-         test_cases.extend(sorted(glob.glob(test_path)))
-
-     futures = []
-     task_n = 0
-     for test_case in test_cases:
-         if not test_case.endswith(".json") or test_case.endswith("agent.json"):
-             continue
-         if config.skip_available_results:
-             if test_case in available_res:
-                 print(f"Skipping test case {test_case} as results already exist.")
-                 continue
-
-         future = executor.submit(
-             process_test_case,
-             task_n,
-             test_case,
-             config,
-             inference_backend,
-             resource_map,
-             llm_user,
+         for file_path, key in [
+             (detailed_rag_output_file, "detailed"),
+             (summary_rag_output_file, "summary"),
+         ]:
+             with open(file_path, "w+", encoding="utf-8") as f:
+                 json.dump(kb_summary.model_dump(by_alias=True)[key], f, indent=4)
+
+         # print results
+         SummaryPanel(kb_summary).print()
+         tool_table = create_table(
+             format_metrics_for_display(tool_metrics), title="Agent Metrics"
+         )
+         if tool_table:
+             tool_table.print()
+         if any(cm.custom_metrics for cm in custom_metrics):
+             rows = []
+             for cm in custom_metrics:
+                 row = {"dataset_name": cm.dataset_name}
+                 for m in cm.custom_metrics:
+                     row[m.eval_name] = str(
+                         m.value
+                     )  # Convert to string to avoid type issues
+                 rows.append(row)
+             custom_metrics_table = create_table(rows, title="Custom Metrics")
+             if custom_metrics_table:
+                 custom_metrics_table.print()
+     else:
+         collection_name = os.path.basename(config.output_dir) + "_collection"
+         collection = LangfuseCollection(
+             name=collection_name,
+             description="",
+         )
+         dataset_paths = []
+         session_ids = []
+         for test_case in test_cases:
+             name = os.path.basename(test_case).replace(".json", "")
+             with open(os.path.join(config.output_dir, f"{name}.metadata.json"), "r") as f:
+                 metadata = json.load(f)
+             session_id = metadata["thread_id"]
+             dataset_paths.append(test_case)
+             session_ids.append(session_id)
+
+         collection.upload(paths=dataset_paths)
+
+         langfuse_collection = LangfuseCollection(name=collection_name)
+
+         journey_sucess_metric = JourneySuccessMetric()
+         tool_calling = ToolCalling()
+
+         run = EvaluationRunner(
+             evaluation_name=os.path.basename(config.output_dir) + "_evaluation",
+             run_name=os.path.basename(config.output_dir) + "_run",
+             session_ids=session_ids,
+             collection=langfuse_collection,
+             metrics=[journey_sucess_metric, tool_calling],
+             aggregator=sample_aggregator
          )

-         futures.append((test_case, future))
-         task_n += 1
-
-     if futures:
-         with Progress() as progress:
-             task1 = progress.add_task(
-                 f"[purple]Evaluating {len(futures)} tasks...", total=len(futures)
-             )
-             for test_case, future in futures:
-                 try:
-                     results_list.extend(future.result())
-                 except Exception as e:
-                     rich.print(f"test case {test_case} fails with {e}")
-                     traceback.print_exc()
-                 finally:
-                     progress.update(task1, advance=1)
-
-     tool_call_metrics = [metric[0] for metric in results_list]
-     knowledge_base_metrics = [metric[1] for metric in results_list]
-
-     rag_metric_summary = KnowledgeBaseMetricSummary(
-         knowledge_base_metrics=knowledge_base_metrics
-     )
-     SummaryPanel(rag_metric_summary).print()
-
-     with open(detailed_rag_output_file, "w+", encoding="utf-8") as f:
-         json.dump(rag_metric_summary.model_dump(by_alias=True)["detailed"], f, indent=4)
-
-     with open(summary_rag_output_file, "w+", encoding="utf-8") as f:
-         json.dump(rag_metric_summary.model_dump(by_alias=True)["summary"], f, indent=4)
-
-     if len(tool_call_metrics) > 0:
-         # remove the average row if exist
-         tool_call_metrics = [
-             row for row in tool_call_metrics if row.dataset_name != "Summary (Average)"
-         ]
-
-         def filter_display_only_values(tool_call_metric: ToolCallAndRoutingMetrics):
-             row = {"Dataset": tool_call_metric.dataset_name, "Total Steps": tool_call_metric.total_steps,
-                 "LLM Steps": tool_call_metric.llm_step, "Total Tool Calls": tool_call_metric.total_tool_calls, "Tool Call Precision": tool_call_metric.tool_call_precision, "Tool Call Recall": tool_call_metric.tool_call_recall,
-                 "Agent Routing Accuracy": tool_call_metric.agent_routing_accuracy, "Text Match": tool_call_metric.text_match, "Journey Success": tool_call_metric.is_success, "Avg Resp Time (sec)": tool_call_metric.avg_resp_time}
-             return row
-
-         def create_avg_row(metrics: List[dict]):
-             avg_row = {"Dataset": "Summary (Average)", "Total Steps": 0,
-                 "LLM Steps": 0, "Total Tool Calls": 0, "Tool Call Precision": 0, "Tool Call Recall": 0, "Agent Routing Accuracy": 0,
-                 "Text Match": 0, "Journey Success": 0, "Avg Resp Time (sec)": 0}
-             if metrics:
-                 for row in metrics:
-                     avg_row["Total Steps"] += row["Total Steps"]
-                     avg_row["LLM Steps"] += row["LLM Steps"]
-                     avg_row["Total Tool Calls"] += row["Total Tool Calls"]
-                     avg_row["Tool Call Precision"] += row["Tool Call Precision"]
-                     avg_row["Tool Call Recall"] += row["Tool Call Recall"]
-                     avg_row["Agent Routing Accuracy"] += row["Agent Routing Accuracy"]
-                     avg_row["Text Match"] += row["Text Match"] == TextMatchType.text_match.value
-                     avg_row["Journey Success"] += row["Journey Success"]
-                     avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
-
-                 avg_row["Total Steps"] = round(safe_divide(avg_row["Total Steps"], len(metrics)), 2)
-                 avg_row["LLM Steps"] = round(safe_divide(avg_row["LLM Steps"], len(metrics)), 2)
-                 avg_row["Total Tool Calls"] = round(safe_divide(avg_row["Total Tool Calls"], len(metrics)), 2)
-                 avg_row["Tool Call Precision"] = round(safe_divide(avg_row["Tool Call Precision"], len(metrics)), 2)
-                 avg_row["Tool Call Recall"] = round(safe_divide(avg_row["Tool Call Recall"], len(metrics)), 2)
-                 avg_row["Agent Routing Accuracy"] = round(safe_divide(avg_row["Agent Routing Accuracy"], len(metrics)), 2)
-                 avg_row["Text Match"] = round(safe_divide(avg_row["Text Match"], len([row for row in metrics if row["Text Match"] != TextMatchType.text_match.na])), 2)
-                 avg_row["Journey Success"] = round(safe_divide(avg_row["Journey Success"], len(metrics)), 2)
-                 avg_row["Avg Resp Time (sec)"] = round(safe_divide(avg_row["Avg Resp Time (sec)"], len(metrics)), 2)
-             return avg_row
-
-         tool_call_metrics_for_display = []
-         for row in tool_call_metrics:
-             tool_call_metrics_for_display.append(filter_display_only_values(row))
-         tool_call_metrics_for_display.append(create_avg_row(tool_call_metrics_for_display))
-         tool_call_table_for_display = create_table(tool_call_metrics_for_display)
-
-         if tool_call_table_for_display:
-             tool_call_table_for_display.print()
-
-     if len(tool_call_metrics) > 0:
-         tool_call_metrics = [metric.model_dump() for metric in tool_call_metrics]
-         output_file = os.path.join(config.output_dir, "summary_metrics.csv")
-         header = list(tool_call_metrics[0].keys())
-
-         with open(output_file, "w") as file:
-             csv_writer = csv.writer(file)
-             csv_writer.writerow(header)
-             for entry in tool_call_metrics:
-                 csv_writer.writerow([entry[name] for name in header])
+         run.evaluate()

+     # persist config
      with open(
-         os.path.join(config.output_dir, "config.yml"), "w", encoding="utf-8"
+         pathlib.Path(config.output_dir) / "config.yml", "w", encoding="utf-8"
      ) as f:
          yaml.safe_dump(dataclasses.asdict(config), f)

-     print(f"Results saved to {config.output_dir}")
+     if not config.skip_legacy_evaluation:
+         print(f"Results saved to {config.output_dir}")
+     else:
+         print(f"Config and metadata saved to {config.output_dir}")
+         print(f"Langfuse Evaluation run completed for collection {collection_name}:")
+         for session_id in session_ids:
+             print(f" - http://localhost:3010/project/orchestrate-lite/sessions/{session_id}")


  if __name__ == "__main__":
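
For orientation, the rewritten main() delegates test discovery, scheduling, execution, and metric extraction to the new clients, scheduler, runner, and metrics modules. The sketch below shows one way the module might be driven programmatically; it assumes TestConfig can be constructed with only the fields referenced in this hunk (the real dataclass in arg_configs.py likely also carries auth, provider, and LLM-user settings), so treat it as an illustration rather than the package's documented entry point.

    from wxo_agentic_evaluation.arg_configs import TestConfig
    from wxo_agentic_evaluation.main import main

    # Field names are taken from the hunk above; all other fields are assumed to default.
    config = TestConfig(
        test_paths=["./tests/cases"],    # directories are expanded by discover_tests()
        output_dir="./results",          # a timestamped subfolder is appended at startup
        num_workers=4,
        skip_legacy_evaluation=False,    # True switches to the Langfuse evaluation path
    )
    main(config)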
wxo_agentic_evaluation/metrics/__init__.py (new file, +15 -0)
@@ -0,0 +1,15 @@
+ from wxo_agentic_evaluation.metrics.evaluations import Evaluation
+ from wxo_agentic_evaluation.metrics.metrics import (
+     Annotation,
+     FailedSemanticTestCases,
+     FailedStaticTestCases,
+ )
+
+ def argument_matching(expected, actual):
+     if actual is None:
+         return False
+     for field in actual:
+         if field not in expected:
+             return False
+
+     return True
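
Note that argument_matching, as written above, is a key-subset check: it returns True when every argument key in the actual tool call also appears in the expected arguments, and False when the call carried no arguments at all; values are not compared. A small illustration (the argument names are made up):

    from wxo_agentic_evaluation.metrics import argument_matching

    expected = {"employee_id": "E123", "date": "2024-01-01"}
    argument_matching(expected, {"employee_id": "E999"})  # True: keys are a subset, values ignored
    argument_matching(expected, {"manager_id": "M1"})     # False: unexpected key
    argument_matching(expected, None)                     # False: no arguments captured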
wxo_agentic_evaluation/metrics/dummy_metric.py (new file, +16 -0)
@@ -0,0 +1,16 @@
+ from langfuse.api.resources.commons.types.score_data_type import ScoreDataType
+
+ from wxo_agentic_evaluation.metrics import Evaluation
+ from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric
+
+ class DummyMetric(Evaluation):
+     def __init__(self, llm_client = None):
+         super().__init__(llm_client)
+
+     def evaluate(self, messages, ground_truth, extracted_context, metadata = ..., **kwargs):
+         return LangfuseMetric(
+             eval_name="dummy_metric",
+             value=True,
+             metadata=metadata,
+             data_type="BOOLEAN",
+         )
wxo_agentic_evaluation/metrics/evaluations.py (new file, +107 -0)
@@ -0,0 +1,107 @@
+ import os
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, Optional
+
+ from wxo_agentic_evaluation.metrics.metrics import Metric
+ from wxo_agentic_evaluation.prompt.template_render import LLMaaJTemplateRenderer
+ from wxo_agentic_evaluation.service_provider.provider import Provider
+ from wxo_agentic_evaluation.type import Message, OrchestrateDataset
+ from wxo_agentic_evaluation.utils.messages_parser import ParsedMessages
+
+ root_dir: str = os.path.dirname(os.path.dirname(__file__))
+ LLMAAJ_PROMPT_PATH = os.path.join(root_dir, "prompt", "llmaaj_prompt.jinja2")
+
+
+ class Evaluation(ABC):
+     """Abstract base class for all evaluations."""
+
+     def __init__(self, llm_client: Optional[Provider] = None) -> None:
+         self._llm_client = llm_client
+
+     @property
+     def llm_client(self) -> Any:
+         """Access client, require it if used."""
+         if self._llm_client is None:
+             raise RuntimeError(
+                 f"{self.__class__.__name__} requires a client, but none was provided"
+             )
+         return self._llm_client
+
+     @property
+     @abstractmethod
+     def name(self) -> str:
+         """Unique name for the evaluator."""
+         raise NotImplementedError
+
+     @abstractmethod
+     def evaluate(
+         self,
+         messages: list[Message],
+         ground_truth: OrchestrateDataset,
+         extracted_context: Dict[str, Any],
+     ) -> Optional[Metric]:
+         """
+         Evaluation method.
+
+         Args:
+             messages: agent and user conversational messages (includes tool calls)
+             ground_truth: ground truth data
+             extracted_context: dictionary containing data derived from the messages
+
+         Returns:
+             Metic
+         """
+         raise NotImplementedError
+
+
+ class LLMaaJEvaluation(Evaluation, ABC):
+     """Evaluation metric for LLMaaJ."""
+
+     @property
+     @abstractmethod
+     def llmaaj_instructions(self) -> str:
+         """LLMaaJ instructions for the evaluator."""
+         raise NotImplementedError
+
+     @abstractmethod
+     def format_llm_output(self, string: str) -> int | float | bool | str:
+         """Format the output of the LLMaaJ query."""
+         raise NotImplementedError
+
+     @property
+     def selected_context_keys(self) -> set[str]:
+         """Override to implement context keys to pass to the prompt."""
+         return set()
+
+     def select_context(
+         self, extracted_context: Dict[str, Any]
+     ) -> dict[str, Any]:
+         """Additional context to be added to the prompt."""
+         selected_context = {
+             key: value
+             for key, value in extracted_context.items()
+             if key in self.selected_context_keys
+         }
+
+         return selected_context
+
+     def evaluate(
+         self,
+         messages: list[Message],
+         ground_truth: OrchestrateDataset,
+         extracted_context: Dict[str, Any],
+     ) -> Optional[Metric]:
+         renderer = LLMaaJTemplateRenderer(LLMAAJ_PROMPT_PATH)
+         parsed = ParsedMessages(messages=messages)
+         if parsed.user_input is None or parsed.agent_response is None:
+             return None
+         context = str(self.select_context(extracted_context))
+         prompt = renderer.render(
+             user_input=parsed.user_input,
+             agent_answer=parsed.agent_response,
+             llmaaj_instructions=self.llmaaj_instructions,
+             context=context,
+         )
+         score_str = self.llm_client.query(prompt)
+         value = self.format_llm_output(score_str)
+         return Metric(eval_name=self.name, value=value)
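
The two classes above define the extension points for custom judges: a concrete subclass supplies a name, the judge instructions, and a parser for the raw model output, while the base class handles prompt rendering and the client call. A hedged sketch of such a subclass follows; the class name, instruction text, and parsing rule are illustrative and not part of the package.

    from wxo_agentic_evaluation.metrics.evaluations import LLMaaJEvaluation

    class AnswerCompleteness(LLMaaJEvaluation):  # hypothetical evaluator
        @property
        def name(self) -> str:
            return "answer_completeness"

        @property
        def llmaaj_instructions(self) -> str:
            return "Answer 1 if the agent response fully addresses the user input, otherwise 0."

        @property
        def selected_context_keys(self) -> set[str]:
            # forwarded to the prompt via select_context(); the key name is assumed
            return {"labeled_messages"}

        def format_llm_output(self, string: str) -> bool:
            return string.strip().startswith("1")

An instance would be constructed with a Provider client (the base class raises RuntimeError on first use otherwise); its inherited evaluate() then renders llmaaj_prompt.jinja2 and queries that client.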
wxo_agentic_evaluation/metrics/journey_success.py (new file, +137 -0)
@@ -0,0 +1,137 @@
+ import json
+ from collections import defaultdict
+
+ from langfuse.api.resources.commons.types.score_data_type import ScoreDataType
+
+ from wxo_agentic_evaluation.metrics import Evaluation, argument_matching
+ from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric
+
+ ## fix later
+ from wxo_agentic_evaluation.otel_parser.parser_types import (
+     Message as OtelMessage,
+ )
+
+ """
+ - hyphens are not allowed in python function names, so it is safe to use as a dummy function name
+ - purpose behind `DUMMY_GRAPH_NODE_NAME` is to append
+   a dummy node to the ground truth and the labelled messages to take into account
+   single, summary step goals.
+ """
+ DUMMY_GRAPH_NODE_NAME = "dummy-goal"
+
+
+ class JourneySuccessMetric(Evaluation):
+     def __init__(self, llm_client=None):
+         super().__init__(llm_client)
+         self.is_strict = True
+
+     @property
+     def name(self):
+         return "Journey Success"
+
+     def find_terminal_nodes(self, graph: dict[str, list[str]]) -> set[str]:
+         """Finds terminal nodes (nodes with no outgoing edges).
+
+         Args:
+             graph: the input graph
+
+         Returns:
+             a set of the terminal nodes
+         """
+
+         seen_nodes = set()  # track seen nodes
+         non_terminal_nodes = set()  # track nodes with children
+
+         for node in graph:
+             seen_nodes.add(node)
+             if graph[node]:
+                 non_terminal_nodes.add(node)
+                 for n in graph[node]:
+                     seen_nodes.add(n)
+         return seen_nodes - non_terminal_nodes
+
+     def is_topological_sort(
+         self,
+         graph: dict[str, list[str]],
+         ordering: list[str],
+         is_strict: bool = True,
+     ) -> bool:
+         """Graph traversal to check if every node in `graph` is visited in `ordering` only after all its dependencies are visited.
+
+         Args:
+             graph: the graph representing the ground truth, where keys represent nodes and values represent its dependent nodes
+             ordering: the nodes visited, in order
+
+         Returns:
+             Boolean representing if `ordering` visits all nodes in a valid order based on the dependencies in graph.
+         """
+         # No keyword match or goal details were achieved
+         if not ordering:
+             return False
+
+         if is_strict:
+             # strict matching: only consider most recent tool call
+             position = {node: [i] for i, node in enumerate(ordering)}
+         else:
+             # lenient matching: consider all tool calls (account for all indexes of the node)
+             position = defaultdict(list)
+             for i, node in enumerate(ordering):
+                 position[node].append(i)
+
+         terminal_nodes = self.find_terminal_nodes(graph)
+         # adds a dummy node for each terminal node
+         next_idx = (
+             max(val for values in position.values() for val in values) + 1
+         )
+
+         for n in terminal_nodes:
+             graph[n] = [DUMMY_GRAPH_NODE_NAME]
+             graph[DUMMY_GRAPH_NODE_NAME] = []
+             position[DUMMY_GRAPH_NODE_NAME] = [next_idx]
+             next_idx += 1
+
+         for node in graph:
+             for child_nodes in graph[node]:
+                 # Current node/children doesn't show up in made calls
+                 if node not in position or child_nodes not in position:
+                     return False
+                 # Current node doesn't show up before any of its child
+                 # all index in current nodes are larger than every child nodes' index
+                 if all(
+                     curr >= max(position[child_nodes])
+                     for curr in position[node]
+                 ):
+                     return False
+         return True
+
+     def evaluate(
+         self, messages, ground_truth, extracted_context, metadata, **kwargs
+     ):
+         labeled_messages = extracted_context.get("labeled_messages")
+         correct_tool_calls = []
+
+         for message_idx, matching_goal_details in labeled_messages.items():
+             msg_tool_call = messages[message_idx]
+             msg_tool_call = msg_tool_call.tool_calls[0].function
+             for goal_detail in matching_goal_details:
+                 args_match = argument_matching(
+                     expected=goal_detail.args,
+                     actual=None if len(msg_tool_call.arguments) == 0 else json.loads(msg_tool_call.arguments),
+                 )
+
+                 if args_match:
+                     correct_tool_calls.append(goal_detail.name)
+
+         is_topological_sort = self.is_topological_sort(
+             graph=ground_truth.goals,
+             ordering=correct_tool_calls,
+             is_strict=self.is_strict,
+         )
+
+         return LangfuseMetric(
+             eval_name=self.name,
+             comment="",
+             value=is_topological_sort,
+             data_type="NUMERIC",
+             metadata=metadata,
+         )
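
To make the ordering check concrete: evaluate() collects the names of the goal tool calls whose arguments pass argument_matching, then asks is_topological_sort whether that sequence respects the dependency graph in ground_truth.goals (each terminal goal gets a trailing dummy-goal node appended first). A small worked example with made-up tool names, where the graph says get_timeoff_schedule must happen before create_timeoff_request:

    from wxo_agentic_evaluation.metrics.journey_success import JourneySuccessMetric

    metric = JourneySuccessMetric()
    goals = {"get_timeoff_schedule": ["create_timeoff_request"]}

    # Both goals achieved in dependency order -> True
    metric.is_topological_sort(dict(goals), ["get_timeoff_schedule", "create_timeoff_request"])

    # Dependency violated (request created before the schedule was fetched) -> False
    metric.is_topological_sort(dict(goals), ["create_timeoff_request", "get_timeoff_schedule"])

    # The method mutates the graph it is given (dummy nodes are added), hence the dict() copies.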