ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0

wxo_agentic_evaluation/langfuse_evaluation_package.py
@@ -0,0 +1,192 @@
+from collections import defaultdict
+from typing import Callable, List
+
+import rich
+from langfuse import get_client
+from langfuse.experiment import ExperimentResult
+
+from wxo_agentic_evaluation.langfuse_collection import LangfuseCollection
+from wxo_agentic_evaluation.metrics import Evaluation
+from wxo_agentic_evaluation.metrics.dummy_metric import DummyMetric
+from wxo_agentic_evaluation.metrics.journey_success import JourneySuccessMetric
+from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric
+from wxo_agentic_evaluation.metrics.tool_calling import ToolCalling
+from wxo_agentic_evaluation.otel_parser import parser as otel_parser
+from wxo_agentic_evaluation.otel_parser.parser_types import (
+    Message as OtelMessage,
+)
+from wxo_agentic_evaluation.type import (
+    ExperimentResult,
+    LangfuseDatasetModel,
+    _convert_to_langfuse_format,
+)
+
+from wxo_agentic_evaluation.extractors import ExtractLabeledMessages
+
+LANGFUSE_CLIENT = get_client()
+
+
+def upload(name, session_id, value, data_type, metadata):
+    try:
+        LANGFUSE_CLIENT.create_score(
+            name=name,
+            session_id=session_id,
+            value=value,
+            data_type=data_type,
+            metadata=metadata,
+        )
+    except Exception as e:
+        rich.print(
+            f"[r] Uploading {name} with value {value} failed with exception {e}"
+        )
+
+
+def sample_aggregator(session_results: List[List[Evaluation]]):
+    metric_names = [
+        "journey_success",
+        "total_tool_calls",
+        "correct_tool_calls",
+        "expected_tool_calls",
+        "tool_calls_with_incorrect_parameter",
+        "tool_call_recall",
+        "tool_call_precision",
+    ]
+    group_metrics = defaultdict(list)
+
+    for result in session_results:
+        for metric in result:
+            if metric["eval_name"] in metric_names:
+                group_metrics[metric["eval_name"]].append(
+                    {"value": metric["value"], "metadata": metric["metadata"]}
+                )
+
+    average_metric = []
+    for metric_name, values in group_metrics.items():
+        aggr = []
+        for value in values:
+            aggr.append(value.get("value"))
+
+        metric_value = LangfuseMetric(
+            eval_name=f"Average_{metric_name}",
+            value=round(sum(aggr) / len(aggr), 2),
+            metadata=values[0]["metadata"],
+        )
+        average_metric.append(metric_value)
+
+    return average_metric
+
+
+class EvaluationRunner:
+    def __init__(
+        self,
+        evaluation_name: str,
+        run_name: str,
+        session_ids: List[str],
+        collection: LangfuseCollection,
+        metrics: List[Evaluation],
+        aggregator: Callable,
+    ):
+        self.evaluation_name = evaluation_name
+        self.run_name = run_name
+
+        self.experiment_id = f"{self.evaluation_name}.{self.run_name}"
+
+        self.collection = collection
+        langfuse_dataset = LANGFUSE_CLIENT.get_dataset(self.collection.name)
+        self.test_cases: List[LangfuseDatasetModel] = []
+        for item in langfuse_dataset.items:
+            data_model = _convert_to_langfuse_format(item)
+            self.test_cases.append(data_model)
+
+        self.session_ids = session_ids
+        self.messages = [otel_parser.parse_session(id) for id in self.session_ids]
+
+        assert (
+            len(self.session_ids) == len(self.messages) == len(self.test_cases)
+        )
+
+        self.metrics = metrics
+        self.aggregator = aggregator
+
+    def evaluate(self):
+        metadata = {"experiment_id": self.experiment_id}
+
+        total_metrics = []
+        for idx, test_case in enumerate(self.test_cases):
+            metric_results = []
+            messages = self.messages[idx]
+            extracted_context = ExtractLabeledMessages.extract(messages, test_case)
+            for metric in self.metrics:
+                result = metric.evaluate(
+                    messages=messages,
+                    ground_truth=test_case,
+                    extracted_context=extracted_context,
+                    metadata=metadata
+                )
+                if isinstance(result, list):
+                    metric_results.extend([r.model_dump() for r in result])
+                    for r in result:
+                        upload(
+                            name=r.eval_name,
+                            session_id=self.session_ids[idx],
+                            value=r.value,
+                            data_type=r.data_type,
+                            metadata=r.metadata,
+                        )
+                else:
+                    metric_results.append(result.model_dump())
+                    upload(
+                        name=result.eval_name,
+                        session_id=self.session_ids[idx],
+                        value=result.value,
+                        data_type=result.data_type,
+                        metadata=result.metadata,
+                    )
+            total_metrics.append(metric_results)
+
+        aggregate_metrics = self.aggregator(total_metrics)
+        for metric in aggregate_metrics:
+            try:
+                LANGFUSE_CLIENT.create_score(
+                    name=metric.eval_name,
+                    value=metric.value,
+                    metadata=metric.metadata,
+                    data_type="NUMERIC",
+                    dataset_run_id=metric.metadata["experiment_id"],
+                )
+            except Exception as e:
+                rich.print(
+                    f"[r] Uploading {metric.name} with value {metric.value} failed with exception {e}"
+                )
+
+        return ExperimentResult(
+            experiment_name=self.evaluation_name,
+            run_id=self.run_name,
+            experiment_id=self.experiment_id,
+            metrics=total_metrics,
+            session_ids=self.session_ids
+        )
+
+
+if __name__ == "__main__":
+    collection_name = "HR AGENT DEMO"
+    langfuse_collection = LangfuseCollection(name=collection_name)
+    journey_sucess_metric = JourneySuccessMetric()
+    tool_calling = ToolCalling()
+
+    SESSION_ID = "agent-demo-session-id-NEW"
+
+    run = EvaluationRunner(
+        evaluation_name="sample_evaluation",
+        run_name="1",
+        session_ids=[
+            "agent-demo-session-id-NEW-0",
+            "agent-demo-session-id-NEW-1",
+        ],
+        collection=langfuse_collection,
+        metrics=[journey_sucess_metric, tool_calling],
+        aggregator=sample_aggregator,
+    )
+
+    experiment_results = run.evaluate()
+    rich.print(experiment_results.model_dump())
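
For orientation, a minimal self-contained sketch of the aggregation that sample_aggregator performs, using plain dicts in place of the package's LangfuseMetric objects; the toy session results below are illustrative only:

from collections import defaultdict

# Two sessions, each a list of metric dumps as produced by metric.model_dump()
session_results = [
    [{"eval_name": "journey_success", "value": 1.0, "metadata": {}},
     {"eval_name": "tool_call_recall", "value": 0.5, "metadata": {}}],
    [{"eval_name": "journey_success", "value": 0.0, "metadata": {}},
     {"eval_name": "tool_call_recall", "value": 1.0, "metadata": {}}],
]

grouped = defaultdict(list)
for session in session_results:
    for metric in session:
        grouped[metric["eval_name"]].append(metric["value"])

# Average each metric across sessions, mirroring the Average_<name> scores
averages = {f"Average_{name}": round(sum(vals) / len(vals), 2)
            for name, vals in grouped.items()}
print(averages)  # {'Average_journey_success': 0.5, 'Average_tool_call_recall': 0.75}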

wxo_agentic_evaluation/llm_matching.py
@@ -1,9 +1,22 @@
-
+"""
+LLM Matching Module with Cosine Similarity Support
+
+This module provides functionality for matching text using:
+1. LLM-based matching (using a language model to determine semantic equivalence)
+2. Embedding-based matching (using cosine similarity between text embeddings)
+"""
+
+import math
+from typing import List
+
+from fuzzywuzzy import fuzz
+
 from wxo_agentic_evaluation.prompt.template_render import (
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
 )
-from
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.utils.utils import safe_divide


 class LLMMatcher:
@@ -12,10 +25,18 @@ class LLMMatcher:
         llm_client: Provider,
         keyword_template: KeywordMatchingTemplateRenderer,
         semantic_template: SemanticMatchingTemplateRenderer,
+        use_llm_for_semantic: bool = True,
+        embedding_model_id: str = "sentence-transformers/all-minilm-l6-v2",
+        similarity_threshold: float = 0.8,
+        enable_fuzzy_matching: bool = False,
     ):
         self.llm_client = llm_client
         self.keyword_template = keyword_template
         self.semantic_template = semantic_template
+        self.embedding_model_id = embedding_model_id
+        self.use_llm_for_semantic = use_llm_for_semantic
+        self.similarity_threshold = similarity_threshold
+        self.enable_fuzzy_matching = enable_fuzzy_matching

     def keywords_match(self, response_text: str, keywords: List[str]) -> bool:
         if len(keywords) == 0:
@@ -26,14 +47,96 @@ class LLMMatcher:
         prompt = self.keyword_template.render(
             keywords_text=keywords_text, response_text=response_text
         )
-        output:str = self.llm_client.query(prompt)
+        output: str = self.llm_client.query(prompt)
         result = output.strip().lower()
         return result.startswith("true")

-    def
+    def generate_embeddings(
+        self, prediction: str, ground_truth: str
+    ) -> List[List[float]]:
+
+        embeddings = self.llm_client.encode([prediction, ground_truth])
+
+        return embeddings
+
+    def compute_cosine_similarity(
+        self, vec1: List[float], vec2: List[float]
+    ) -> float:
+        """Calculate cosine similarity between two vectors using pure Python"""
+
+        # Manual dot product calculation
+        dot_product = sum(a * b for a, b in zip(vec1, vec2))
+
+        # Manual magnitude calculations
+        magnitude1 = math.sqrt(sum(a * a for a in vec1))
+        magnitude2 = math.sqrt(sum(b * b for b in vec2))
+
+        return safe_divide(dot_product, (magnitude1 * magnitude2))
+
+    def cosine_similarity_semantic_match(
+        self, prediction: str, ground_truth: str
+    ) -> bool:
+        embeddings = self.generate_embeddings(prediction, ground_truth)
+        cosine_similarity = self.compute_cosine_similarity(
+            embeddings[0], embeddings[1]
+        )
+
+        return cosine_similarity >= self.similarity_threshold
+
+    def llm_semantic_match(
+        self, context, prediction: str, ground_truth: str
+    ) -> bool:
+        """Performs semantic matching for the agent's final response and the expected response using the starting sentence of the conversation as the context
+
+        Args:
+            context: The starting sentence of the conversation. TODO can also consider using the LLM user's story
+            prediction: the predicted string
+            ground_truth: the expected string
+
+        Returns:
+            a boolean indicating if the sentences match.
+        """
+
         prompt = self.semantic_template.render(
-            expected_text=ground_truth, actual_text=prediction
+            context=context, expected_text=ground_truth, actual_text=prediction
         )
         output: str = self.llm_client.query(prompt)
         result = output.strip().lower()
+
         return result.startswith("true")
+
+    def fuzzywuzzy_semantic_match(
+        self, prediction: str, ground_truth: str
+    ) -> bool:
+
+        similarity_score = fuzz.WRatio(prediction, ground_truth)
+
+        return similarity_score > self.similarity_threshold
+
+    def semantic_match(
+        self,
+        context: str,
+        prediction: str,
+        ground_truth: str,
+        enable_fuzzy_matching: bool = False,
+    ) -> bool:
+        ## TODO arjun-gupta1 10/06/2025: revist retry with exponential backoff. Opted for direct fallback to cosine similarity to avoid latency for now.
+        try:
+            return self.llm_semantic_match(context, prediction, ground_truth)
+        except Exception as e:
+            print(f"LLM semantic match failed: {e}")
+
+        if enable_fuzzy_matching:
+            print("falling back to fuzzy matching")
+            # Fallback to cosine similarity if LLM matching is not used or failed
+            try:
+                return self.cosine_similarity_semantic_match(
+                    prediction, ground_truth
+                )
+            except Exception as e:
+                print(
+                    f"Cosine similarity failed: {e}. Falling back to fuzzywuzzy."
+                )
+
+            # Final fallback to fuzzywuzzy
+            return self.fuzzywuzzy_semantic_match(prediction, ground_truth)
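
The embedding fallback above comes down to a cosine-similarity threshold check; a minimal standalone sketch, assuming toy embedding vectors in place of the provider's encode() call and inlining a zero-denominator guard where the package uses safe_divide:

import math

def cosine_similarity(vec1, vec2):
    # Same arithmetic as LLMMatcher.compute_cosine_similarity, with the
    # zero-magnitude guard inlined instead of the package's safe_divide.
    dot_product = sum(a * b for a, b in zip(vec1, vec2))
    magnitude1 = math.sqrt(sum(a * a for a in vec1))
    magnitude2 = math.sqrt(sum(b * b for b in vec2))
    denom = magnitude1 * magnitude2
    return dot_product / denom if denom else 0.0

# Toy embeddings standing in for llm_client.encode([prediction, ground_truth])
prediction_vec = [0.9, 0.1, 0.3]
ground_truth_vec = [0.8, 0.2, 0.4]
similarity = cosine_similarity(prediction_vec, ground_truth_vec)
print(similarity >= 0.8)  # True when the pair clears the 0.8 default threshold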

wxo_agentic_evaluation/llm_rag_eval.py
@@ -1,12 +1,15 @@
-from typing import List
 import json
+from typing import List

-from wxo_agentic_evaluation.
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerRelevancy,
+    Faithfulness,
+)
 from wxo_agentic_evaluation.prompt.template_render import (
-    FaithfulnessTemplateRenderer,
     AnswerRelevancyTemplateRenderer,
+    FaithfulnessTemplateRenderer,
 )
-from wxo_agentic_evaluation.
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider


 class LLMJudge:

wxo_agentic_evaluation/llm_safety_eval.py
@@ -0,0 +1,64 @@
+import json
+import re
+
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerDerailment,
+    AnswerUnsafeTopic,
+)
+from wxo_agentic_evaluation.prompt.template_render import (
+    DerailmentTemplateRenderer,
+    UnsafeTopicTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+
+
+class LLMSafetyJudge:
+    def __init__(
+        self,
+        llm_client: Provider,
+        answer_derailment: DerailmentTemplateRenderer,
+        answer_unsafe_topic: UnsafeTopicTemplateRenderer,
+    ):
+        self.llm_client = llm_client
+        self.answer_derailment_template = answer_derailment
+        self.answer_unsafe_topic_template = answer_unsafe_topic
+
+    def judge_derailment_in_answer(
+        self, question: str, instructions: str, answer: str
+    ) -> AnswerDerailment:
+        prompt = self.answer_derailment_template.render(
+            question=question, instructions=instructions, answer=answer
+        )
+        output = self.llm_client.query(prompt)
+        result = output.strip().lower()
+        try:
+            json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
+            thejson = json.loads(json_str)
+            answer_derailment = AnswerDerailment.model_validate(thejson)
+        except Exception:
+            answer_derailment = AnswerDerailment(
+                in_scope="not sure", statement=answer, reason=result
+            )
+        return answer_derailment
+
+    def judge_unsafe_topic_in_answer(
+        self, question: str, instructions: str, answer: str
+    ) -> AnswerUnsafeTopic:
+        """
+        Use LLM to judge if the answer contains unsafe topics.
+        """
+        prompt = self.answer_unsafe_topic_template.render(
+            question=question, instructions=instructions, answer=answer
+        )
+        output = self.llm_client.query(prompt)
+        result = output.strip().lower()
+        try:
+            json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
+            thejson = json.loads(json_str)
+            answer_unsafe = AnswerUnsafeTopic.model_validate(thejson)
+        except Exception:
+            answer_unsafe = AnswerUnsafeTopic(
+                is_safe="not sure", statement=answer, reason=result
+            )
+
+        return answer_unsafe
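
Both judge methods share the same extract-then-validate pattern on raw LLM output; a small standalone sketch of that pattern, substituting a plain dict fallback for the package's pydantic model_validate step:

import json
import re

def parse_judge_output(output: str, answer: str) -> dict:
    # Pull the first {...} block out of the raw LLM text, as the judge does.
    result = output.strip().lower()
    try:
        json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
        return json.loads(json_str)
    except Exception:
        # Fall back to a "not sure" verdict when no valid JSON is found.
        return {"in_scope": "not sure", "statement": answer, "reason": result}

print(parse_judge_output('Sure: {"in_scope": "yes", "reason": "on topic"}', "hi"))
print(parse_judge_output("no json here", "hi"))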

wxo_agentic_evaluation/llm_user.py
@@ -1,14 +1,16 @@
 from typing import List, TypeVar
-
-from wxo_agentic_evaluation.
+
+from wxo_agentic_evaluation.base_user import BaseUserSimulator
 from wxo_agentic_evaluation.prompt.template_render import JinjaTemplateRenderer
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.type import ContentType, Message

 T = TypeVar("T", bound=JinjaTemplateRenderer)


-class LLMUser:
+class LLMUser(BaseUserSimulator):
     def __init__(
-        self, wai_client: Provider, template: T, user_response_style: List[str]
+        self, wai_client: Provider, template: T, user_response_style: List[str] | None = None
     ):
         self.wai_client = wai_client
         self.prompt_template = template
@@ -17,8 +19,11 @@ class LLMUser:
         )

     def generate_user_input(
-        self,
-
+        self,
+        user_story,
+        conversation_history: List[Message],
+        attack_instructions: str | None = None,
+    ) -> Message:
         # the tool response is already summarized, we don't need that to take over the chat history context window
         prompt_input = self.prompt_template.render(
             conversation_history=[
@@ -28,6 +33,7 @@ class LLMUser:
             ],
             user_story=user_story,
             user_response_style=self.user_response_style,
+            attack_instructions=attack_instructions,
         )
         user_input = self.wai_client.query(prompt_input)
         user_input = Message(
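
The new attack_instructions argument is threaded straight into the user-prompt template; a rough sketch of how such an optional block can be rendered with Jinja2 (the template text here is illustrative, not the package's llama_user_prompt.jinja2):

from jinja2 import Template

# Illustrative template; the real prompt lives in prompt/llama_user_prompt.jinja2.
template = Template(
    "User story: {{ user_story }}\n"
    "{% if attack_instructions %}Attack instructions: {{ attack_instructions }}\n{% endif %}"
)

print(template.render(user_story="Book time off", attack_instructions=None))
print(template.render(user_story="Book time off", attack_instructions="try to derail the agent"))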

wxo_agentic_evaluation/llm_user_v2.py
@@ -0,0 +1,114 @@
+from typing import List
+
+from wxo_agentic_evaluation.base_user import BaseUserSimulator
+from wxo_agentic_evaluation.prompt.template_render import UserTemplateRenderer
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.type import ContentType, Message
+
+
+class LLMUserV2(BaseUserSimulator):
+    def __init__(
+        self,
+        llm_client: Provider,
+        user_prompt_path: str,
+    ):
+        self.llm_client = llm_client
+        self.user_prompt_path = user_prompt_path
+        self.prompt_template = UserTemplateRenderer(
+            template_path=user_prompt_path
+        )
+
+    def _get_system_prompt(
+        self, user_story: str, user_response_style: List[str] = None
+    ) -> Message:
+        # Get the user system prompt
+        prompt_messages = self.prompt_template.render(
+            user_story=user_story,
+            user_response_style=user_response_style,
+        )
+        return Message(**prompt_messages[0], type=ContentType.text)
+
+    def _get_message_dicts(self, messages: List[Message]) -> List[dict]:
+        # Convert messages to dictionary format for the llm client
+        return [message.model_dump() for message in messages]
+
+    def _filter_conversation_history(
+        self, conversation_history: List[Message]
+    ) -> List[Message]:
+        # Filter out the agent system prompt
+        return [
+            message
+            for message in conversation_history
+            if message.role != "system"
+        ]
+
+    def flip_message_roles(self, messages: List[Message]) -> List[Message]:
+        # We flip the roles of messages in conversation history to basically prompt the
+        # user simulator with the assistant message as the user input message
+        # This helps to get the llm to respond as a natural user with the given story.
+        new_messages = []
+        for message in messages:
+            if message.role == "user":
+                new_messages.append(
+                    Message(
+                        role="assistant",
+                        content=message.content,
+                        type=ContentType.text,
+                    )
+                )
+            else:
+                new_messages.append(
+                    Message(
+                        role="user",
+                        content=message.content,
+                        type=ContentType.text,
+                    )
+                )
+        return new_messages
+
+    def generate_user_input(
+        self,
+        user_story: str,
+        conversation_history: List[Message],
+        user_response_style: List[str] = None,
+        starting_user_input: Message = None,
+        **kwargs,
+    ) -> Message:
+        # Get the user system prompt
+        system_prompt = self._get_system_prompt(user_story, user_response_style)
+
+        conversation_history = self._filter_conversation_history(
+            conversation_history
+        )
+
+        ## Adding dummy message if not provided from the simulation side.
+        if len(conversation_history) == 0:
+            conversation_history.append(
+                Message(
+                    role="assistant",
+                    content="Hi! How can I help you today?",
+                    type=ContentType.text,
+                )
+            )
+
+        conversation_history = self.flip_message_roles(conversation_history)
+
+        # build the conversation history with the system prompt
+        messages = [system_prompt] + conversation_history
+
+        if starting_user_input is not None:
+            # If starting user input is provided, return it as is for the initial turn
+            return starting_user_input
+        else:
+
+            # Get response from LLM for simulation
+            response = self.llm_client.chat(
+                messages=self._get_message_dicts(messages)
+            )
+            response_message = Message(
+                role="user",
+                content=response.choices[0].message.content,
+                type=ContentType.text,
+            )
+
+            return response_message
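
The role flip in LLMUserV2 is what lets a chat model act as the simulated user: the agent's assistant turns are presented back to it as user turns. A minimal sketch with plain dicts standing in for the package's Message type:

def flip_roles(messages):
    # Swap user/assistant so the simulator sees agent replies as its input.
    flipped = []
    for m in messages:
        role = "assistant" if m["role"] == "user" else "user"
        flipped.append({"role": role, "content": m["content"]})
    return flipped

history = [
    {"role": "assistant", "content": "Hi! How can I help you today?"},
    {"role": "user", "content": "I want to book vacation days."},
]
print(flip_roles(history))
# [{'role': 'user', ...}, {'role': 'assistant', ...}]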