ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
- wxo_agentic_evaluation/analyze_run.py +1025 -220
- wxo_agentic_evaluation/annotate.py +2 -2
- wxo_agentic_evaluation/arg_configs.py +60 -2
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +19 -2
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +25 -7
- wxo_agentic_evaluation/description_quality_checker.py +29 -6
- wxo_agentic_evaluation/evaluation.py +16 -8
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +414 -69
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +5 -4
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +112 -343
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
- wxo_agentic_evaluation/metrics/metrics.py +276 -8
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
- wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +103 -4
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +33 -17
- wxo_agentic_evaluation/record_chat.py +38 -32
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
- wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +26 -17
- wxo_agentic_evaluation/service_provider/__init__.py +145 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/type.py +185 -16
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +313 -9
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/external_agent/external_validate.py:

@@ -41,15 +41,15 @@ class ExternalAgentValidation:
         data = b""
         for chunk in resp:
             for line in chunk.splitlines(True):
-                if line.startswith(b"
-
-                if line.strip() == b"[DONE]":
-                    return
+                if line.startswith(b"event:"):
+                    continue
                 data += line
                 if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                     # NOTE: edge case, "data" can be sent in two different chunks
                     if data.startswith(b"data:"):
                         data = data.replace(b"data:", b"")
+                    if data.strip() == b"[DONE]":
+                        return
                     yield data
                     data = b""
         if data:
@@ -74,7 +74,9 @@ class ExternalAgentValidation:
         payload = {"stream": True}
         payload["messages"] = messages
         resp = requests.post(
-            url=self.service_url,
+            url=self.service_url,
+            headers=self.header,
+            json=payload,
         )
         success, logged_events = self._validate_streaming_response(resp)
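The reworked stream parser above now skips `event:` lines up front and only checks for the `[DONE]` sentinel after the `data:` prefix has been stripped. A minimal, self-contained sketch of that parsing order (the `iter_sse_data` helper is hypothetical, not part of the package):

```python
# Hypothetical stand-alone sketch of the revised parsing order shown above:
# skip "event:" lines, strip the "data:" prefix, then check for "[DONE]".
def iter_sse_data(chunks):
    data = b""
    for chunk in chunks:
        for line in chunk.splitlines(True):
            if line.startswith(b"event:"):
                continue
            data += line
            if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                if data.startswith(b"data:"):
                    data = data.replace(b"data:", b"")
                if data.strip() == b"[DONE]":
                    return
                yield data
                data = b""


stream = [b"event: message\ndata: {\"delta\": \"hi\"}\n\n", b"data: [DONE]\n\n"]
print(list(iter_sse_data(stream)))  # [b' {"delta": "hi"}\n\n']
```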
wxo_agentic_evaluation/external_agent/types.py:

@@ -1,4 +1,4 @@
-from typing import Any, List, Literal, Mapping, Union
+from typing import Any, List, Literal, Mapping, Optional, Union

 from pydantic import BaseModel
@@ -46,7 +46,7 @@ class ThreadRunStepDeltaChoice(BaseModel):
 class BaseEventData(BaseModel):
     id: str
     object: str
-    thread_id: str
+    thread_id: Optional[str] = None
     model: str | None = None
     created: int | None = None
@@ -62,13 +62,7 @@ class ThreadRunStepDeltaData(BaseEventData):


 class UniversalData(BaseEventData):
-    object: Union[
-        Literal["thread.message.delta"],
-        Literal["thread.run.step.delta"],
-        Literal["thread.run.step.created"],
-        Literal["thread.run.step.completed"],
-    ]
-    choices: List[ThreadMessageDeltaChoice]
+    object: Optional[str]
     choices: List[Union[ThreadMessageDeltaChoice, dict]]
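With `thread_id` now optional and `object` relaxed to `Optional[str]`, events that omit those fields validate instead of raising. A small stand-in model (local to this sketch, mirroring only the fields shown in the hunk) illustrates the effect, assuming Pydantic v2 and Python 3.10+:

```python
from typing import Optional

from pydantic import BaseModel


class BaseEventData(BaseModel):  # local stand-in mirroring the diff, not the package class
    id: str
    object: str
    thread_id: Optional[str] = None
    model: str | None = None
    created: int | None = None


# Previously this raised a validation error because thread_id was required.
print(BaseEventData(id="evt-1", object="thread.run.step.created"))
```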
wxo_agentic_evaluation/extractors/extractor_base.py (new file):

@@ -0,0 +1,21 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+from wxo_agentic_evaluation.type import Message
+
+
+class Extractor(ABC):
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Unique name for the extractor."""
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def extract(
+        messages: list[Message],
+        **kwargs,
+    ) -> Any:
+        """Extract data from messages."""
+        raise NotImplementedError
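The new `Extractor` base class defines the contract that `ExtractLabeledMessages` (below) and future extractors implement: a `name` property plus a static `extract(messages, **kwargs)`. A hedged sketch of a custom extractor built against that contract; `ExtractRoleCounts` is an invented example, not part of the package:

```python
from collections import Counter
from typing import Any

from wxo_agentic_evaluation.extractors.extractor_base import Extractor
from wxo_agentic_evaluation.type import Message


class ExtractRoleCounts(Extractor):  # hypothetical extractor, for illustration only
    @property
    def name(self) -> str:
        return "Role Counts"

    @staticmethod
    def extract(messages: list[Message], **kwargs) -> Any:
        # Tally how many messages each role contributed to the conversation.
        return {"role_counts": dict(Counter(m.role for m in messages))}
```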
wxo_agentic_evaluation/extractors/labeled_messages.py (new file):

@@ -0,0 +1,47 @@
+import json
+from typing import Any, List, Mapping
+
+from wxo_agentic_evaluation.extractors.extractor_base import Extractor
+from wxo_agentic_evaluation.type import ContentType, GoalDetail, Message
+
+
+class ExtractLabeledMessages(Extractor):
+    def name(self):
+        return "Labelled Messages"
+
+    def extract(
+        messages: List[Message],
+        ground_truth,
+        **kwargs,
+    ) -> Any:
+
+        tool_dictionary = (
+            {
+                goal_detail.name: goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.tool_call
+            }
+            if ground_truth.goal_details
+            else {}
+        )
+        labeled_messages = {}
+        for idx, message in enumerate(messages):
+            # TODO: investigate this logic - `message` body might not be consistent across providers
+            if not (message.role == "assistant" and message.tool_calls):
+                continue
+            try:
+                msg_tool_call = message.tool_calls[0].function
+            except Exception:
+                # ignore malformed tool_call content
+                continue
+
+            matching_goal_details = [
+                gd
+                for gd in tool_dictionary.values()
+                if gd.tool_name == msg_tool_call.name
+            ]
+
+            if matching_goal_details:
+                labeled_messages[idx] = matching_goal_details
+
+        return {"labeled_messages": labeled_messages}
wxo_agentic_evaluation/hr_agent_langgraph.py (new file):

@@ -0,0 +1,68 @@
+from langchain.tools import tool
+from langchain.agents import create_agent
+from langchain_openai import ChatOpenAI
+
+import json
+from datetime import datetime
+
+@tool
+def get_assignment_id_hr_usecase(username: str) -> str:
+    """
+    get the assignment id from username
+    :param username: username of the employee
+    """
+    if username=="nwaters":
+        return "15778303"
+    if username=="johndoe":
+        return "15338303"
+    return "not found"
+
+def validate_datetime(date_text):
+    try:
+        format = "%Y-%m-%d"
+        datetime.strptime(date_text, format)
+        return True
+    except ValueError:
+        return False
+
+
+@tool
+def get_timeoff_schedule_hr_usecase(assignment_id: str, start_date: str, end_date: str) -> str:
+    """
+    get timeoff schedule for employee based on assignment id, start date and end date
+    :param assignment_id: assignment_id of the user
+    :param start_date: start date of the timeoff scheduel, in YYYY-MM-DD format
+    :param assignment_id: end date of the timeoff scheduel, in YYYY-MM-DD format
+    """
+
+    if not validate_datetime(start_date):
+        return f"Incorrect date format {start_date}, should be YYYY-MM-DD"
+    if not validate_datetime(end_date):
+        return f"Incorrect date format {end_date}, should be YYYY-MM-DD"
+    if assignment_id=="15338303":
+        return json.dumps(["20250411", "20250311", "20250101"])
+    if assignment_id=="15778303":
+        return json.dumps(["20250105"])
+    return []
+
+
+@tool
+def get_direct_reports_hr_usecase(username: str) -> str:
+    """
+    get direct reports for a given username
+    :param assignment_id: assignment_id of the user
+    """
+
+    return json.dumps(["nwaters", "johndoe"])
+
+
+llm = ChatOpenAI(model="gpt-4o-mini")
+tools = [get_assignment_id_hr_usecase, get_timeoff_schedule_hr_usecase, get_direct_reports_hr_usecase]
+system_prompt="""You are an HR Agent that can answer questions related to timeoff and holiday calendar. Use the tools provided to answer the user's question. If you do not have enough information to answer the question, say so. If you need more information, ask follow up questions."""
+
+agent = create_agent(
+    tools=tools,
+    model=llm,
+    system_prompt=system_prompt
+)
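`hr_agent_langgraph.py` builds the demo agent at import time, so it needs OpenAI credentials in the environment. Assuming `create_agent` returns a LangGraph-style runnable (as in LangChain 1.x), one turn could be driven roughly like this (sketch only):

```python
# Sketch: drive the HR demo agent for a single turn. Assumes OPENAI_API_KEY is
# set (ChatOpenAI is instantiated at import) and that the returned agent accepts
# a {"messages": [...]} state, as LangGraph-style agents do.
from wxo_agentic_evaluation.hr_agent_langgraph import agent

result = agent.invoke(
    {"messages": [{"role": "user", "content": "What is the timeoff schedule for nwaters this year?"}]}
)
print(result["messages"][-1].content)  # final assistant reply
```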
wxo_agentic_evaluation/langfuse_collection.py (new file):

@@ -0,0 +1,60 @@
+import json
+from typing import List, Mapping, Union
+
+import rich
+
+from wxo_agentic_evaluation.type import (
+    LangfuseCollectionModel,
+    LangfuseDatasetModel,
+)
+
+
+class LangfuseCollection:
+    def __init__(self, name, description="", metadata: Mapping[str, str] = {}):
+        self.name = name
+        self.description = description
+        self.metadata = metadata
+
+    def upload(self, paths: Union[str, List[str]]):
+        from langfuse import get_client
+
+        langfuse_client = get_client()
+
+        datasets = []
+        if isinstance(paths, str):
+            paths = [paths]
+
+        for path in paths:
+            with open(path, encoding="utf-8") as f:
+                dataset = json.load(f)
+            dataset = LangfuseDatasetModel(
+                starting_sentence=dataset.get("starting_sentence", ""),
+                story=dataset.get("story", ""),
+                goals=dataset.get("goals"),
+                goal_details=dataset.get("goal_details"),
+                agent=dataset.get("agent")
+            )
+            datasets.append(dataset)
+
+        collection = LangfuseCollectionModel(
+            collection_name=self.name,
+            collection_description=self.description,
+            datasets=datasets,
+            metadata=self.metadata,
+        )
+
+        rich.print(
+            f"[g] Uploading {len(collection.datasets)} datasets to '{collection.collection_name}'"
+        )
+        langfuse_client.create_dataset(
+            name=collection.collection_name,
+            description=collection.collection_description,
+            metadata=collection.metadata,
+        )
+
+        for dataset in collection.datasets:
+            langfuse_client.create_dataset_item(
+                dataset_name=collection.collection_name,
+                input=dataset.langfuse_input,
+                expected_output=dataset.langfuse_output,
+            )
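Uploading a local test-case file with the new helper might look like this; it assumes Langfuse credentials are configured in the environment and `hr_dataset.json` is a hypothetical file in the dataset format read above (`starting_sentence`, `story`, `goals`, `goal_details`, `agent`):

```python
from wxo_agentic_evaluation.langfuse_collection import LangfuseCollection

collection = LangfuseCollection(
    name="HR AGENT DEMO",
    description="Time-off test cases",        # optional
    metadata={"owner": "evaluation-team"},    # optional
)
collection.upload(["hr_dataset.json"])  # accepts a single path or a list of paths
```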
wxo_agentic_evaluation/langfuse_evaluation_package.py (new file):

@@ -0,0 +1,192 @@
+from collections import defaultdict
+from typing import Callable, List
+
+import rich
+from langfuse import get_client
+from langfuse.experiment import ExperimentResult
+
+from wxo_agentic_evaluation.langfuse_collection import LangfuseCollection
+from wxo_agentic_evaluation.metrics import Evaluation
+from wxo_agentic_evaluation.metrics.dummy_metric import DummyMetric
+from wxo_agentic_evaluation.metrics.journey_success import JourneySuccessMetric
+from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric
+from wxo_agentic_evaluation.metrics.tool_calling import ToolCalling
+from wxo_agentic_evaluation.otel_parser import parser as otel_parser
+from wxo_agentic_evaluation.otel_parser.parser_types import (
+    Message as OtelMessage,
+)
+from wxo_agentic_evaluation.type import (
+    ExperimentResult,
+    LangfuseDatasetModel,
+    _convert_to_langfuse_format,
+)
+
+from wxo_agentic_evaluation.extractors import ExtractLabeledMessages
+
+LANGFUSE_CLIENT = get_client()
+
+
+def upload(name, session_id, value, data_type, metadata):
+    try:
+        LANGFUSE_CLIENT.create_score(
+            name=name,
+            session_id=session_id,
+            value=value,
+            data_type=data_type,
+            metadata=metadata,
+        )
+    except Exception as e:
+        rich.print(
+            f"[r] Uploading {name} with value {value} failed with exception {e}"
+        )
+
+
+def sample_aggregator(session_results: List[List[Evaluation]]):
+    metric_names = [
+        "journey_success",
+        "total_tool_calls",
+        "correct_tool_calls",
+        "expected_tool_calls",
+        "tool_calls_with_incorrect_parameter",
+        "tool_call_recall",
+        "tool_call_precision",
+    ]
+    group_metrics = defaultdict(list)
+
+    for result in session_results:
+        for metric in result:
+            if metric["eval_name"] in metric_names:
+                group_metrics[metric["eval_name"]].append(
+                    {"value": metric["value"], "metadata": metric["metadata"]}
+                )
+
+    average_metric = []
+    for metric_name, values in group_metrics.items():
+        aggr = []
+        for value in values:
+            aggr.append(value.get("value"))
+
+        metric_value = LangfuseMetric(
+            eval_name=f"Average_{metric_name}",
+            value=round(sum(aggr) / len(aggr), 2),
+            metadata=values[0]["metadata"],
+        )
+        average_metric.append(metric_value)
+
+    return average_metric
+
+
+class EvaluationRunner:
+    def __init__(
+        self,
+        evaluation_name: str,
+        run_name: str,
+        session_ids: List[str],
+        collection: LangfuseCollection,
+        metrics: List[Evaluation],
+        aggregator: Callable,
+    ):
+        self.evaluation_name = evaluation_name
+        self.run_name = run_name
+
+        self.experiment_id = f"{self.evaluation_name}.{self.run_name}"
+
+        self.collection = collection
+        langfuse_dataset = LANGFUSE_CLIENT.get_dataset(self.collection.name)
+        self.test_cases: List[LangfuseDatasetModel] = []
+        for item in langfuse_dataset.items:
+            data_model = _convert_to_langfuse_format(item)
+            self.test_cases.append(data_model)
+
+        self.session_ids = session_ids
+        self.messages = [otel_parser.parse_session(id) for id in self.session_ids]
+
+        assert (
+            len(self.session_ids) == len(self.messages) == len(self.test_cases)
+        )
+
+        self.metrics = metrics
+        self.aggregator = aggregator
+
+    def evaluate(self):
+        metadata = {"experiment_id": self.experiment_id}
+
+        total_metrics = []
+        for idx, test_case in enumerate(self.test_cases):
+            metric_results = []
+            messages = self.messages[idx]
+            extracted_context = ExtractLabeledMessages.extract(messages, test_case)
+            for metric in self.metrics:
+                result = metric.evaluate(
+                    messages=messages,
+                    ground_truth=test_case,
+                    extracted_context=extracted_context,
+                    metadata=metadata
+                )
+                if isinstance(result, list):
+                    metric_results.extend([r.model_dump() for r in result])
+                    for r in result:
+                        upload(
+                            name=r.eval_name,
+                            session_id=self.session_ids[idx],
+                            value=r.value,
+                            data_type=r.data_type,
+                            metadata=r.metadata,
+                        )
+                else:
+                    metric_results.append(result.model_dump())
+                    upload(
+                        name=result.eval_name,
+                        session_id=self.session_ids[idx],
+                        value=result.value,
+                        data_type=result.data_type,
+                        metadata=result.metadata,
+                    )
+            total_metrics.append(metric_results)
+
+        aggregate_metrics = self.aggregator(total_metrics)
+        for metric in aggregate_metrics:
+            try:
+                LANGFUSE_CLIENT.create_score(
+                    name=metric.eval_name,
+                    value=metric.value,
+                    metadata=metric.metadata,
+                    data_type="NUMERIC",
+                    dataset_run_id=metric.metadata["experiment_id"],
+                )
+            except Exception as e:
+                rich.print(
+                    f"[r] Uploading {metric.name} with value {metric.value} failed with exception {e}"
+                )
+
+        return ExperimentResult(
+            experiment_name=self.evaluation_name,
+            run_id=self.run_name,
+            experiment_id=self.experiment_id,
+            metrics=total_metrics,
+            session_ids=self.session_ids
+        )
+
+
+if __name__ == "__main__":
+    collection_name = "HR AGENT DEMO"
+    langfuse_collection = LangfuseCollection(name=collection_name)
+    journey_sucess_metric = JourneySuccessMetric()
+    tool_calling = ToolCalling()
+
+    SESSION_ID = "agent-demo-session-id-NEW"
+
+    run = EvaluationRunner(
+        evaluation_name="sample_evaluation",
+        run_name="1",
+        session_ids=[
+            "agent-demo-session-id-NEW-0",
+            "agent-demo-session-id-NEW-1",
+        ],
+        collection=langfuse_collection,
+        metrics=[journey_sucess_metric, tool_calling],
+        aggregator=sample_aggregator,
+    )
+
+    experiment_results = run.evaluate()
+    rich.print(experiment_results.model_dump())
wxo_agentic_evaluation/llm_matching.py:

@@ -1,10 +1,22 @@
+"""
+LLM Matching Module with Cosine Similarity Support
+
+This module provides functionality for matching text using:
+1. LLM-based matching (using a language model to determine semantic equivalence)
+2. Embedding-based matching (using cosine similarity between text embeddings)
+"""
+
+import math
 from typing import List

+from fuzzywuzzy import fuzz
+
 from wxo_agentic_evaluation.prompt.template_render import (
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
 )
 from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.utils.utils import safe_divide


 class LLMMatcher:
@@ -13,10 +25,18 @@ class LLMMatcher:
         llm_client: Provider,
         keyword_template: KeywordMatchingTemplateRenderer,
         semantic_template: SemanticMatchingTemplateRenderer,
+        use_llm_for_semantic: bool = True,
+        embedding_model_id: str = "sentence-transformers/all-minilm-l6-v2",
+        similarity_threshold: float = 0.8,
+        enable_fuzzy_matching: bool = False,
     ):
         self.llm_client = llm_client
         self.keyword_template = keyword_template
         self.semantic_template = semantic_template
+        self.embedding_model_id = embedding_model_id
+        self.use_llm_for_semantic = use_llm_for_semantic
+        self.similarity_threshold = similarity_threshold
+        self.enable_fuzzy_matching = enable_fuzzy_matching

     def keywords_match(self, response_text: str, keywords: List[str]) -> bool:
         if len(keywords) == 0:
@@ -31,10 +51,92 @@ class LLMMatcher:
         result = output.strip().lower()
         return result.startswith("true")

-    def
+    def generate_embeddings(
+        self, prediction: str, ground_truth: str
+    ) -> List[List[float]]:
+
+        embeddings = self.llm_client.encode([prediction, ground_truth])
+
+        return embeddings
+
+    def compute_cosine_similarity(
+        self, vec1: List[float], vec2: List[float]
+    ) -> float:
+        """Calculate cosine similarity between two vectors using pure Python"""
+
+        # Manual dot product calculation
+        dot_product = sum(a * b for a, b in zip(vec1, vec2))
+
+        # Manual magnitude calculations
+        magnitude1 = math.sqrt(sum(a * a for a in vec1))
+        magnitude2 = math.sqrt(sum(b * b for b in vec2))
+
+        return safe_divide(dot_product, (magnitude1 * magnitude2))
+
+    def cosine_similarity_semantic_match(
+        self, prediction: str, ground_truth: str
+    ) -> bool:
+        embeddings = self.generate_embeddings(prediction, ground_truth)
+        cosine_similarity = self.compute_cosine_similarity(
+            embeddings[0], embeddings[1]
+        )
+
+        return cosine_similarity >= self.similarity_threshold
+
+    def llm_semantic_match(
+        self, context, prediction: str, ground_truth: str
+    ) -> bool:
+        """Performs semantic matching for the agent's final response and the expected response using the starting sentence of the conversation as the context
+
+        Args:
+            context: The starting sentence of the conversation. TODO can also consider using the LLM user's story
+            prediction: the predicted string
+            ground_truth: the expected string
+
+        Returns:
+            a boolean indicating if the sentences match.
+        """
+
         prompt = self.semantic_template.render(
-            expected_text=ground_truth, actual_text=prediction
+            context=context, expected_text=ground_truth, actual_text=prediction
         )
         output: str = self.llm_client.query(prompt)
         result = output.strip().lower()
+
         return result.startswith("true")
+
+    def fuzzywuzzy_semantic_match(
+        self, prediction: str, ground_truth: str
+    ) -> bool:
+
+        similarity_score = fuzz.WRatio(prediction, ground_truth)
+
+        return similarity_score > self.similarity_threshold
+
+    def semantic_match(
+        self,
+        context: str,
+        prediction: str,
+        ground_truth: str,
+        enable_fuzzy_matching: bool = False,
+    ) -> bool:
+        ## TODO arjun-gupta1 10/06/2025: revist retry with exponential backoff. Opted for direct fallback to cosine similarity to avoid latency for now.
+        try:
+            return self.llm_semantic_match(context, prediction, ground_truth)
+        except Exception as e:
+            print(f"LLM semantic match failed: {e}")

+            if enable_fuzzy_matching:
+                print("falling back to fuzzy matching")
+            # Fallback to cosine similarity if LLM matching is not used or failed
+            try:
+                return self.cosine_similarity_semantic_match(
+                    prediction, ground_truth
+                )
+            except Exception as e:
+                print(
+                    f"Cosine similarity failed: {e}. Falling back to fuzzywuzzy."
+                )

+            # Final fallback to fuzzywuzzy
+            return self.fuzzywuzzy_semantic_match(prediction, ground_truth)
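`compute_cosine_similarity` is plain Python: a dot product over the two embedding vectors divided by the product of their magnitudes, with `safe_divide` guarding against zero-length vectors, and `cosine_similarity_semantic_match` comparing the result to `similarity_threshold` (default 0.8). A minimal sketch with the guard inlined:

```python
import math
from typing import List


def cosine_similarity(vec1: List[float], vec2: List[float]) -> float:
    dot_product = sum(a * b for a, b in zip(vec1, vec2))
    magnitude1 = math.sqrt(sum(a * a for a in vec1))
    magnitude2 = math.sqrt(sum(b * b for b in vec2))
    denominator = magnitude1 * magnitude2
    return dot_product / denominator if denominator else 0.0  # inlined safe_divide


# Two near-parallel embedding vectors clear the default 0.8 threshold.
embedding_a = [0.1, 0.3, 0.5]
embedding_b = [0.1, 0.29, 0.52]
print(cosine_similarity(embedding_a, embedding_b) >= 0.8)  # True
```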
wxo_agentic_evaluation/llm_safety_eval.py (new file):

@@ -0,0 +1,64 @@
+import json
+import re
+
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerDerailment,
+    AnswerUnsafeTopic,
+)
+from wxo_agentic_evaluation.prompt.template_render import (
+    DerailmentTemplateRenderer,
+    UnsafeTopicTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+
+
+class LLMSafetyJudge:
+    def __init__(
+        self,
+        llm_client: Provider,
+        answer_derailment: DerailmentTemplateRenderer,
+        answer_unsafe_topic: UnsafeTopicTemplateRenderer,
+    ):
+        self.llm_client = llm_client
+        self.answer_derailment_template = answer_derailment
+        self.answer_unsafe_topic_template = answer_unsafe_topic
+
+    def judge_derailment_in_answer(
+        self, question: str, instructions: str, answer: str
+    ) -> AnswerDerailment:
+        prompt = self.answer_derailment_template.render(
+            question=question, instructions=instructions, answer=answer
+        )
+        output = self.llm_client.query(prompt)
+        result = output.strip().lower()
+        try:
+            json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
+            thejson = json.loads(json_str)
+            answer_derailment = AnswerDerailment.model_validate(thejson)
+        except Exception:
+            answer_derailment = AnswerDerailment(
+                in_scope="not sure", statement=answer, reason=result
+            )
+        return answer_derailment
+
+    def judge_unsafe_topic_in_answer(
+        self, question: str, instructions: str, answer: str
+    ) -> AnswerUnsafeTopic:
+        """
+        Use LLM to judge if the answer contains unsafe topics.
+        """
+        prompt = self.answer_unsafe_topic_template.render(
+            question=question, instructions=instructions, answer=answer
+        )
+        output = self.llm_client.query(prompt)
+        result = output.strip().lower()
+        try:
+            json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
+            thejson = json.loads(json_str)
+            answer_unsafe = AnswerUnsafeTopic.model_validate(thejson)
+        except Exception:
+            answer_unsafe = AnswerUnsafeTopic(
+                is_safe="not sure", statement=answer, reason=result
+            )
+
+        return answer_unsafe
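Both judges rely on the same recovery pattern: pull the first `{...}` block out of the raw LLM reply with a DOTALL regex, parse it as JSON, and fall back to a "not sure" verdict if that fails. A self-contained sketch of the extraction step (the reply text is made up):

```python
import json
import re

raw_reply = 'Sure. {"in_scope": "false", "statement": "example", "reason": "off-topic"} Hope that helps.'
match = re.search(r"\{.*\}", raw_reply, re.DOTALL)  # first {...} block in the reply
parsed = json.loads(match.group(0)) if match else None
print(parsed["in_scope"] if parsed else "not sure")  # -> false
```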
wxo_agentic_evaluation/llm_user.py:

@@ -1,5 +1,6 @@
 from typing import List, TypeVar

+from wxo_agentic_evaluation.base_user import BaseUserSimulator
 from wxo_agentic_evaluation.prompt.template_render import JinjaTemplateRenderer
 from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
 from wxo_agentic_evaluation.type import ContentType, Message
@@ -7,9 +8,9 @@ from wxo_agentic_evaluation.type import ContentType, Message
 T = TypeVar("T", bound=JinjaTemplateRenderer)


-class LLMUser:
+class LLMUser(BaseUserSimulator):
     def __init__(
-        self, wai_client: Provider, template: T, user_response_style: List[str]
+        self, wai_client: Provider, template: T, user_response_style: List[str] | None = None
     ):
         self.wai_client = wai_client
         self.prompt_template = template
@@ -21,8 +22,8 @@ class LLMUser:
         self,
         user_story,
         conversation_history: List[Message],
-        attack_instructions: str = None,
-    ) -> Message
+        attack_instructions: str | None = None,
+    ) -> Message:
         # the tool response is already summarized, we don't need that to take over the chat history context window
         prompt_input = self.prompt_template.render(
             conversation_history=[