ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/external_agent/external_validate.py

@@ -41,15 +41,15 @@ class ExternalAgentValidation:
         data = b""
         for chunk in resp:
             for line in chunk.splitlines(True):
-                if line.startswith(b"data:"):
-                    line = line.replace(b"data:", b"")
-                    if line.strip() == b"[DONE]":
-                        return
+                if line.startswith(b"event:"):
+                    continue
                 data += line
                 if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                     # NOTE: edge case, "data" can be sent in two different chunks
                     if data.startswith(b"data:"):
                         data = data.replace(b"data:", b"")
+                        if data.strip() == b"[DONE]":
+                            return
                     yield data
                     data = b""
         if data:
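
For context on the streaming change above: "event:" lines are now skipped per line, while the "data:" prefix and the "[DONE]" sentinel are handled on the accumulated buffer once a blank-line terminator is seen. A minimal standalone sketch of that parsing order (illustrative only, not the package's exact method; the helper name and sample chunks are made up):

def iter_sse_payloads(chunks):
    # hypothetical helper; mirrors the post-change flow shown in the hunk
    data = b""
    for chunk in chunks:
        for line in chunk.splitlines(True):
            if line.startswith(b"event:"):
                continue
            data += line
            if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                if data.startswith(b"data:"):
                    data = data.replace(b"data:", b"")
                    if data.strip() == b"[DONE]":
                        return
                yield data
                data = b""
    if data:
        yield data


chunks = [b'event: message\ndata: {"x": 1}\n\n', b"data: [DONE]\n\n"]
print([p.strip() for p in iter_sse_payloads(chunks)])  # [b'{"x": 1}']
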
@@ -74,7 +74,9 @@ class ExternalAgentValidation:
         payload = {"stream": True}
         payload["messages"] = messages
         resp = requests.post(
-            url=self.service_url, headers=self.header, json=payload
+            url=self.service_url,
+            headers=self.header,
+            json=payload,
         )
         success, logged_events = self._validate_streaming_response(resp)
 

wxo_agentic_evaluation/external_agent/types.py

@@ -1,4 +1,4 @@
-from typing import Any, List, Literal, Mapping, Union
+from typing import Any, List, Literal, Mapping, Optional, Union
 
 from pydantic import BaseModel
 
@@ -46,7 +46,7 @@ class ThreadRunStepDeltaChoice(BaseModel):
 class BaseEventData(BaseModel):
     id: str
     object: str
-    thread_id: str
+    thread_id: Optional[str] = None
     model: str | None = None
     created: int | None = None
 
@@ -62,13 +62,7 @@ class ThreadRunStepDeltaData(BaseEventData):
 
 
 class UniversalData(BaseEventData):
-    object: Union[
-        Literal["thread.message.delta"],
-        Literal["thread.run.step.delta"],
-        Literal["thread.run.step.created"],
-        Literal["thread.run.step.completed"],
-    ]
-    choices: List[ThreadMessageDeltaChoice]
+    object: Optional[str]
     choices: List[Union[ThreadMessageDeltaChoice, dict]]
 
 

wxo_agentic_evaluation/extractors/__init__.py

@@ -0,0 +1,3 @@
+from wxo_agentic_evaluation.extractors.labeled_messages import (
+    ExtractLabeledMessages,
+)

wxo_agentic_evaluation/extractors/extractor_base.py

@@ -0,0 +1,21 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+from wxo_agentic_evaluation.type import Message
+
+
+class Extractor(ABC):
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Unique name for the extractor."""
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def extract(
+        messages: list[Message],
+        **kwargs,
+    ) -> Any:
+        """Extract data from messages."""
+        raise NotImplementedError
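
The base class above pairs an abstract `name` property with a static `extract` hook. A toy subclass, sketched with a stand-in message type instead of `wxo_agentic_evaluation.type.Message` (every name below is hypothetical; the ABC is re-declared locally only so the sketch runs on its own):

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any


@dataclass
class FakeMessage:  # hypothetical stand-in for the framework's Message model
    role: str
    content: str


class Extractor(ABC):  # local re-declaration of the ABC shown in the diff
    @property
    @abstractmethod
    def name(self) -> str: ...

    @staticmethod
    @abstractmethod
    def extract(messages: list, **kwargs) -> Any: ...


class ExtractAssistantTurns(Extractor):  # hypothetical example subclass
    @property
    def name(self) -> str:
        return "Assistant Turns"

    @staticmethod
    def extract(messages: list, **kwargs) -> Any:
        # collect the text of every assistant turn
        return {"assistant_turns": [m.content for m in messages if m.role == "assistant"]}


msgs = [FakeMessage("user", "hi"), FakeMessage("assistant", "hello")]
print(ExtractAssistantTurns().name, ExtractAssistantTurns.extract(msgs))
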

wxo_agentic_evaluation/extractors/labeled_messages.py

@@ -0,0 +1,47 @@
+import json
+from typing import Any, List, Mapping
+
+from wxo_agentic_evaluation.extractors.extractor_base import Extractor
+from wxo_agentic_evaluation.type import ContentType, GoalDetail, Message
+
+
+class ExtractLabeledMessages(Extractor):
+    def name(self):
+        return "Labelled Messages"
+
+    def extract(
+        messages: List[Message],
+        ground_truth,
+        **kwargs,
+    ) -> Any:
+
+        tool_dictionary = (
+            {
+                goal_detail.name: goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.tool_call
+            }
+            if ground_truth.goal_details
+            else {}
+        )
+        labeled_messages = {}
+        for idx, message in enumerate(messages):
+            # TODO: investigate this logic - `message` body might not be consistent across providers
+            if not (message.role == "assistant" and message.tool_calls):
+                continue
+            try:
+                msg_tool_call = message.tool_calls[0].function
+            except Exception:
+                # ignore malformed tool_call content
+                continue
+
+            matching_goal_details = [
+                gd
+                for gd in tool_dictionary.values()
+                if gd.tool_name == msg_tool_call.name
+            ]
+
+            if matching_goal_details:
+                labeled_messages[idx] = matching_goal_details
+
+        return {"labeled_messages": labeled_messages}

wxo_agentic_evaluation/hr_agent_langgraph.py

@@ -0,0 +1,68 @@
+from langchain.tools import tool
+from langchain.agents import create_agent
+from langchain_openai import ChatOpenAI
+
+import json
+from datetime import datetime
+
+@tool
+def get_assignment_id_hr_usecase(username: str) -> str:
+    """
+    get the assignment id from username
+    :param username: username of the employee
+    """
+    if username=="nwaters":
+        return "15778303"
+    if username=="johndoe":
+        return "15338303"
+    return "not found"
+
+def validate_datetime(date_text):
+    try:
+        format = "%Y-%m-%d"
+        datetime.strptime(date_text, format)
+        return True
+    except ValueError:
+        return False
+
+
+@tool
+def get_timeoff_schedule_hr_usecase(assignment_id: str, start_date: str, end_date: str) -> str:
+    """
+    get timeoff schedule for employee based on assignment id, start date and end date
+    :param assignment_id: assignment_id of the user
+    :param start_date: start date of the timeoff scheduel, in YYYY-MM-DD format
+    :param assignment_id: end date of the timeoff scheduel, in YYYY-MM-DD format
+    """
+
+    if not validate_datetime(start_date):
+        return f"Incorrect date format {start_date}, should be YYYY-MM-DD"
+    if not validate_datetime(end_date):
+        return f"Incorrect date format {end_date}, should be YYYY-MM-DD"
+    if assignment_id=="15338303":
+        return json.dumps(["20250411", "20250311", "20250101"])
+    if assignment_id=="15778303":
+        return json.dumps(["20250105"])
+    return []
+
+
+@tool
+def get_direct_reports_hr_usecase(username: str) -> str:
+    """
+    get direct reports for a given username
+    :param assignment_id: assignment_id of the user
+    """
+
+    return json.dumps(["nwaters", "johndoe"])
+
+
+llm = ChatOpenAI(model="gpt-4o-mini")
+tools = [get_assignment_id_hr_usecase, get_timeoff_schedule_hr_usecase, get_direct_reports_hr_usecase]
+system_prompt="""You are an HR Agent that can answer questions related to timeoff and holiday calendar. Use the tools provided to answer the user's question. If you do not have enough information to answer the question, say so. If you need more information, ask follow up questions."""
+
+agent = create_agent(
+    tools=tools,
+    model=llm,
+    system_prompt=system_prompt
+)
+
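
A possible invocation of the demo agent above, assuming a LangGraph-style message state as returned by recent `create_agent` versions; the exact invoke contract depends on the installed langchain release, so treat this as a sketch rather than the file's own usage:

result = agent.invoke(
    {"messages": [{"role": "user", "content": "What timeoff does nwaters have scheduled?"}]}
)
print(result["messages"][-1].content)  # final assistant reply
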

wxo_agentic_evaluation/langfuse_collection.py

@@ -0,0 +1,60 @@
+import json
+from typing import List, Mapping, Union
+
+import rich
+
+from wxo_agentic_evaluation.type import (
+    LangfuseCollectionModel,
+    LangfuseDatasetModel,
+)
+
+
+class LangfuseCollection:
+    def __init__(self, name, description="", metadata: Mapping[str, str] = {}):
+        self.name = name
+        self.description = description
+        self.metadata = metadata
+
+    def upload(self, paths: Union[str, List[str]]):
+        from langfuse import get_client
+
+        langfuse_client = get_client()
+
+        datasets = []
+        if isinstance(paths, str):
+            paths = [paths]
+
+        for path in paths:
+            with open(path, encoding="utf-8") as f:
+                dataset = json.load(f)
+                dataset = LangfuseDatasetModel(
+                    starting_sentence=dataset.get("starting_sentence", ""),
+                    story=dataset.get("story", ""),
+                    goals=dataset.get("goals"),
+                    goal_details=dataset.get("goal_details"),
+                    agent=dataset.get("agent")
+                )
+                datasets.append(dataset)
+
+        collection = LangfuseCollectionModel(
+            collection_name=self.name,
+            collection_description=self.description,
+            datasets=datasets,
+            metadata=self.metadata,
+        )
+
+        rich.print(
+            f"[g] Uploading {len(collection.datasets)} datasets to '{collection.collection_name}'"
+        )
+        langfuse_client.create_dataset(
+            name=collection.collection_name,
+            description=collection.collection_description,
+            metadata=collection.metadata,
+        )
+
+        for dataset in collection.datasets:
+            langfuse_client.create_dataset_item(
+                dataset_name=collection.collection_name,
+                input=dataset.langfuse_input,
+                expected_output=dataset.langfuse_output,
+            )
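
A hedged usage sketch for the new collection helper: the collection name, metadata, and dataset path below are invented, and a configured Langfuse environment (LANGFUSE_* credentials) plus the installed package are assumed.

from wxo_agentic_evaluation.langfuse_collection import LangfuseCollection

collection = LangfuseCollection(
    name="hr-agent-demo",                              # hypothetical collection name
    description="HR agent simulation test cases",
    metadata={"source": "local-json"},
)
collection.upload(["tests/data/timeoff_case.json"])    # hypothetical dataset path(s)
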

wxo_agentic_evaluation/langfuse_evaluation_package.py

@@ -0,0 +1,192 @@
+from collections import defaultdict
+from typing import Callable, List
+
+import rich
+from langfuse import get_client
+from langfuse.experiment import ExperimentResult
+
+from wxo_agentic_evaluation.langfuse_collection import LangfuseCollection
+from wxo_agentic_evaluation.metrics import Evaluation
+from wxo_agentic_evaluation.metrics.dummy_metric import DummyMetric
+from wxo_agentic_evaluation.metrics.journey_success import JourneySuccessMetric
+from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric
+from wxo_agentic_evaluation.metrics.tool_calling import ToolCalling
+from wxo_agentic_evaluation.otel_parser import parser as otel_parser
+from wxo_agentic_evaluation.otel_parser.parser_types import (
+    Message as OtelMessage,
+)
+from wxo_agentic_evaluation.type import (
+    ExperimentResult,
+    LangfuseDatasetModel,
+    _convert_to_langfuse_format,
+)
+
+from wxo_agentic_evaluation.extractors import ExtractLabeledMessages
+
+LANGFUSE_CLIENT = get_client()
+
+
+def upload(name, session_id, value, data_type, metadata):
+    try:
+        LANGFUSE_CLIENT.create_score(
+            name=name,
+            session_id=session_id,
+            value=value,
+            data_type=data_type,
+            metadata=metadata,
+        )
+    except Exception as e:
+        rich.print(
+            f"[r] Uploading {name} with value {value} failed with exception {e}"
+        )
+
+
+def sample_aggregator(session_results: List[List[Evaluation]]):
+    metric_names = [
+        "journey_success",
+        "total_tool_calls",
+        "correct_tool_calls",
+        "expected_tool_calls",
+        "tool_calls_with_incorrect_parameter",
+        "tool_call_recall",
+        "tool_call_precision",
+    ]
+    group_metrics = defaultdict(list)
+
+    for result in session_results:
+        for metric in result:
+            if metric["eval_name"] in metric_names:
+                group_metrics[metric["eval_name"]].append(
+                    {"value": metric["value"], "metadata": metric["metadata"]}
+                )
+
+    average_metric = []
+    for metric_name, values in group_metrics.items():
+        aggr = []
+        for value in values:
+            aggr.append(value.get("value"))
+
+        metric_value = LangfuseMetric(
+            eval_name=f"Average_{metric_name}",
+            value=round(sum(aggr) / len(aggr), 2),
+            metadata=values[0]["metadata"],
+        )
+        average_metric.append(metric_value)
+
+    return average_metric
+
+
+class EvaluationRunner:
+    def __init__(
+        self,
+        evaluation_name: str,
+        run_name: str,
+        session_ids: List[str],
+        collection: LangfuseCollection,
+        metrics: List[Evaluation],
+        aggregator: Callable,
+    ):
+        self.evaluation_name = evaluation_name
+        self.run_name = run_name
+
+        self.experiment_id = f"{self.evaluation_name}.{self.run_name}"
+
+        self.collection = collection
+        langfuse_dataset = LANGFUSE_CLIENT.get_dataset(self.collection.name)
+        self.test_cases: List[LangfuseDatasetModel] = []
+        for item in langfuse_dataset.items:
+            data_model = _convert_to_langfuse_format(item)
+            self.test_cases.append(data_model)
+
+        self.session_ids = session_ids
+        self.messages = [otel_parser.parse_session(id) for id in self.session_ids]
+
+        assert (
+            len(self.session_ids) == len(self.messages) == len(self.test_cases)
+        )
+
+        self.metrics = metrics
+        self.aggregator = aggregator
+
+    def evaluate(self):
+        metadata = {"experiment_id": self.experiment_id}
+
+        total_metrics = []
+        for idx, test_case in enumerate(self.test_cases):
+            metric_results = []
+            messages = self.messages[idx]
+            extracted_context = ExtractLabeledMessages.extract(messages, test_case)
+            for metric in self.metrics:
+                result = metric.evaluate(
+                    messages=messages,
+                    ground_truth=test_case,
+                    extracted_context=extracted_context,
+                    metadata=metadata
+                )
+                if isinstance(result, list):
+                    metric_results.extend([r.model_dump() for r in result])
+                    for r in result:
+                        upload(
+                            name=r.eval_name,
+                            session_id=self.session_ids[idx],
+                            value=r.value,
+                            data_type=r.data_type,
+                            metadata=r.metadata,
+                        )
+                else:
+                    metric_results.append(result.model_dump())
+                    upload(
+                        name=result.eval_name,
+                        session_id=self.session_ids[idx],
+                        value=result.value,
+                        data_type=result.data_type,
+                        metadata=result.metadata,
+                    )
+            total_metrics.append(metric_results)
+
+        aggregate_metrics = self.aggregator(total_metrics)
+        for metric in aggregate_metrics:
+            try:
+                LANGFUSE_CLIENT.create_score(
+                    name=metric.eval_name,
+                    value=metric.value,
+                    metadata=metric.metadata,
+                    data_type="NUMERIC",
+                    dataset_run_id=metric.metadata["experiment_id"],
+                )
+            except Exception as e:
+                rich.print(
+                    f"[r] Uploading {metric.name} with value {metric.value} failed with exception {e}"
+                )
+
+        return ExperimentResult(
+            experiment_name=self.evaluation_name,
+            run_id=self.run_name,
+            experiment_id=self.experiment_id,
+            metrics=total_metrics,
+            session_ids=self.session_ids
+        )
+
+
+if __name__ == "__main__":
+    collection_name = "HR AGENT DEMO"
+    langfuse_collection = LangfuseCollection(name=collection_name)
+    journey_sucess_metric = JourneySuccessMetric()
+    tool_calling = ToolCalling()
+
+    SESSION_ID = "agent-demo-session-id-NEW"
+
+    run = EvaluationRunner(
+        evaluation_name="sample_evaluation",
+        run_name="1",
+        session_ids=[
+            "agent-demo-session-id-NEW-0",
+            "agent-demo-session-id-NEW-1",
+        ],
+        collection=langfuse_collection,
+        metrics=[journey_sucess_metric, tool_calling],
+        aggregator=sample_aggregator,
+    )
+
+    experiment_results = run.evaluate()
+    rich.print(experiment_results.model_dump())

wxo_agentic_evaluation/llm_matching.py

@@ -1,10 +1,22 @@
+"""
+LLM Matching Module with Cosine Similarity Support
+
+This module provides functionality for matching text using:
+1. LLM-based matching (using a language model to determine semantic equivalence)
+2. Embedding-based matching (using cosine similarity between text embeddings)
+"""
+
+import math
 from typing import List
 
+from fuzzywuzzy import fuzz
+
 from wxo_agentic_evaluation.prompt.template_render import (
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
 )
 from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.utils.utils import safe_divide
 
 
 class LLMMatcher:
@@ -13,10 +25,18 @@ class LLMMatcher:
         llm_client: Provider,
         keyword_template: KeywordMatchingTemplateRenderer,
         semantic_template: SemanticMatchingTemplateRenderer,
+        use_llm_for_semantic: bool = True,
+        embedding_model_id: str = "sentence-transformers/all-minilm-l6-v2",
+        similarity_threshold: float = 0.8,
+        enable_fuzzy_matching: bool = False,
     ):
         self.llm_client = llm_client
         self.keyword_template = keyword_template
         self.semantic_template = semantic_template
+        self.embedding_model_id = embedding_model_id
+        self.use_llm_for_semantic = use_llm_for_semantic
+        self.similarity_threshold = similarity_threshold
+        self.enable_fuzzy_matching = enable_fuzzy_matching
 
     def keywords_match(self, response_text: str, keywords: List[str]) -> bool:
         if len(keywords) == 0:
@@ -31,10 +51,92 @@ class LLMMatcher:
         result = output.strip().lower()
         return result.startswith("true")
 
-    def semantic_match(self, prediction: str, ground_truth: str) -> bool:
+    def generate_embeddings(
+        self, prediction: str, ground_truth: str
+    ) -> List[List[float]]:
+
+        embeddings = self.llm_client.encode([prediction, ground_truth])
+
+        return embeddings
+
+    def compute_cosine_similarity(
+        self, vec1: List[float], vec2: List[float]
+    ) -> float:
+        """Calculate cosine similarity between two vectors using pure Python"""
+
+        # Manual dot product calculation
+        dot_product = sum(a * b for a, b in zip(vec1, vec2))
+
+        # Manual magnitude calculations
+        magnitude1 = math.sqrt(sum(a * a for a in vec1))
+        magnitude2 = math.sqrt(sum(b * b for b in vec2))
+
+        return safe_divide(dot_product, (magnitude1 * magnitude2))
+
+    def cosine_similarity_semantic_match(
+        self, prediction: str, ground_truth: str
+    ) -> bool:
+        embeddings = self.generate_embeddings(prediction, ground_truth)
+        cosine_similarity = self.compute_cosine_similarity(
+            embeddings[0], embeddings[1]
+        )
+
+        return cosine_similarity >= self.similarity_threshold
+
+    def llm_semantic_match(
+        self, context, prediction: str, ground_truth: str
+    ) -> bool:
+        """Performs semantic matching for the agent's final response and the expected response using the starting sentence of the conversation as the context
+
+        Args:
+            context: The starting sentence of the conversation. TODO can also consider using the LLM user's story
+            prediction: the predicted string
+            ground_truth: the expected string
+
+        Returns:
+            a boolean indicating if the sentences match.
+        """
+
         prompt = self.semantic_template.render(
-            expected_text=ground_truth, actual_text=prediction
+            context=context, expected_text=ground_truth, actual_text=prediction
         )
         output: str = self.llm_client.query(prompt)
         result = output.strip().lower()
+
         return result.startswith("true")
+
+    def fuzzywuzzy_semantic_match(
+        self, prediction: str, ground_truth: str
+    ) -> bool:
+
+        similarity_score = fuzz.WRatio(prediction, ground_truth)
+
+        return similarity_score > self.similarity_threshold
+
+    def semantic_match(
+        self,
+        context: str,
+        prediction: str,
+        ground_truth: str,
+        enable_fuzzy_matching: bool = False,
+    ) -> bool:
+        ## TODO arjun-gupta1 10/06/2025: revist retry with exponential backoff. Opted for direct fallback to cosine similarity to avoid latency for now.
+        try:
+            return self.llm_semantic_match(context, prediction, ground_truth)
+        except Exception as e:
+            print(f"LLM semantic match failed: {e}")
+
+        if enable_fuzzy_matching:
+            print("falling back to fuzzy matching")
+            # Fallback to cosine similarity if LLM matching is not used or failed
+            try:
+                return self.cosine_similarity_semantic_match(
+                    prediction, ground_truth
+                )
+            except Exception as e:
+                print(
+                    f"Cosine similarity failed: {e}. Falling back to fuzzywuzzy."
+                )
+
+            # Final fallback to fuzzywuzzy
+            return self.fuzzywuzzy_semantic_match(prediction, ground_truth)
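
A worked example of the pure-Python cosine similarity introduced above, as a standalone re-implementation (`safe_divide` is replaced here with a plain zero guard):

import math

def cosine(vec1, vec2):
    # dot product over the product of the two magnitudes
    dot = sum(a * b for a, b in zip(vec1, vec2))
    norm = math.sqrt(sum(a * a for a in vec1)) * math.sqrt(sum(b * b for b in vec2))
    return dot / norm if norm else 0.0

print(round(cosine([1.0, 0.0, 1.0], [1.0, 1.0, 0.0]), 4))  # 0.5
print(cosine([1.0, 2.0], [2.0, 4.0]))                      # 1.0, parallel vectors
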

wxo_agentic_evaluation/llm_safety_eval.py

@@ -0,0 +1,64 @@
+import json
+import re
+
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerDerailment,
+    AnswerUnsafeTopic,
+)
+from wxo_agentic_evaluation.prompt.template_render import (
+    DerailmentTemplateRenderer,
+    UnsafeTopicTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+
+
+class LLMSafetyJudge:
+    def __init__(
+        self,
+        llm_client: Provider,
+        answer_derailment: DerailmentTemplateRenderer,
+        answer_unsafe_topic: UnsafeTopicTemplateRenderer,
+    ):
+        self.llm_client = llm_client
+        self.answer_derailment_template = answer_derailment
+        self.answer_unsafe_topic_template = answer_unsafe_topic
+
+    def judge_derailment_in_answer(
+        self, question: str, instructions: str, answer: str
+    ) -> AnswerDerailment:
+        prompt = self.answer_derailment_template.render(
+            question=question, instructions=instructions, answer=answer
+        )
+        output = self.llm_client.query(prompt)
+        result = output.strip().lower()
+        try:
+            json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
+            thejson = json.loads(json_str)
+            answer_derailment = AnswerDerailment.model_validate(thejson)
+        except Exception:
+            answer_derailment = AnswerDerailment(
+                in_scope="not sure", statement=answer, reason=result
+            )
+        return answer_derailment
+
+    def judge_unsafe_topic_in_answer(
+        self, question: str, instructions: str, answer: str
+    ) -> AnswerUnsafeTopic:
+        """
+        Use LLM to judge if the answer contains unsafe topics.
+        """
+        prompt = self.answer_unsafe_topic_template.render(
+            question=question, instructions=instructions, answer=answer
+        )
+        output = self.llm_client.query(prompt)
+        result = output.strip().lower()
+        try:
+            json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
+            thejson = json.loads(json_str)
+            answer_unsafe = AnswerUnsafeTopic.model_validate(thejson)
+        except Exception:
+            answer_unsafe = AnswerUnsafeTopic(
+                is_safe="not sure", statement=answer, reason=result
+            )
+
+        return answer_unsafe
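
Both judge methods above share one parsing pattern: grab the first JSON object embedded in the model's free-form reply, and fall back to a "not sure" verdict when that fails. A standalone illustration with an invented reply string:

import json
import re

raw = 'Verdict follows: {"in_scope": "false", "statement": "example answer", "reason": "off-topic"}'
try:
    # extract the JSON object from the surrounding prose
    verdict = json.loads(re.search(r"\{.*\}", raw, re.DOTALL).group(0))
except Exception:
    verdict = {"in_scope": "not sure", "reason": raw}
print(verdict["in_scope"])  # false
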

wxo_agentic_evaluation/llm_user.py

@@ -1,5 +1,6 @@
 from typing import List, TypeVar
 
+from wxo_agentic_evaluation.base_user import BaseUserSimulator
 from wxo_agentic_evaluation.prompt.template_render import JinjaTemplateRenderer
 from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
 from wxo_agentic_evaluation.type import ContentType, Message
@@ -7,9 +8,9 @@ from wxo_agentic_evaluation.type import ContentType, Message
 T = TypeVar("T", bound=JinjaTemplateRenderer)
 
 
-class LLMUser:
+class LLMUser(BaseUserSimulator):
     def __init__(
-        self, wai_client: Provider, template: T, user_response_style: List[str]
+        self, wai_client: Provider, template: T, user_response_style: List[str] | None = None
     ):
         self.wai_client = wai_client
         self.prompt_template = template
@@ -21,8 +22,8 @@ class LLMUser:
         self,
         user_story,
         conversation_history: List[Message],
-        attack_instructions: str = None,
-    ) -> Message | None:
+        attack_instructions: str | None = None,
+    ) -> Message:
         # the tool response is already summarized, we don't need that to take over the chat history context window
         prompt_input = self.prompt_template.render(
             conversation_history=[