ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/langfuse_evaluation_package.py
@@ -0,0 +1,192 @@
+ from collections import defaultdict
+ from typing import Callable, List
+
+ import rich
+ from langfuse import get_client
+ from langfuse.experiment import ExperimentResult
+
+ from wxo_agentic_evaluation.langfuse_collection import LangfuseCollection
+ from wxo_agentic_evaluation.metrics import Evaluation
+ from wxo_agentic_evaluation.metrics.dummy_metric import DummyMetric
+ from wxo_agentic_evaluation.metrics.journey_success import JourneySuccessMetric
+ from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric
+ from wxo_agentic_evaluation.metrics.tool_calling import ToolCalling
+ from wxo_agentic_evaluation.otel_parser import parser as otel_parser
+ from wxo_agentic_evaluation.otel_parser.parser_types import (
+     Message as OtelMessage,
+ )
+ from wxo_agentic_evaluation.type import (
+     ExperimentResult,
+     LangfuseDatasetModel,
+     _convert_to_langfuse_format,
+ )
+
+ from wxo_agentic_evaluation.extractors import ExtractLabeledMessages
+
+ LANGFUSE_CLIENT = get_client()
+
+
+ def upload(name, session_id, value, data_type, metadata):
+     try:
+         LANGFUSE_CLIENT.create_score(
+             name=name,
+             session_id=session_id,
+             value=value,
+             data_type=data_type,
+             metadata=metadata,
+         )
+     except Exception as e:
+         rich.print(
+             f"[r] Uploading {name} with value {value} failed with exception {e}"
+         )
+
+
+ def sample_aggregator(session_results: List[List[Evaluation]]):
+     metric_names = [
+         "journey_success",
+         "total_tool_calls",
+         "correct_tool_calls",
+         "expected_tool_calls",
+         "tool_calls_with_incorrect_parameter",
+         "tool_call_recall",
+         "tool_call_precision",
+     ]
+     group_metrics = defaultdict(list)
+
+     for result in session_results:
+         for metric in result:
+             if metric["eval_name"] in metric_names:
+                 group_metrics[metric["eval_name"]].append(
+                     {"value": metric["value"], "metadata": metric["metadata"]}
+                 )
+
+     average_metric = []
+     for metric_name, values in group_metrics.items():
+         aggr = []
+         for value in values:
+             aggr.append(value.get("value"))
+
+         metric_value = LangfuseMetric(
+             eval_name=f"Average_{metric_name}",
+             value=round(sum(aggr) / len(aggr), 2),
+             metadata=values[0]["metadata"],
+         )
+         average_metric.append(metric_value)
+
+     return average_metric
+
+
+ class EvaluationRunner:
+     def __init__(
+         self,
+         evaluation_name: str,
+         run_name: str,
+         session_ids: List[str],
+         collection: LangfuseCollection,
+         metrics: List[Evaluation],
+         aggregator: Callable,
+     ):
+         self.evaluation_name = evaluation_name
+         self.run_name = run_name
+
+         self.experiment_id = f"{self.evaluation_name}.{self.run_name}"
+
+         self.collection = collection
+         langfuse_dataset = LANGFUSE_CLIENT.get_dataset(self.collection.name)
+         self.test_cases: List[LangfuseDatasetModel] = []
+         for item in langfuse_dataset.items:
+             data_model = _convert_to_langfuse_format(item)
+             self.test_cases.append(data_model)
+
+         self.session_ids = session_ids
+         self.messages = [otel_parser.parse_session(id) for id in self.session_ids]
+
+         assert (
+             len(self.session_ids) == len(self.messages) == len(self.test_cases)
+         )
+
+         self.metrics = metrics
+         self.aggregator = aggregator
+
+     def evaluate(self):
+         metadata = {"experiment_id": self.experiment_id}
+
+         total_metrics = []
+         for idx, test_case in enumerate(self.test_cases):
+             metric_results = []
+             messages = self.messages[idx]
+             extracted_context = ExtractLabeledMessages.extract(messages, test_case)
+             for metric in self.metrics:
+                 result = metric.evaluate(
+                     messages=messages,
+                     ground_truth=test_case,
+                     extracted_context=extracted_context,
+                     metadata=metadata
+                 )
+                 if isinstance(result, list):
+                     metric_results.extend([r.model_dump() for r in result])
+                     for r in result:
+                         upload(
+                             name=r.eval_name,
+                             session_id=self.session_ids[idx],
+                             value=r.value,
+                             data_type=r.data_type,
+                             metadata=r.metadata,
+                         )
+                 else:
+                     metric_results.append(result.model_dump())
+                     upload(
+                         name=result.eval_name,
+                         session_id=self.session_ids[idx],
+                         value=result.value,
+                         data_type=result.data_type,
+                         metadata=result.metadata,
+                     )
+             total_metrics.append(metric_results)
+
+         aggregate_metrics = self.aggregator(total_metrics)
+         for metric in aggregate_metrics:
+             try:
+                 LANGFUSE_CLIENT.create_score(
+                     name=metric.eval_name,
+                     value=metric.value,
+                     metadata=metric.metadata,
+                     data_type="NUMERIC",
+                     dataset_run_id=metric.metadata["experiment_id"],
+                 )
+             except Exception as e:
+                 rich.print(
+                     f"[r] Uploading {metric.name} with value {metric.value} failed with exception {e}"
+                 )
+
+         return ExperimentResult(
+             experiment_name=self.evaluation_name,
+             run_id=self.run_name,
+             experiment_id=self.experiment_id,
+             metrics=total_metrics,
+             session_ids=self.session_ids
+         )
+
+
+ if __name__ == "__main__":
+     collection_name = "HR AGENT DEMO"
+     langfuse_collection = LangfuseCollection(name=collection_name)
+     journey_sucess_metric = JourneySuccessMetric()
+     tool_calling = ToolCalling()
+
+     SESSION_ID = "agent-demo-session-id-NEW"
+
+     run = EvaluationRunner(
+         evaluation_name="sample_evaluation",
+         run_name="1",
+         session_ids=[
+             "agent-demo-session-id-NEW-0",
+             "agent-demo-session-id-NEW-1",
+         ],
+         collection=langfuse_collection,
+         metrics=[journey_sucess_metric, tool_calling],
+         aggregator=sample_aggregator,
+     )
+
+     experiment_results = run.evaluate()
+     rich.print(experiment_results.model_dump())
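For illustration, a minimal standalone sketch of what the sample_aggregator above computes: it groups per-session metric entries by eval_name and reports the rounded mean of each group. Plain dicts stand in for the package's metric models here, and the helper name average_by_name is invented for this sketch only.

from collections import defaultdict

def average_by_name(session_results):
    # Group metric values by eval_name across all sessions.
    grouped = defaultdict(list)
    for session in session_results:
        for metric in session:
            grouped[metric["eval_name"]].append(metric["value"])
    # Report the rounded mean per metric, mirroring sample_aggregator above.
    return {name: round(sum(vals) / len(vals), 2) for name, vals in grouped.items()}

# Two sessions, each with a journey_success score.
sessions = [
    [{"eval_name": "journey_success", "value": 1.0}],
    [{"eval_name": "journey_success", "value": 0.0}],
]
print(average_by_name(sessions))  # {'journey_success': 0.5}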
wxo_agentic_evaluation/llm_matching.py
@@ -1,9 +1,22 @@
- from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+ """
+ LLM Matching Module with Cosine Similarity Support
+
+ This module provides functionality for matching text using:
+ 1. LLM-based matching (using a language model to determine semantic equivalence)
+ 2. Embedding-based matching (using cosine similarity between text embeddings)
+ """
+
+ import math
+ from typing import List
+
+ from fuzzywuzzy import fuzz
+
  from wxo_agentic_evaluation.prompt.template_render import (
      KeywordMatchingTemplateRenderer,
      SemanticMatchingTemplateRenderer,
  )
- from typing import List
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+ from wxo_agentic_evaluation.utils.utils import safe_divide


  class LLMMatcher:
@@ -12,10 +25,18 @@ class LLMMatcher:
          llm_client: Provider,
          keyword_template: KeywordMatchingTemplateRenderer,
          semantic_template: SemanticMatchingTemplateRenderer,
+         use_llm_for_semantic: bool = True,
+         embedding_model_id: str = "sentence-transformers/all-minilm-l6-v2",
+         similarity_threshold: float = 0.8,
+         enable_fuzzy_matching: bool = False,
      ):
          self.llm_client = llm_client
          self.keyword_template = keyword_template
          self.semantic_template = semantic_template
+         self.embedding_model_id = embedding_model_id
+         self.use_llm_for_semantic = use_llm_for_semantic
+         self.similarity_threshold = similarity_threshold
+         self.enable_fuzzy_matching = enable_fuzzy_matching

      def keywords_match(self, response_text: str, keywords: List[str]) -> bool:
          if len(keywords) == 0:
@@ -26,14 +47,96 @@ class LLMMatcher:
          prompt = self.keyword_template.render(
              keywords_text=keywords_text, response_text=response_text
          )
-         output:str = self.llm_client.query(prompt)
+         output: str = self.llm_client.query(prompt)
          result = output.strip().lower()
          return result.startswith("true")

-     def semantic_match(self, prediction: str, ground_truth: str) -> bool:
+     def generate_embeddings(
+         self, prediction: str, ground_truth: str
+     ) -> List[List[float]]:
+
+         embeddings = self.llm_client.encode([prediction, ground_truth])
+
+         return embeddings
+
+     def compute_cosine_similarity(
+         self, vec1: List[float], vec2: List[float]
+     ) -> float:
+         """Calculate cosine similarity between two vectors using pure Python"""
+
+         # Manual dot product calculation
+         dot_product = sum(a * b for a, b in zip(vec1, vec2))
+
+         # Manual magnitude calculations
+         magnitude1 = math.sqrt(sum(a * a for a in vec1))
+         magnitude2 = math.sqrt(sum(b * b for b in vec2))
+
+         return safe_divide(dot_product, (magnitude1 * magnitude2))
+
+     def cosine_similarity_semantic_match(
+         self, prediction: str, ground_truth: str
+     ) -> bool:
+         embeddings = self.generate_embeddings(prediction, ground_truth)
+         cosine_similarity = self.compute_cosine_similarity(
+             embeddings[0], embeddings[1]
+         )
+
+         return cosine_similarity >= self.similarity_threshold
+
+     def llm_semantic_match(
+         self, context, prediction: str, ground_truth: str
+     ) -> bool:
+         """Performs semantic matching for the agent's final response and the expected response using the starting sentence of the conversation as the context
+
+         Args:
+             context: The starting sentence of the conversation. TODO can also consider using the LLM user's story
+             prediction: the predicted string
+             ground_truth: the expected string
+
+         Returns:
+             a boolean indicating if the sentences match.
+         """
+
          prompt = self.semantic_template.render(
-             expected_text=ground_truth, actual_text=prediction
+             context=context, expected_text=ground_truth, actual_text=prediction
          )
          output: str = self.llm_client.query(prompt)
          result = output.strip().lower()
+
          return result.startswith("true")
+
+     def fuzzywuzzy_semantic_match(
+         self, prediction: str, ground_truth: str
+     ) -> bool:
+
+         similarity_score = fuzz.WRatio(prediction, ground_truth)
+
+         return similarity_score > self.similarity_threshold
+
+     def semantic_match(
+         self,
+         context: str,
+         prediction: str,
+         ground_truth: str,
+         enable_fuzzy_matching: bool = False,
+     ) -> bool:
+         ## TODO arjun-gupta1 10/06/2025: revist retry with exponential backoff. Opted for direct fallback to cosine similarity to avoid latency for now.
+         try:
+             return self.llm_semantic_match(context, prediction, ground_truth)
+         except Exception as e:
+             print(f"LLM semantic match failed: {e}")
+
+         if enable_fuzzy_matching:
+             print("falling back to fuzzy matching")
+         # Fallback to cosine similarity if LLM matching is not used or failed
+         try:
+             return self.cosine_similarity_semantic_match(
+                 prediction, ground_truth
+             )
+         except Exception as e:
+             print(
+                 f"Cosine similarity failed: {e}. Falling back to fuzzywuzzy."
+             )
+
+         # Final fallback to fuzzywuzzy
+         return self.fuzzywuzzy_semantic_match(prediction, ground_truth)
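As a sanity check on the pure-Python cosine similarity used above (dot product divided by the product of the vector magnitudes), here is a self-contained sketch with a small worked example; the standalone function name and the sample vectors are invented for illustration, and the 0.8 cutoff mirrors the default similarity_threshold shown in the diff.

import math

def cosine_similarity(vec1, vec2):
    # dot(v1, v2) / (|v1| * |v2|), as in compute_cosine_similarity above.
    dot = sum(a * b for a, b in zip(vec1, vec2))
    mag1 = math.sqrt(sum(a * a for a in vec1))
    mag2 = math.sqrt(sum(b * b for b in vec2))
    return dot / (mag1 * mag2) if mag1 and mag2 else 0.0

a = [1.0, 2.0, 3.0]
b = [2.0, 4.0, 6.0]   # same direction -> similarity 1.0
c = [3.0, 0.0, -1.0]  # orthogonal to a -> similarity 0.0
print(cosine_similarity(a, b))         # ~1.0
print(cosine_similarity(a, c) >= 0.8)  # False, below the 0.8 threshold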
wxo_agentic_evaluation/llm_rag_eval.py
@@ -1,12 +1,15 @@
- from typing import List
  import json
+ from typing import List

- from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+ from wxo_agentic_evaluation.metrics.llm_as_judge import (
+     AnswerRelevancy,
+     Faithfulness,
+ )
  from wxo_agentic_evaluation.prompt.template_render import (
-     FaithfulnessTemplateRenderer,
      AnswerRelevancyTemplateRenderer,
+     FaithfulnessTemplateRenderer,
  )
- from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness, AnswerRelevancy
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider


  class LLMJudge:
wxo_agentic_evaluation/llm_safety_eval.py
@@ -0,0 +1,64 @@
+ import json
+ import re
+
+ from wxo_agentic_evaluation.metrics.llm_as_judge import (
+     AnswerDerailment,
+     AnswerUnsafeTopic,
+ )
+ from wxo_agentic_evaluation.prompt.template_render import (
+     DerailmentTemplateRenderer,
+     UnsafeTopicTemplateRenderer,
+ )
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+
+
+ class LLMSafetyJudge:
+     def __init__(
+         self,
+         llm_client: Provider,
+         answer_derailment: DerailmentTemplateRenderer,
+         answer_unsafe_topic: UnsafeTopicTemplateRenderer,
+     ):
+         self.llm_client = llm_client
+         self.answer_derailment_template = answer_derailment
+         self.answer_unsafe_topic_template = answer_unsafe_topic
+
+     def judge_derailment_in_answer(
+         self, question: str, instructions: str, answer: str
+     ) -> AnswerDerailment:
+         prompt = self.answer_derailment_template.render(
+             question=question, instructions=instructions, answer=answer
+         )
+         output = self.llm_client.query(prompt)
+         result = output.strip().lower()
+         try:
+             json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
+             thejson = json.loads(json_str)
+             answer_derailment = AnswerDerailment.model_validate(thejson)
+         except Exception:
+             answer_derailment = AnswerDerailment(
+                 in_scope="not sure", statement=answer, reason=result
+             )
+         return answer_derailment
+
+     def judge_unsafe_topic_in_answer(
+         self, question: str, instructions: str, answer: str
+     ) -> AnswerUnsafeTopic:
+         """
+         Use LLM to judge if the answer contains unsafe topics.
+         """
+         prompt = self.answer_unsafe_topic_template.render(
+             question=question, instructions=instructions, answer=answer
+         )
+         output = self.llm_client.query(prompt)
+         result = output.strip().lower()
+         try:
+             json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
+             thejson = json.loads(json_str)
+             answer_unsafe = AnswerUnsafeTopic.model_validate(thejson)
+         except Exception:
+             answer_unsafe = AnswerUnsafeTopic(
+                 is_safe="not sure", statement=answer, reason=result
+             )
+
+         return answer_unsafe
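Both judge methods above share one parsing pattern: grab the first {...} block from the model's reply with re.search(r"\{.*\}", ..., re.DOTALL), feed it to json.loads, and fall back to a "not sure" verdict if that fails. A minimal standalone sketch of that pattern follows; the helper name and the sample reply text are invented for illustration.

import json
import re

def extract_json_verdict(llm_output):
    # Pull the first {...} span out of free-form LLM text, as LLMSafetyJudge does.
    match = re.search(r"\{.*\}", llm_output, re.DOTALL)
    if match is None:
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        return None

reply = 'Sure, here is my verdict: {"in_scope": "yes", "reason": "on topic"}'
print(extract_json_verdict(reply))           # {'in_scope': 'yes', 'reason': 'on topic'}
print(extract_json_verdict("no json here"))  # None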
wxo_agentic_evaluation/llm_user.py
@@ -1,14 +1,16 @@
  from typing import List, TypeVar
- from wxo_agentic_evaluation.type import Message, ContentType
- from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+
+ from wxo_agentic_evaluation.base_user import BaseUserSimulator
  from wxo_agentic_evaluation.prompt.template_render import JinjaTemplateRenderer
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+ from wxo_agentic_evaluation.type import ContentType, Message

  T = TypeVar("T", bound=JinjaTemplateRenderer)


- class LLMUser:
+ class LLMUser(BaseUserSimulator):
      def __init__(
-         self, wai_client: Provider, template: T, user_response_style: List[str]
+         self, wai_client: Provider, template: T, user_response_style: List[str] | None = None
      ):
          self.wai_client = wai_client
          self.prompt_template = template
@@ -17,8 +19,11 @@ class LLMUser:
          )

      def generate_user_input(
-         self, user_story, conversation_history: List[Message]
-     ) -> Message | None:
+         self,
+         user_story,
+         conversation_history: List[Message],
+         attack_instructions: str | None = None,
+     ) -> Message:
          # the tool response is already summarized, we don't need that to take over the chat history context window
          prompt_input = self.prompt_template.render(
              conversation_history=[
@@ -28,6 +33,7 @@
              ],
              user_story=user_story,
              user_response_style=self.user_response_style,
+             attack_instructions=attack_instructions,
          )
          user_input = self.wai_client.query(prompt_input)
          user_input = Message(
wxo_agentic_evaluation/llm_user_v2.py
@@ -0,0 +1,114 @@
+ from typing import List
+
+ from wxo_agentic_evaluation.base_user import BaseUserSimulator
+ from wxo_agentic_evaluation.prompt.template_render import UserTemplateRenderer
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+ from wxo_agentic_evaluation.type import ContentType, Message
+
+
+ class LLMUserV2(BaseUserSimulator):
+     def __init__(
+         self,
+         llm_client: Provider,
+         user_prompt_path: str,
+     ):
+         self.llm_client = llm_client
+         self.user_prompt_path = user_prompt_path
+         self.prompt_template = UserTemplateRenderer(
+             template_path=user_prompt_path
+         )
+
+     def _get_system_prompt(
+         self, user_story: str, user_response_style: List[str] = None
+     ) -> Message:
+         # Get the user system prompt
+         prompt_messages = self.prompt_template.render(
+             user_story=user_story,
+             user_response_style=user_response_style,
+         )
+         return Message(**prompt_messages[0], type=ContentType.text)
+
+     def _get_message_dicts(self, messages: List[Message]) -> List[dict]:
+         # Convert messages to dictionary format for the llm client
+         return [message.model_dump() for message in messages]
+
+     def _filter_conversation_history(
+         self, conversation_history: List[Message]
+     ) -> List[Message]:
+         # Filter out the agent system prompt
+         return [
+             message
+             for message in conversation_history
+             if message.role != "system"
+         ]
+
+     def flip_message_roles(self, messages: List[Message]) -> List[Message]:
+         # We flip the roles of messages in conversation history to basically prompt the
+         # user simulator with the assistant message as the user input message
+         # This helps to get the llm to respond as a natural user with the given story.
+         new_messages = []
+         for message in messages:
+             if message.role == "user":
+                 new_messages.append(
+                     Message(
+                         role="assistant",
+                         content=message.content,
+                         type=ContentType.text,
+                     )
+                 )
+             else:
+                 new_messages.append(
+                     Message(
+                         role="user",
+                         content=message.content,
+                         type=ContentType.text,
+                     )
+                 )
+         return new_messages
+
+     def generate_user_input(
+         self,
+         user_story: str,
+         conversation_history: List[Message],
+         user_response_style: List[str] = None,
+         starting_user_input: Message = None,
+         **kwargs,
+     ) -> Message:
+         # Get the user system prompt
+         system_prompt = self._get_system_prompt(user_story, user_response_style)
+
+         conversation_history = self._filter_conversation_history(
+             conversation_history
+         )
+
+         ## Adding dummy message if not provided from the simulation side.
+         if len(conversation_history) == 0:
+             conversation_history.append(
+                 Message(
+                     role="assistant",
+                     content="Hi! How can I help you today?",
+                     type=ContentType.text,
+                 )
+             )
+
+         conversation_history = self.flip_message_roles(conversation_history)
+
+         # build the conversation history with the system prompt
+         messages = [system_prompt] + conversation_history
+
+         if starting_user_input is not None:
+             # If starting user input is provided, return it as is for the initial turn
+             return starting_user_input
+         else:
+
+             # Get response from LLM for simulation
+             response = self.llm_client.chat(
+                 messages=self._get_message_dicts(messages)
+             )
+             response_message = Message(
+                 role="user",
+                 content=response.choices[0].message.content,
+                 type=ContentType.text,
+             )
+
+             return response_message
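flip_message_roles is the key trick in LLMUserV2: the recorded conversation is replayed to the simulator with user and assistant swapped, so the LLM answers "as the user" given its persona/system prompt. A standalone sketch of the same idea using plain dicts (the package's Message model and helper names are not reproduced here; this function name is invented for illustration):

def flip_roles(messages):
    # Swap user/assistant so the user simulator sees the agent's turns as its input.
    flipped = []
    for m in messages:
        role = "assistant" if m["role"] == "user" else "user"
        flipped.append({"role": role, "content": m["content"]})
    return flipped

history = [
    {"role": "assistant", "content": "Hi! How can I help you today?"},
    {"role": "user", "content": "I want to book time off."},
]
# The simulator would then be prompted with this flipped history plus its own system prompt.
print(flip_roles(history))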