ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (46) hide show
  1. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA +322 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD +46 -0
  3. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL +5 -0
  4. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE +22 -0
  5. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt +1 -0
  6. wxo_agentic_evaluation/__init__.py +0 -0
  7. wxo_agentic_evaluation/analytics/tools/analyzer.py +405 -0
  8. wxo_agentic_evaluation/analytics/tools/main.py +163 -0
  9. wxo_agentic_evaluation/analytics/tools/types.py +130 -0
  10. wxo_agentic_evaluation/analytics/tools/ux.py +428 -0
  11. wxo_agentic_evaluation/analyze_run.py +123 -0
  12. wxo_agentic_evaluation/annotate.py +40 -0
  13. wxo_agentic_evaluation/arg_configs.py +78 -0
  14. wxo_agentic_evaluation/batch_annotate.py +181 -0
  15. wxo_agentic_evaluation/data_annotator.py +253 -0
  16. wxo_agentic_evaluation/evaluation_package.py +518 -0
  17. wxo_agentic_evaluation/external_agent/external_validate.py +69 -0
  18. wxo_agentic_evaluation/external_agent/types.py +65 -0
  19. wxo_agentic_evaluation/inference_backend.py +601 -0
  20. wxo_agentic_evaluation/llm_matching.py +39 -0
  21. wxo_agentic_evaluation/llm_rag_eval.py +47 -0
  22. wxo_agentic_evaluation/llm_user.py +38 -0
  23. wxo_agentic_evaluation/main.py +231 -0
  24. wxo_agentic_evaluation/metrics/__init__.py +0 -0
  25. wxo_agentic_evaluation/metrics/llm_as_judge.py +46 -0
  26. wxo_agentic_evaluation/metrics/metrics.py +101 -0
  27. wxo_agentic_evaluation/prompt/__init__.py +0 -0
  28. wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2 +120 -0
  29. wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +51 -0
  30. wxo_agentic_evaluation/prompt/examples/__init__.py +0 -0
  31. wxo_agentic_evaluation/prompt/examples/data_simple.json +93 -0
  32. wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 +59 -0
  33. wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 +75 -0
  34. wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 +20 -0
  35. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +22 -0
  36. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +114 -0
  37. wxo_agentic_evaluation/prompt/template_render.py +90 -0
  38. wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 +11 -0
  39. wxo_agentic_evaluation/prompt/tool_planner.jinja2 +40 -0
  40. wxo_agentic_evaluation/record_chat.py +165 -0
  41. wxo_agentic_evaluation/service_instance.py +179 -0
  42. wxo_agentic_evaluation/tool_planner.py +228 -0
  43. wxo_agentic_evaluation/type.py +176 -0
  44. wxo_agentic_evaluation/utils/__init__.py +6 -0
  45. wxo_agentic_evaluation/utils/utils.py +233 -0
  46. wxo_agentic_evaluation/watsonx_provider.py +175 -0
@@ -0,0 +1,233 @@
1
+ from urllib.parse import urlparse
2
+ from rich.console import Console, Group
3
+ from rich.table import Table
4
+ from rich.panel import Panel
5
+ from rich.rule import Rule
6
+ from rich import box
7
+ from rich import print
8
+
9
+ from typing import List
10
+
11
+ from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness
12
+ from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary
13
+ from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore
14
+
15
# Shared Rich console used by the table/panel classes and add_line_seperator()
# in this module to render their output.
console = Console()
16
+
17
class AgentMetricsTable:
    """Rich table titled "Agent Metrics" built from a list of row mappings.

    Column headers are taken from the keys of the first row. Every row is
    rendered by looking those headers up; a missing key becomes an empty cell.
    """

    def __init__(self, data):
        self.table = Table(title="Agent Metrics", box=box.ROUNDED, show_lines=True)

        # Nothing to lay out: leave the table without columns or rows.
        if not data:
            return

        column_names = list(data[0].keys())
        for name in column_names:
            self.table.add_column(name, style="cyan")

        for record in data:
            cells = [str(record.get(name, "")) for name in column_names]
            self.table.add_row(*cells)

    def print(self):
        """Render the table on the module-level console."""
        console.print(self.table)
39
+
40
+
41
def create_table(data: List[dict]) -> AgentMetricsTable:
    """
    Build an AgentMetricsTable from a list of dictionaries.

    A single dictionary is accepted as well and treated as a one-row list.
    When the resulting dataset is empty a notice is printed and None is
    returned instead of a table.
    """
    rows = [data] if isinstance(data, dict) else data

    if rows:
        return AgentMetricsTable(rows)

    print("create_table() received an empty dataset. No table generated.")
    return None
54
+
55
+
56
def create_average_row(results_list, include_response_time=True):
    """Build the "Summary (Average)" row for a list of per-dataset result rows.

    Args:
        results_list: list of dicts, each providing the numeric columns
            "Total Step", "Agent Step", "Wrong Function Calls",
            "Wrong Parameters", "Wrong Routing Calls" and
            "Ground Truth Calls", plus "Journey Success" (truthy/falsy) and
            "Text Match" (counted when equal to "Summary Matched").
        include_response_time: when True, "Avg Resp Time (Secs)" is also
            read from every row and averaged over the test cases.

    Returns:
        A dict whose values are rounded to two decimals. Step and
        response-time columns are averaged over the number of test cases;
        the "Wrong ..." columns are divided by the total number of ground
        truth calls. An empty ``results_list`` or a total of zero ground
        truth calls yields 0.0 for the affected columns instead of raising
        ZeroDivisionError.
    """
    # TO-DO: we are hiding some of the columns to allow the table to display
    # properly; need a better solution. Currently hidden: "Tool Call Accuracy",
    # "Tool Call Relevancy", "Agent Routing Accuracy" and "Bad Calls".
    per_test_case_columns = ["Total Step", "Agent Step"]
    if include_response_time:
        per_test_case_columns.append("Avg Resp Time (Secs)")
    per_ground_truth_columns = [
        "Wrong Function Calls",
        "Wrong Parameters",
        "Wrong Routing Calls",
    ]

    def _ratio(numerator, denominator):
        # Guard the division so empty input / zero ground truth calls give 0.0.
        return round(numerator / denominator, 2) if denominator else 0.0

    num_cases = len(results_list)
    ground_truth_calls = sum(item["Ground Truth Calls"] for item in results_list)

    avg_row = {
        col: _ratio(sum(item[col] for item in results_list), num_cases)
        for col in per_test_case_columns
    }
    avg_row.update(
        {
            col: _ratio(sum(item[col] for item in results_list), ground_truth_calls)
            for col in per_ground_truth_columns
        }
    )

    # TODO: also average "WXO Average Response Time (Secs)" once it is
    # available on every row.

    avg_row["Journey Success"] = _ratio(
        sum(1 for item in results_list if item["Journey Success"]), num_cases
    )
    avg_row["Text Match"] = _ratio(
        sum(1 for item in results_list if item["Text Match"] == "Summary Matched"),
        num_cases,
    )
    avg_row["Ground Truth Calls"] = _ratio(ground_truth_calls, num_cases)
    avg_row["Dataset"] = "Summary (Average)"
    return avg_row
109
+
110
+
111
def is_saas_url(service_url: str) -> bool:
    """Return True unless *service_url* points at a local host.

    The hosts treated as local are "localhost", "127.0.0.1", "0.0.0.0" and
    "::1". URLs given without a scheme (e.g. "localhost:4321") are handled
    too. If no hostname can be determined at all the URL is still reported
    as SaaS, as before.
    """
    hostname = urlparse(service_url).hostname
    if hostname is None:
        # urlparse only recognises a network location after "//"; without it
        # "localhost:4321" is read as scheme "localhost" with no hostname.
        hostname = urlparse(f"//{service_url}").hostname
    return hostname not in ("localhost", "127.0.0.1", "0.0.0.0", "::1")
114
+
115
+
116
def is_ibm_cloud_url(service_url: str) -> bool:
    """Return True when the host of *service_url* is under ".cloud.ibm.com".

    The hostname must end with the IBM Cloud domain; containing it somewhere
    in the middle (e.g. "x.cloud.ibm.com.example.net") does not count.
    URLs without a scheme are accepted, and input with no recognisable
    hostname yields False instead of raising TypeError.
    """
    hostname = urlparse(service_url).hostname
    if hostname is None:
        # Retry as a network-location reference so scheme-less input such as
        # "api.us-south.cloud.ibm.com/path" still exposes its host.
        hostname = urlparse(f"//{service_url}").hostname
    if not hostname:
        return False
    # Tolerate a fully-qualified trailing dot ("host.cloud.ibm.com.").
    return hostname.rstrip(".").endswith(".cloud.ibm.com")
119
+
120
+
121
def add_line_seperator():
    """Print a horizontal rule styled "grey42" on the module-level console."""
    console.print(Rule(style="grey42"))
123
+
124
+
125
class FaithfulnessTable:
    """Rich table titled "Faithfulness" with one row per tool call.

    Tool call ids and faithfulness metrics are paired positionally; each
    metric's ``table()`` result supplies the "faithfulness_score",
    "evidence" and "reason" cells.
    """

    def __init__(
        self, faithfulness_metrics: List[Faithfulness], tool_call_ids: List[str]
    ):
        self.table = Table(title="Faithfulness", box=box.ROUNDED, show_lines=True)

        column_styles = (
            ("Tool Call Id", "blue"),
            ("Faithfulness Score", "blue3"),
            ("Evidence", "cyan"),
            ("Reasoning", "yellow3"),
        )
        for heading, colour in column_styles:
            self.table.add_column(heading, style=colour)

        for call_id, metric in zip(tool_call_ids, faithfulness_metrics):
            cells = metric.table()
            self.table.add_row(
                call_id,
                cells["faithfulness_score"],
                cells["evidence"],
                cells["reason"],
            )

    def print(self):
        """Render the table on the module-level console."""
        console.print(self.table)
147
+
148
+
149
class ConversationalSearchTable:
    """Rich table titled "Conversational Search" with one row per tool call.

    Tool call ids and confidence-score objects are paired positionally; each
    score object's ``table()`` result supplies the response/retrieval
    confidence cells and their thresholds.
    """

    def __init__(
        self,
        confidence_scores_list: List[ConversationalConfidenceThresholdScore],
        tool_call_ids: List[str],
    ):
        self.table = Table(
            title="Conversational Search", box=box.ROUNDED, show_lines=True
        )

        self.table.add_column("Tool Call Id", style="blue")
        self.table.add_column("Response Confidence", style="blue3")
        self.table.add_column("Response Confidence Threshold", style="cyan")
        self.table.add_column("Retrieval Confidence", style="blue3")
        self.table.add_column("Retrieval Confidence Threshold", style="cyan")

        for tool_call_id, confidence_scores in zip(
            tool_call_ids, confidence_scores_list
        ):
            cells = confidence_scores.table()
            self.table.add_row(
                tool_call_id,
                cells["response_confidence"],
                cells["response_confidence_threshold"],
                cells["retrieval_confidence"],
                cells["retrieval_confidence_threshold"],
            )

    def print(self):
        """Render the table on the module-level console.

        Provided so this class offers the same ``print()`` method as the
        other table and panel classes in this module.
        """
        console.print(self.table)
176
+
177
+
178
class KnowledgePanel:
    """Panel grouping the faithfulness and conversational-search tables.

    Args:
        dataset_name: name shown in the panel title.
        tool_call_id: tool call ids, paired positionally with the metrics.
        faithfulness: faithfulness metrics; ``None`` (the default) is treated
            as an empty list so the table is rendered without rows.
        confidence_scores: confidence scores; ``None`` (the default) is
            treated as an empty list in the same way.
    """

    def __init__(
        self,
        dataset_name: str,
        tool_call_id: List[str],
        faithfulness: List[Faithfulness] = None,
        confidence_scores: List[ConversationalConfidenceThresholdScore] = None,
    ):
        # The table classes zip() over these arguments, which raises TypeError
        # on None; normalise the documented defaults to empty lists first.
        if faithfulness is None:
            faithfulness = []
        if confidence_scores is None:
            confidence_scores = []

        self.faithfulness = FaithfulnessTable(faithfulness, tool_call_id)
        self.confidence_scores = ConversationalSearchTable(
            confidence_scores, tool_call_id
        )
        self.group = Group(self.faithfulness.table, self.confidence_scores.table)

        # Panel acts as a section
        self.section = Panel(
            self.group,
            title=f"Agent with Knowledge Metrics for {dataset_name}",
            border_style="grey37",
            title_align="left",
        )

    def print(self):
        """Render the panel on the module-level console."""
        console.print(self.section)
202
+
203
+
204
class SummaryPanel:
    """Rich table titled "Agent with Knowledge Summary Metrics".

    One row is added per entry of ``summary_metrics.average``, which is
    iterated as a mapping of dataset name to a metrics mapping.
    """

    def __init__(self, summary_metrics: KnowledgeBaseMetricSummary):
        self.table = Table(
            title="Agent with Knowledge Summary Metrics",
            box=box.ROUNDED,
            show_lines=True,
        )

        column_styles = (
            ("Dataset", "blue3"),
            ("Average Response Confidence", "cyan"),
            ("Average Retrieval Confidence", "blue3"),
            ("Average Faithfulness", "cyan"),
            ("Average Answer Relevancy", "blue3"),
            ("Number Calls to Knowledge Bases", "cyan"),
            ("Knowledge Bases Called", "blue3"),
        )
        for heading, colour in column_styles:
            self.table.add_column(heading, style=colour)

        for dataset, metrics in summary_metrics.average.items():
            # Confidence averages are shown with four decimals; the remaining
            # numeric values are stringified as they are.
            response_confidence = str(round(metrics["average_response_confidence"], 4))
            retrieval_confidence = str(
                round(metrics["average_retrieval_confidence"], 4)
            )
            self.table.add_row(
                dataset,
                response_confidence,
                retrieval_confidence,
                str(metrics["average_faithfulness"]),
                str(metrics["average_answer_relevancy"]),
                str(metrics["number_of_calls"]),
                metrics["knowledge_bases_called"],
            )

    def print(self):
        """Render the table on the module-level console."""
        console.print(self.table)
@@ -0,0 +1,175 @@
1
+ import os
2
+ import requests
3
+ import json
4
+ from types import MappingProxyType
5
+ from typing import List
6
+ import dataclasses
7
+ from ibm_watsonx_ai.foundation_models import ModelInference, Embeddings
8
+ from ibm_watsonx_ai.credentials import Credentials
9
+ from threading import Lock
10
+
11
+
12
# Token endpoint that WatsonXProvider posts the API key to by default in
# order to obtain an access token.
ACCESS_URL = "https://iam.cloud.ibm.com/identity/token"
# Headers sent with the token request (form-encoded body, JSON response).
ACCESS_HEADER = {
    "content-type": "application/x-www-form-urlencoded",
    "accept": "application/json",
}

# API base URLs. PROD_URL is the default `api_endpoint` of WatsonXProvider;
# YPQA_URL is not referenced in this module (presumably a QA environment —
# TODO confirm).
YPQA_URL = "https://yp-qa.ml.cloud.ibm.com"
PROD_URL = "https://us-south.ml.cloud.ibm.com"
# Read-only default decoding parameters; WatsonXProvider copies them into a
# plain dict before handing them to the model client.
DEFAULT_PARAM = MappingProxyType(
    {"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 100}
)
23
+
24
+
25
class WatsonXProvider:
    """Client for text generation and/or embeddings with token-based auth.

    An access token is requested from ``url`` with the API key and used to
    build a ``ModelInference`` client (when ``model_id`` is given) and an
    ``Embeddings`` client (when ``embedding_model_id`` is given). When a call
    fails with an "authentication_token_expired" error the token and clients
    are rebuilt and the original error is re-raised, so the caller can retry.

    Args:
        model_id: id of the text generation model, or None.
        api_key: API key; the WATSONX_APIKEY environment variable takes
            precedence when set.
        space_id: deployment space id; the WATSONX_SPACE_ID environment
            variable takes precedence when set.
        api_endpoint: base URL passed to the client credentials.
        url: token endpoint.
        timeout: timeout in seconds for the token request.
        llm_decode_parameter: decoding parameters as a mapping or dataclass
            instance; converted to a plain dict.
        embedding_model_id: id of the embedding model, or None.

    Raises:
        ValueError: if neither model id is given, or the API key or space id
            cannot be resolved.
        RuntimeError: if the token endpoint does not answer with HTTP 200.
    """

    def __init__(
        self,
        model_id=None,
        api_key=None,
        space_id=None,
        api_endpoint=PROD_URL,
        url=ACCESS_URL,
        timeout=60,
        llm_decode_parameter=DEFAULT_PARAM,
        embedding_model_id=None,
    ):
        super().__init__()
        self.url = url
        if (embedding_model_id is None) and (model_id is None):
            raise ValueError("either model_id or embedding_model_id must be specified")
        self.model_id = model_id
        # The environment variable wins over the explicit argument.
        api_key = os.environ.get("WATSONX_APIKEY", api_key)
        if not api_key:
            raise ValueError("apikey must be specified")
        self.api_key = api_key
        self.access_data = {
            "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
            "apikey": self.api_key,
        }
        self.api_endpoint = api_endpoint
        space_id = os.environ.get("WATSONX_SPACE_ID", space_id)
        if not space_id:
            raise ValueError("space id must be specified")
        self.space_id = space_id
        self.timeout = timeout
        self.embedding_model_id = embedding_model_id
        # Serialises token refreshes triggered from concurrent failing calls.
        self.lock = Lock()

        if isinstance(llm_decode_parameter, MappingProxyType):
            llm_decode_parameter = dict(llm_decode_parameter)
        if dataclasses.is_dataclass(llm_decode_parameter):
            llm_decode_parameter = dataclasses.asdict(llm_decode_parameter)

        self.decode_param = llm_decode_parameter
        self._refresh_token()

    def _get_access_token(self):
        """Request a new access token and return it.

        Raises:
            RuntimeError: if the endpoint responds with a status other
                than 200.
        """
        response = requests.post(
            self.url, headers=ACCESS_HEADER, data=self.access_data, timeout=self.timeout
        )
        if response.status_code != 200:
            raise RuntimeError(
                f"try to acquire access token and get {response.status_code}"
            )
        return json.loads(response.text)["access_token"]

    def _refresh_token(self):
        """Fetch a fresh token and rebuild the configured model clients."""
        self.access_token = self._get_access_token()

        if self.embedding_model_id is not None:
            self.embedding_client = Embeddings(
                model_id=self.embedding_model_id,
                credentials=Credentials(token=self.access_token, url=self.api_endpoint),
                space_id=self.space_id,
            )
        else:
            self.embedding_client = None

        if self.model_id is not None:
            self.client = ModelInference(
                model_id=self.model_id,
                params=self.decode_param,
                credentials=Credentials(token=self.access_token, url=self.api_endpoint),
                space_id=self.space_id,
            )
        else:
            self.client = None

    def _refresh_if_token_expired(self, error):
        """Rebuild token and clients when *error* reports an expired token."""
        with self.lock:
            if "authentication_token_expired" in str(error):
                self._refresh_token()

    def query(self, sentence: str) -> dict:
        """Generate text for one prompt and return its first result entry.

        Raises:
            ValueError: if no text generation model was configured.
            Exception: whatever the client raised; on token expiry the token
                is refreshed before re-raising.
        """
        if self.model_id is None:
            raise ValueError("model id must be specified for text generation")
        try:
            responses = self.client.generate([sentence])
            return responses[0]["results"][0]  # pylint: disable=E1136
        except Exception as e:
            self._refresh_if_token_expired(e)
            raise

    def batch_query(self, sentences: List[str]) -> List[dict]:
        """Generate text for several prompts; one result entry per prompt.

        Raises:
            ValueError: if no text generation model was configured.
            Exception: whatever the client raised; on token expiry the token
                is refreshed before re-raising.
        """
        if self.model_id is None:
            raise ValueError("model id must be specified for text generation")
        try:
            outputs = self.client.generate(sentences)
            # pylint: disable=E1133
            return [output["results"][0] for output in outputs]
        except Exception as e:
            self._refresh_if_token_expired(e)
            raise

    def encode(self, sentences: List[str]) -> List[list]:
        """Return one embedding per input sentence.

        Raises:
            ValueError: if no embedding model was configured.
            Exception: whatever the client raised; on token expiry the token
                is refreshed before re-raising, matching query/batch_query so
                an embedding-only provider can recover after expiry.
        """
        if self.embedding_model_id is None:
            raise ValueError("embedding model id must be specified for text encoding")
        try:
            output = self.embedding_client.generate(sentences)
        except Exception as e:
            self._refresh_if_token_expired(e)
            raise
        return [entry["embedding"] for entry in output["results"]]
138
+
139
+
140
# Manual smoke test: builds a provider for a hard-coded model id and prints
# the raw result of a single query. WatsonXProvider reads WATSONX_APIKEY and
# WATSONX_SPACE_ID from the environment since neither is passed here.
if __name__ == "__main__":
    import os

    provider = WatsonXProvider(model_id="meta-llama/llama-3-2-90b-vision-instruct")

    # Chat-formatted prompt sent to the model verbatim.
    # NOTE(review): the "{% endfor -%}" line looks like a leftover template
    # tag; nothing in this block renders it, so it reaches the model as-is.
    prompt = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>


Your username is nwaters and you want to find out timeoff schedule for yourself for 20250101 to 20250303
<|eot_id|><|start_header_id|>user<|end_header_id|>


Reminder:
- try to respond only once per input
- if you get everything you need. respond with END

<|eot_id|>
<|start_header_id|>user<|end_header_id|>
my username is nwaters. what's my timeoff schedule?<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
we need to call get_assignment_id for username nwaters. do you want to make the function call? yes/no<|eot_id|>
<|start_header_id|>user<|end_header_id|>
yes<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
ok, i have your assignment id. what's the start and end date?<|eot_id|>
<|start_header_id|>user<|end_header_id|>
start and end is 20250101 to 20250303<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Usernwaters did not take anytime off during the period<|eot_id|>

{% endfor -%}
<|eot_id|><|start_header_id|>user<|end_header_id|>
"""

    print(provider.query(prompt))