ibm-watsonx-orchestrate-evaluation-framework 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (41) hide show
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/METADATA +70 -7
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info/RECORD +56 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +3 -3
  4. wxo_agentic_evaluation/analytics/tools/ux.py +1 -1
  5. wxo_agentic_evaluation/analyze_run.py +10 -10
  6. wxo_agentic_evaluation/arg_configs.py +8 -1
  7. wxo_agentic_evaluation/batch_annotate.py +3 -9
  8. wxo_agentic_evaluation/data_annotator.py +50 -36
  9. wxo_agentic_evaluation/evaluation_package.py +102 -85
  10. wxo_agentic_evaluation/external_agent/__init__.py +37 -0
  11. wxo_agentic_evaluation/external_agent/external_validate.py +74 -29
  12. wxo_agentic_evaluation/external_agent/performance_test.py +66 -0
  13. wxo_agentic_evaluation/external_agent/types.py +8 -2
  14. wxo_agentic_evaluation/inference_backend.py +45 -50
  15. wxo_agentic_evaluation/llm_matching.py +6 -6
  16. wxo_agentic_evaluation/llm_rag_eval.py +4 -4
  17. wxo_agentic_evaluation/llm_user.py +3 -3
  18. wxo_agentic_evaluation/main.py +63 -23
  19. wxo_agentic_evaluation/metrics/metrics.py +59 -0
  20. wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2 +23 -0
  21. wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +2 -0
  22. wxo_agentic_evaluation/prompt/examples/data_simple.json +1 -2
  23. wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2 +195 -0
  24. wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2 +154 -0
  25. wxo_agentic_evaluation/prompt/template_render.py +17 -0
  26. wxo_agentic_evaluation/prompt/tool_planner.jinja2 +13 -7
  27. wxo_agentic_evaluation/record_chat.py +74 -26
  28. wxo_agentic_evaluation/resource_map.py +47 -0
  29. wxo_agentic_evaluation/service_provider/__init__.py +35 -0
  30. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +108 -0
  31. wxo_agentic_evaluation/service_provider/ollama_provider.py +40 -0
  32. wxo_agentic_evaluation/service_provider/provider.py +19 -0
  33. wxo_agentic_evaluation/{watsonx_provider.py → service_provider/watsonx_provider.py} +27 -18
  34. wxo_agentic_evaluation/test_prompt.py +94 -0
  35. wxo_agentic_evaluation/tool_planner.py +130 -17
  36. wxo_agentic_evaluation/type.py +0 -57
  37. wxo_agentic_evaluation/utils/utils.py +6 -54
  38. ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/RECORD +0 -46
  39. ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/licenses/LICENSE +0 -22
  40. {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/WHEEL +0 -0
  41. {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,195 @@
1
+ <|begin_of_text|><|start_header_id|>system<|end_header_id|>
2
+ Your task is to generate a starting sentence given a user story.
3
+
4
+ ## Generation Guidelines
5
+ The starting sentence is the first question a user would ask in order to achieve the provided story.
6
+ The starting sentence shouldn't be the same as the story. The story is the objective of the conversation, while the starting sentence is the initial question a user would ask in order to achieve that objective.
7
+ The generated starting sentence should be concise, no longer than one sentence, and in the first person.
8
+
9
+ ## Output:
10
+ Respond in a JSON format with the following fields:
11
+ - starting_sentence: this is the generated starting sentence
12
+
13
+ This is an example of a valid JSON output
14
+
15
+ {
16
+ "starting_sentence": "placeholder text"
17
+ }
18
+
19
+ --
20
+
21
+ <|start_header_id|>user<|end_header_id|>
22
+ Input:
23
+ {
24
+ "agent": "workday_employee_support_manager",
25
+ "goals": {
26
+ "get_user_workday_ids": [
27
+ "get_payslips"
28
+ ],
29
+ "get_payslips": [
30
+ "summarize"
31
+ ]
32
+ },
33
+ "goal_details": [
34
+ {
35
+ "type": "tool_call",
36
+ "name": "get_user_workday_ids",
37
+ "tool_name": "get_user_workday_ids",
38
+ "args": {
39
+ "email": "abrennan@workday.net"
40
+ }
41
+ },
42
+ {
43
+ "type": "tool_call",
44
+ "name": "get_payslips",
45
+ "tool_name": "get_payslips",
46
+ "args": {
47
+ "user_id": "6dcb8106e8b74b5aabb1fc3ab8ef2b92"
48
+ }
49
+ },
50
+ {
51
+ "type": "text",
52
+ "name": "summarize",
53
+ "response": "No payslips found for user with ID 6dcb8106e8b74b5aabb1fc3ab8ef2b92.",
54
+ "keywords": [
55
+ "payslips"
56
+ ]
57
+ }
58
+ ],
59
+ "story": "Your email id is abrennan@workday.net. You want to get your payslips",
60
+ }
61
+ <|eot_id|>
62
+
63
+ <|start_header_id|>assistant<|end_header_id|>
64
+ {"starting_sentence": "I work at Workday, I want to get my payslips. My email id is abrennan@workday.net"}
65
+ <|eot_id|>
66
+
67
+
68
+ <|start_header_id|>user<|end_header_id|>
69
+ Input:
70
+ {
71
+ "agent": "servicenow_asset_management_agent",
72
+ "goals": {
73
+ "get_assets": [
74
+ "update_an_asset"
75
+ ],
76
+ "update_an_asset": [
77
+ "summarize"
78
+ ]
79
+ },
80
+ "goal_details": [
81
+ {
82
+ "type": "tool_call",
83
+ "name": "get_assets",
84
+ "tool_name": "get_assets",
85
+ "args": {}
86
+ },
87
+ {
88
+ "type": "tool_call",
89
+ "name": "update_an_asset",
90
+ "tool_name": "update_an_asset",
91
+ "args": {
92
+ "args": {
93
+ "current_name": "A100",
94
+ "new_name": "Macbook"
95
+ }
96
+ }
97
+ },
98
+ {
99
+ "name": "summarize",
100
+ "type": "text",
101
+ "response": "The asset A100 has been updated to Macbook.",
102
+ "keywords": [
103
+ "A100",
104
+ "Macbook"
105
+ ]
106
+ }
107
+ ],
108
+ "story": "You want to update an asset. The current asset name is A100. You want to update it to Macbook.",
109
+ }
110
+ <|eot_id|>
111
+ <|start_header_id|>assistant<|end_header_id|>
112
+ {"starting_sentence": "I want to update an asset"}
113
+ <|eot_id|>
114
+
115
+ <|start_header_id|>user<|end_header_id|>
116
+ Input:
117
+ {
118
+ "agent": "sales_research",
119
+ "goals": {
120
+ "search_company_by_typeahead": [
121
+ "get_news_and_media"
122
+ ],
123
+ "get_news_and_media": [
124
+ "summarize"
125
+ ]
126
+ },
127
+ "goal_details": [
128
+ {
129
+ "type": "tool_call",
130
+ "name": "search_company_by_typeahead",
131
+ "tool_name": "search_company_by_typeahead",
132
+ "args": {
133
+ "query": "tech"
134
+ }
135
+ },
136
+ {
137
+ "type": "tool_call",
138
+ "name": "get_news_and_media",
139
+ "tool_name": "get_news_and_media",
140
+ "args": {
141
+ "entity_name": "TechNova"
142
+ }
143
+ },
144
+ {
145
+ "name": "summarize",
146
+ "type": "text",
147
+ "response": "Here is the latest news on the tech company:\n\n| Company Name | Headline | Source | Date | Summary |\n| --- | --- | --- | --- | --- |\n| TechNova | TechNova Launches AI-driven Sales Platform | TechCrunch | 2025-06-15 | TechNova announced its new AI platform aimed at transforming enterprise sales operations. |",
148
+ "keywords": [
149
+ "TechNova",
150
+ "AI-driven Sales Platform",
151
+ "TechCrunch",
152
+ "2025-06-15"
153
+ ]
154
+ }
155
+ ],
156
+ "story": "You need the latest news on a company. The company name is TechNova.",
157
+ }
158
+ <|eot_id|>
159
+ <|start_header_id|>assistant<|end_header_id|>
160
+ {"starting_sentence": "I need the latest news on a company"}
161
+ <|eot_id|>
162
+
163
+ <|start_header_id|>user<|end_header_id|>
164
+ Input:
165
+ {
166
+ "agent": "question_answer_agent",
167
+ "goals": {
168
+ "summarize": []
169
+ },
170
+ "goal_details": [
171
+ {
172
+ "name": "summarize",
173
+ "type": "text",
174
+ "response": "The current PM of England is Keir Starmer",
175
+ "keywords": [
176
+ "Keir Starmer"
177
+ ]
178
+ }
179
+ ],
180
+ "story": "You want to know who the PM of England is.",
181
+ }
182
+ <|eot_id|>
183
+
184
+ <|start_header_id|>assistant<|end_header_id|>
185
+ {"starting_sentence": "Who is the PM of England?"}
186
+ <|eot_id|>
187
+
188
+ ---
189
+
190
+ <|start_header_id|>assistant<|end_header_id|>
191
+ # Now, evaluate the following:
192
+ Input:
193
+ {{ input_data }}
194
+ Starting sentence:
195
+
@@ -0,0 +1,154 @@
1
+ <|begin_of_text|><|start_header_id|>system<|end_header_id|>
2
+ Your task is to generate a user story based on the provided information, focusing solely on the user's end goal and the essential context. Write a clear, concise narrative that:
3
+
4
+ - Centers on the user's primary objective and the key actions they take to achieve it.
5
+ - Includes only information directly relevant to understanding the user's goals and context.
6
+ - Excludes intermediate steps or process details that do not directly impact the user's outcome.
7
+ - Avoids repetition and omits any duplicate or unnecessary details.
8
+ - Captures the essence of the user's intent and required actions without commentary or extraneous explanation.
9
+
10
+ The final story should be brief, direct, and goal-oriented.
11
+ <|eot_id|>
12
+
13
+ <|start_header_id|>user<|end_header_id|>
14
+ Here are a few examples of how to generate a user story based on the provided input data:
15
+ # Example 1:
16
+ Input data:
17
+ {
18
+ "agent": "workday_employee_support_manager",
19
+ "goals": {
20
+ "get_user_workday_ids": [
21
+ "get_payslips"
22
+ ],
23
+ "get_payslips": [
24
+ "summarize"
25
+ ]
26
+ },
27
+ "goal_details": [
28
+ {
29
+ "type": "tool_call",
30
+ "name": "get_user_workday_ids",
31
+ "tool_name": "get_user_workday_ids",
32
+ "args": {
33
+ "email": "abrennan@workday.net"
34
+ }
35
+ },
36
+ {
37
+ "type": "tool_call",
38
+ "name": "get_payslips",
39
+ "tool_name": "get_payslips",
40
+ "args": {
41
+ "user_id": "6dcb8106e8b74b5aabb1fc3ab8ef2b92"
42
+ }
43
+ },
44
+ {
45
+ "type": "text",
46
+ "name": "summarize",
47
+ "response": "No payslips found for user with ID 6dcb8106e8b74b5aabb1fc3ab8ef2b92.",
48
+ "keywords": [
49
+ "payslips"
50
+ ]
51
+ }
52
+ ],
53
+ "story": "",
54
+ "starting_sentence": "I work at Workday, I want to get my payslips. My email id is abrennan@workday.net"
55
+ }
56
+ Story: Your email id is abrennan@workday.net. You want to get your payslips
57
+
58
+ # Example 2:
59
+ Input data:
60
+ {
61
+ "agent": "servicenow_asset_management_agent",
62
+ "goals": {
63
+ "get_assets": [
64
+ "update_an_asset"
65
+ ],
66
+ "update_an_asset": [
67
+ "summarize"
68
+ ]
69
+ },
70
+ "goal_details": [
71
+ {
72
+ "type": "tool_call",
73
+ "name": "get_assets",
74
+ "tool_name": "get_assets",
75
+ "args": {}
76
+ },
77
+ {
78
+ "type": "tool_call",
79
+ "name": "update_an_asset",
80
+ "tool_name": "update_an_asset",
81
+ "args": {
82
+ "args": {
83
+ "current_name": "A100",
84
+ "new_name": "Macbook"
85
+ }
86
+ }
87
+ },
88
+ {
89
+ "name": "summarize",
90
+ "type": "text",
91
+ "response": "The asset A100 has been updated to Macbook.",
92
+ "keywords": [
93
+ "A100",
94
+ "Macbook"
95
+ ]
96
+ }
97
+ ],
98
+ "story": "",
99
+ "starting_sentence": "I want to update an asset"
100
+ }
101
+ Story: You want to update an asset. The current asset name is A100. You want to update it to Macbook.
102
+
103
+ # Example 3:
104
+ Input data:
105
+ {
106
+ "agent": "sales_research",
107
+ "goals": {
108
+ "search_company_by_typeahead": [
109
+ "get_news_and_media"
110
+ ],
111
+ "get_news_and_media": [
112
+ "summarize"
113
+ ]
114
+ },
115
+ "goal_details": [
116
+ {
117
+ "type": "tool_call",
118
+ "name": "search_company_by_typeahead",
119
+ "tool_name": "search_company_by_typeahead",
120
+ "args": {
121
+ "query": "tech"
122
+ }
123
+ },
124
+ {
125
+ "type": "tool_call",
126
+ "name": "get_news_and_media",
127
+ "tool_name": "get_news_and_media",
128
+ "args": {
129
+ "entity_name": "TechNova"
130
+ }
131
+ },
132
+ {
133
+ "name": "summarize",
134
+ "type": "text",
135
+ "response": "Here is the latest news on the tech company:\n\n| Company Name | Headline | Source | Date | Summary |\n| --- | --- | --- | --- | --- |\n| TechNova | TechNova Launches AI-driven Sales Platform | TechCrunch | 2025-06-15 | TechNova announced its new AI platform aimed at transforming enterprise sales operations. |",
136
+ "keywords": [
137
+ "TechNova",
138
+ "AI-driven Sales Platform",
139
+ "TechCrunch",
140
+ "2025-06-15"
141
+ ]
142
+ }
143
+ ],
144
+ "story": "",
145
+ "starting_sentence": "I need the latest news on a company"
146
+ }
147
+ Story: You need the latest news on a company. The company name is TechNova.
148
+
149
+ <|eot_id|>
150
+ <|start_header_id|>assistant<|end_header_id|>
151
+ # Now, evaluate the following:
152
+ Input data:
153
+ {{ input_data }}
154
+ Story:
@@ -61,6 +61,14 @@ class ToolPlannerTemplateRenderer(JinjaTemplateRenderer):
61
61
  agent_name=agent_name,
62
62
  available_tools=available_tools,
63
63
  )
64
+
65
class ArgsExtractorTemplateRenderer(JinjaTemplateRenderer):
    """Renderer for the argument-extraction prompt template."""

    def render(self, tool_signature: str, step: dict, inputs: dict) -> str:
        """Render the template with the tool signature, step and inputs.

        The three values are forwarded unchanged, as keyword arguments of
        the same names, to ``JinjaTemplateRenderer.render``.
        """
        template_context = {
            "tool_signature": tool_signature,
            "step": step,
            "inputs": inputs,
        }
        return super().render(**template_context)
64
72
 
65
73
  class ToolChainAgentTemplateRenderer(JinjaTemplateRenderer):
66
74
  def render(self, tool_call_history: List, available_tools:str) -> str:
@@ -88,3 +96,12 @@ class BatchTestCaseGeneratorTemplateRenderer(JinjaTemplateRenderer):
88
96
  num_variants=num_variants,
89
97
  example_str=example_str,
90
98
  )
99
+
100
class StoryGenerationTemplateRenderer(JinjaTemplateRenderer):
    """Renderer for the story-generation prompt template."""

    def render(self, input_data: dict) -> str:
        """Render the template with ``input_data`` bound to ``input_data``.

        NOTE(review): ``generate_story`` in record_chat.py passes the output
        of ``json.dumps`` (a string) here, so the ``dict`` annotation looks
        narrower than actual usage -- confirm before tightening it.
        """
        template_context = {"input_data": input_data}
        return super().render(**template_context)
@@ -1,7 +1,7 @@
1
1
  <|begin_of_text|><|start_header_id|>system<|end_header_id|>
2
2
  You are a tool-planning assistant for an AI system.
3
3
 
4
- Your job is to extract and sequence tool calls based on user stories and available tools.
4
+ Your job is to generate a sequence of tool calls based on user stories and available tools.
5
5
 
6
6
  Rules:
7
7
  - Use only the tools listed below.
@@ -11,12 +11,16 @@ Rules:
11
11
  - Output ONLY one valid JSON array.
12
12
  - DO NOT include extra text or wrap the output. Just return the JSON list.
13
13
 
14
- Available Tools:
15
- {{ available_tools }}
14
+ <|eot_id|>
15
+
16
+ <|start_header_id|>user<|end_header_id|>
17
+ Here are a few examples of how to generate a sequence of tool calls based on user stories and available tools.
16
18
 
17
19
  Example:
18
20
  Story: "Your username is nwaters. You want to find out your time-off schedule from: 2025-01-01 to: 2025-12-31."
19
-
21
+ Available Tools:
22
+ [{'Function Name': '_is_valid_date', 'Arguments': ['date_str'], 'Docstring': 'Check if the provided date string is in YYYY-MM-DD format.'}, {'Function Name': 'fetch_assignment_id', 'Arguments': ['username'], 'Docstring': "Return the assignment ID for a given employee username.\n\n:param username: Employee's username\n:return: Assignment ID as a string or 'not found'"}, {'Function Name': 'retrieve_timeoff_schedule', 'Arguments': ['assignment_id', 'start_date', 'end_date'], 'Docstring': 'Get time-off schedule for an employee within a given date range.\n\n:param assignment_id: Assignment ID of the employee\n:param start_date: Start date in YYYY-MM-DD format\n:param end_date: End date in YYYY-MM-DD format\n:return: JSON list of time-off dates or error message'}, {'Function Name': 'list_direct_reports', 'Arguments': ['manager_assignment_id'], 'Docstring': "Retrieve the list of direct report Employee's username for a specified manager's assignment ID.\n\n:param manager_assignment_id: Assignment ID of the manager as a string\n:return: JSON-encoded list of Employee's username who report to the manager"}, {'Function Name': 'get_address_type', 'Arguments': ['address_type_name'], 'Docstring': 'Retrieve a string address type ID based on a given address type name.\n\n:param address_type_name: Address type as a string\n:return: Corresponding string address_type_id or -1 if not found'}, {'Function Name': 'update_address', 'Arguments': ['address_type_id', 'assignment_id', 'new_address'], 'Docstring': "Update the address for an employee based on assignment ID and address type.\n\n:param address_type_id: String representing the address type\n:param assignment_id: Employee's assignment ID\n:param new_address: New address string\n:return: Success or error message"}]
23
+ Result:
20
24
  [
21
25
  {
22
26
  "tool_name": "fetch_assignment_id",
@@ -34,7 +38,9 @@ Story: "Your username is nwaters. You want to find out your time-off schedule fr
34
38
  }
35
39
  ]
36
40
  <|eot_id|>
37
- <|start_header_id|>user<|end_header_id|>
38
- Story: "{{ user_story }}"
39
- <|eot_id|>
40
41
  <|start_header_id|>assistant<|end_header_id|>
42
+ # Now, generate results for the following user story and available tools:
43
+ Story: "{{ user_story }}"
44
+ Available Tools:
45
+ {{ available_tools }}
46
+ Result:
@@ -11,19 +11,25 @@ from wxo_agentic_evaluation.inference_backend import (
11
11
  from wxo_agentic_evaluation.data_annotator import DataAnnotator
12
12
  from wxo_agentic_evaluation.utils.utils import is_saas_url
13
13
  from wxo_agentic_evaluation.service_instance import tenant_setup
14
+ from wxo_agentic_evaluation.prompt.template_render import StoryGenerationTemplateRenderer
15
+ from wxo_agentic_evaluation.service_provider import get_provider
16
+ from wxo_agentic_evaluation import __file__
14
17
 
15
18
  import json
16
19
  import os
17
20
  import rich
18
21
  from datetime import datetime
19
22
  import time
20
- from typing import List
23
+ from typing import List, Dict
24
+ import hashlib
21
25
  from jsonargparse import CLI
22
26
  import warnings
23
27
 
24
28
  warnings.filterwarnings("ignore", category=DeprecationWarning)
25
29
  warnings.filterwarnings("ignore", category=FutureWarning)
26
30
 
31
+ root_dir = os.path.dirname(__file__)
32
+ STORY_GENERATION_PROMPT_PATH = os.path.join(root_dir, "prompt", "story_generation_prompt.jinja2")
27
33
 
28
34
  def get_all_runs(wxo_client: WXOClient):
29
35
  limit = 20 # Maximum allowed limit per request
@@ -31,9 +37,11 @@ def get_all_runs(wxo_client: WXOClient):
31
37
  all_runs = []
32
38
 
33
39
  if is_saas_url(wxo_client.service_url):
34
- path = "v1//orchestrate/runs"
40
+ # TO-DO: this is not validated after the v1 prefix change
41
+ # need additional validation
42
+ path = "v1/orchestrate/runs"
35
43
  else:
36
- path = "/orchestrate/runs"
44
+ path = "v1/orchestrate/runs"
37
45
 
38
46
  initial_response = wxo_client.get(
39
47
  path, {"limit": limit, "offset": 0}
@@ -62,34 +70,60 @@ def get_all_runs(wxo_client: WXOClient):
62
70
  return all_runs
63
71
 
64
72
 
65
- def pull_messages_from_thread_id(thread_id: str, wxo_client: WXOClient):
66
- inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
67
- messages = inference_backend.get_messages(thread_id)
68
- return messages
73
def generate_story(annotated_data: dict):
    """Generate a user story for ``annotated_data`` with an LLM.

    The annotated data is serialized as indented JSON, substituted into the
    story-generation prompt template, and sent to the provider returned by
    ``get_provider``.

    Returns:
        The provider's response with surrounding whitespace stripped.
    """
    story_renderer = StoryGenerationTemplateRenderer(STORY_GENERATION_PROMPT_PATH)
    generation_params = {
        "min_new_tokens": 0,
        "decoding_method": "greedy",
        "max_new_tokens": 256,
    }
    llm = get_provider(
        model_id="meta-llama/llama-3-405b-instruct",
        params=generation_params,
    )
    serialized_input = json.dumps(annotated_data, indent=2)
    story_prompt = story_renderer.render(input_data=serialized_input)
    return llm.query(story_prompt).strip()
69
82
 
70
83
 
71
84
  def annotate_messages(
72
- messages: List[Message], keywords_generation_config: KeywordsGenerationConfig
85
+ agent_name: str, messages: List[Message], keywords_generation_config: KeywordsGenerationConfig
73
86
  ):
74
87
  annotator = DataAnnotator(
75
88
  messages=messages, keywords_generation_config=keywords_generation_config
76
89
  )
77
- return annotator.generate()
90
+ annotated_data = annotator.generate()
91
+ if agent_name is not None:
92
+ annotated_data["agent"] = agent_name
93
+
94
+ annotated_data["story"] = generate_story(annotated_data)
95
+
96
+ return annotated_data
97
+
98
def has_messages_changed(
    thread_id: str,
    messages: List[Message],
    previous_hashes: Dict[str, str],
) -> bool:
    """Report whether a thread's messages differ from the last recorded digest.

    A SHA-256 digest is computed over the key-sorted JSON dump of every
    message's ``model_dump()``; values JSON cannot encode natively are
    stringified via ``default=str``.

    Args:
        thread_id: Key under which the digest is stored.
        messages: Messages currently in the thread.
        previous_hashes: Mapping of thread id to last digest. It is updated
            in place whenever a new digest is recorded.

    Returns:
        True if no digest was stored for the thread or the digest changed,
        False if the digest equals the stored one.
    """
    serialized = json.dumps(
        [message.model_dump() for message in messages],
        sort_keys=True,
        default=str,
    )
    digest = hashlib.sha256(serialized.encode()).hexdigest()

    if previous_hashes.get(thread_id) == digest:
        return False

    previous_hashes[thread_id] = digest
    return True
78
112
 
79
113
 
80
114
  def record_chats(config: ChatRecordingConfig):
81
115
  """Record chats in background mode"""
82
116
  start_time = datetime.utcnow()
83
117
  processed_threads = set()
118
+ previous_input_hash: dict[str, str] = {}
84
119
 
85
120
  rich.print(
86
121
  f"[green]INFO:[/green] Starting chat recording at {start_time}. Press Ctrl+C to stop."
87
122
  )
88
123
  if config.token is None:
89
- token = tenant_setup(config.service_url, config.tenant_name)
90
- else:
91
- token = config.token
92
- wxo_client = get_wxo_client(config.service_url, token)
124
+ config.token = tenant_setup(config.service_url, config.tenant_name)
125
+ wxo_client = get_wxo_client(config.service_url, config.tenant_name, config.token)
126
+ inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
93
127
  try:
94
128
  while True:
95
129
  all_runs = get_all_runs(wxo_client)
@@ -98,7 +132,12 @@ def record_chats(config: ChatRecordingConfig):
98
132
  # Process only new runs that started after our recording began
99
133
  for run in all_runs:
100
134
  thread_id = run.get("thread_id")
101
- if thread_id in seen_threads:
135
+ try:
136
+ agent_name = inference_backend.get_agent_name_from_thread_id(thread_id)
137
+ except Exception as e:
138
+ rich.print(f"[yellow]WARNING:[/yellow]Failure in getting thread id {thread_id}")
139
+ continue
140
+ if thread_id in seen_threads or agent_name is None:
102
141
  continue
103
142
  seen_threads.add(thread_id)
104
143
  started_at = run.get("started_at")
@@ -119,33 +158,42 @@ def record_chats(config: ChatRecordingConfig):
119
158
  rich.print(
120
159
  f"[green]INFO:[/green] Messages saved to: {os.path.join(config.output_dir, f'{thread_id}_messages.json')}"
121
160
  )
122
- rich.print(
123
- f"[green]INFO:[/green] Annotations saved to: {os.path.join(config.output_dir, f'{thread_id}_annotated_data.json')}"
124
- )
161
+ # rich.print(
162
+ # f"[green]INFO:[/green] Annotations saved to: {os.path.join(config.output_dir, f'{thread_id}_annotated_data.json')}"
163
+ # )
125
164
  processed_threads.add(thread_id)
126
165
 
127
166
  try:
128
- messages = pull_messages_from_thread_id(
129
- thread_id, wxo_client
130
- )
167
+ messages = inference_backend.get_messages(thread_id)
168
+
169
+ if not has_messages_changed(
170
+ thread_id,
171
+ messages,
172
+ previous_input_hash,
173
+ ):
174
+ continue
175
+
131
176
  annotated_data = annotate_messages(
132
- messages, config.keywords_generation_config
177
+ agent_name, messages, config.keywords_generation_config
133
178
  )
134
179
 
135
180
  messages_filename = os.path.join(
136
181
  config.output_dir, f"{thread_id}_messages.json"
137
182
  )
138
- annotation_filename = os.path.join(
139
- config.output_dir, f"{thread_id}_annotated_data.json"
140
- )
141
183
 
142
184
  with open(messages_filename, "w") as f:
143
185
  json.dump(
144
186
  [msg.model_dump() for msg in messages], f, indent=4
145
187
  )
146
188
 
147
- with open(annotation_filename, "w") as f:
148
- json.dump(annotated_data, f, indent=4)
189
+ # TO-DO: we want some tracing but we also do not want to persist the file
190
+ # in the same folder.
191
+ # annotation_filename = os.path.join(
192
+ # config.output_dir, f"{thread_id}_annotated_data.json"
193
+ # )
194
+
195
+ # with open(annotation_filename, "w") as f:
196
+ # json.dump(annotated_data, f, indent=4)
149
197
  except Exception as e:
150
198
  rich.print(
151
199
  f"[red]ERROR:[/red] Failed to process thread {thread_id}: {str(e)}"
@@ -0,0 +1,47 @@
1
+ from collections import defaultdict
2
+ from wxo_agentic_evaluation.inference_backend import WXOClient, is_saas_url
3
+
4
+
5
class ResourceMap:
    """Two-way lookup between agent names and the names of their tools.

    Attributes are populated once, at construction time:
        agent2tools: dict mapping agent name -> set of tool names.
        tools2agents: dict mapping tool name -> set of agent names.
        all_agents: list of the agent names present in ``agent2tools``.
    """

    def __init__(self, wxo_client: WXOClient):
        self.wxo_client = wxo_client
        self.agent2tools, self.tools2agents = self.init_mapping()
        self.all_agents = list(self.agent2tools.keys())

    def init_mapping(self):
        """Fetch tools and agents from the service and build both mappings.

        Returns:
            A ``(agent2tools, tools2agents)`` tuple of plain dicts whose
            values are sets of names.

        Raises:
            Whatever ``raise_for_status()`` raises on the response object
            when a request did not return HTTP 200 with an error status.
        """
        agent2tools = defaultdict(set)
        tools2agents = defaultdict(set)
        if is_saas_url(self.wxo_client.service_url):
            # TO-DO: this is not validated after the v1 prefix change
            # need additional validation
            tools_path = "v1/orchestrate/tools/"
            agents_path = "v1/orchestrate/agents"
        else:
            tools_path = "v1/tools/"
            agents_path = "v1/orchestrate/agents/"

        # Tool id -> tool name, used to translate the ids listed on agents.
        tool_map = {}

        resp = self.wxo_client.get(tools_path)
        if resp.status_code == 200:
            tool_map = {tool["id"]: tool["name"] for tool in resp.json()}
        else:
            resp.raise_for_status()

        resp = self.wxo_client.get(agents_path)

        if resp.status_code == 200:
            for agent in resp.json():
                agent_name = agent["name"]
                # An agent may lack a "tools" entry or reference a tool id
                # that the tools listing did not return; skip those ids
                # instead of failing the whole mapping with a KeyError.
                for tool_id in agent.get("tools") or []:
                    tool_name = tool_map.get(tool_id)
                    if tool_name is None:
                        continue
                    agent2tools[agent_name].add(tool_name)
                    tools2agents[tool_name].add(agent_name)
        else:
            resp.raise_for_status()

        # Hand back plain dicts so later lookups of unknown keys raise
        # KeyError rather than silently creating empty entries.
        return dict(agent2tools), dict(tools2agents)
@@ -0,0 +1,35 @@
1
+ from wxo_agentic_evaluation.service_provider.ollama_provider import OllamaProvider
2
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
3
+ from wxo_agentic_evaluation.service_provider.model_proxy_provider import ModelProxyProvider
4
+ from wxo_agentic_evaluation.arg_configs import ProviderConfig
5
+
6
+ import os
7
+
8
def _instantiate_provider(config: ProviderConfig, **kwargs):
    """Build the provider object selected by ``config.provider``.

    Args:
        config: Supplies ``provider`` (one of "watsonx", "ollama",
            "model_proxy") and ``model_id``.
        **kwargs: Passed through to the provider constructor.

    Raises:
        RuntimeError: If ``config.provider`` is not a supported name.
    """
    provider_classes = {
        "watsonx": WatsonXProvider,
        "ollama": OllamaProvider,
        "model_proxy": ModelProxyProvider,
    }
    provider_cls = provider_classes.get(config.provider)
    if provider_cls is None:
        raise RuntimeError(f"target provider is not supported {config.provider}")
    return provider_cls(model_id=config.model_id, **kwargs)
17
+
18
def get_provider(config: ProviderConfig = None, model_id: str = None, **kwargs):
    """Return a provider from an explicit config or from the environment.

    When ``config`` is truthy it is used directly. Otherwise ``model_id`` is
    required and the provider is chosen from environment variables, checked
    in this order:
        1. WATSONX_APIKEY and WATSONX_SPACE_ID -> "watsonx"
        2. WO_API_KEY and WO_INSTANCE -> "model_proxy"

    Args:
        config: Optional provider configuration.
        model_id: Model identifier, used only when ``config`` is not given.
        **kwargs: Passed through to the provider constructor.

    Raises:
        ValueError: If neither ``config`` nor ``model_id`` is supplied.
        RuntimeError: If no supported set of environment variables is found.
    """
    if config:
        return _instantiate_provider(config, **kwargs)

    if not model_id:
        raise ValueError("model_id must be provided if config is not supplied")

    # Order matters: the first provider whose variables are all set wins.
    env_providers = (
        ("watsonx", ("WATSONX_APIKEY", "WATSONX_SPACE_ID")),
        ("model_proxy", ("WO_API_KEY", "WO_INSTANCE")),
    )
    for provider_name, required_vars in env_providers:
        if all(var in os.environ for var in required_vars):
            env_config = ProviderConfig(provider=provider_name, model_id=model_id)
            return _instantiate_provider(env_config, **kwargs)

    raise RuntimeError(
        "No provider found. Please either provide a config or set the required environment variables."
    )