ibm-watsonx-orchestrate-evaluation-framework 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/METADATA +70 -7
- ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info/RECORD +56 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +3 -3
- wxo_agentic_evaluation/analytics/tools/ux.py +1 -1
- wxo_agentic_evaluation/analyze_run.py +10 -10
- wxo_agentic_evaluation/arg_configs.py +8 -1
- wxo_agentic_evaluation/batch_annotate.py +3 -9
- wxo_agentic_evaluation/data_annotator.py +50 -36
- wxo_agentic_evaluation/evaluation_package.py +102 -85
- wxo_agentic_evaluation/external_agent/__init__.py +37 -0
- wxo_agentic_evaluation/external_agent/external_validate.py +74 -29
- wxo_agentic_evaluation/external_agent/performance_test.py +66 -0
- wxo_agentic_evaluation/external_agent/types.py +8 -2
- wxo_agentic_evaluation/inference_backend.py +45 -50
- wxo_agentic_evaluation/llm_matching.py +6 -6
- wxo_agentic_evaluation/llm_rag_eval.py +4 -4
- wxo_agentic_evaluation/llm_user.py +3 -3
- wxo_agentic_evaluation/main.py +63 -23
- wxo_agentic_evaluation/metrics/metrics.py +59 -0
- wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2 +23 -0
- wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +2 -0
- wxo_agentic_evaluation/prompt/examples/data_simple.json +1 -2
- wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2 +195 -0
- wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2 +154 -0
- wxo_agentic_evaluation/prompt/template_render.py +17 -0
- wxo_agentic_evaluation/prompt/tool_planner.jinja2 +13 -7
- wxo_agentic_evaluation/record_chat.py +74 -26
- wxo_agentic_evaluation/resource_map.py +47 -0
- wxo_agentic_evaluation/service_provider/__init__.py +35 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +108 -0
- wxo_agentic_evaluation/service_provider/ollama_provider.py +40 -0
- wxo_agentic_evaluation/service_provider/provider.py +19 -0
- wxo_agentic_evaluation/{watsonx_provider.py → service_provider/watsonx_provider.py} +27 -18
- wxo_agentic_evaluation/test_prompt.py +94 -0
- wxo_agentic_evaluation/tool_planner.py +130 -17
- wxo_agentic_evaluation/type.py +0 -57
- wxo_agentic_evaluation/utils/utils.py +6 -54
- ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/RECORD +0 -46
- ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/licenses/LICENSE +0 -22
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2

@@ -0,0 +1,195 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+Your task is to generate a starting sentence provided a user story.
+
+## Generation Guidelines
+The starting sentence is the first question a user would ask in order to achieve the provided story.
+The starting sentence shouldn't be the same as the story. The story is the objective of the conversation, while the starting sentence is the initial question a user would ask in order to achieve that objective.
+The generated starting sentence should be a concise, no longer than one sentence, and in first person.
+
+## Output:
+Respond in a JSON format with the following fields:
+- starting_sentence: this is the generated starting sentence
+
+This is an example of a valid JSON output
+
+{
+"starting_sentence": "placeholder text",
+}
+
+--
+
+<|start_header_id|>user<|end_header_id|>
+Input:
+{
+"agent": "workday_employee_support_manager",
+"goals": {
+"get_user_workday_ids": [
+"get_payslips"
+],
+"get_payslips": [
+"summarize"
+]
+},
+"goal_details": [
+{
+"type": "tool_call",
+"name": "get_user_workday_ids",
+"tool_name": "get_user_workday_ids",
+"args": {
+"email": "abrennan@workday.net"
+}
+},
+{
+"type": "tool_call",
+"name": "get_payslips",
+"tool_name": "get_payslips",
+"args": {
+"user_id": "6dcb8106e8b74b5aabb1fc3ab8ef2b92"
+}
+},
+{
+"type": "text",
+"name": "summarize",
+"response": "No payslips found for user with ID 6dcb8106e8b74b5aabb1fc3ab8ef2b92.",
+"keywords": [
+"payslips"
+]
+}
+],
+"story": "Your email id is abrennan@workday.net. You want to get your payslips",
+}
+<|eot_id|>
+
+<|start_header_id|>assistant<|end_header_id|>
+{"starting_sentence": "I work at Workday, I want to get my payslips. My email id is abrennan@workday.net"}
+<|eot_id|>
+
+
+<|start_header_id|>user<|end_header_id|>
+Input:
+{
+"agent": "servicenow_asset_management_agent",
+"goals": {
+"get_assets": [
+"update_an_asset"
+],
+"update_an_asset": [
+"summarize"
+]
+},
+"goal_details": [
+{
+"type": "tool_call",
+"name": "get_assets",
+"tool_name": "get_assets",
+"args": {}
+},
+{
+"type": "tool_call",
+"name": "update_an_asset",
+"tool_name": "update_an_asset",
+"args": {
+"args": {
+"current_name": "A100",
+"new_name": "Macbook"
+}
+}
+},
+{
+"name": "summarize",
+"type": "text",
+"response": "The asset A100 has been updated to Macbook.",
+"keywords": [
+"A100",
+"Macbook"
+]
+}
+],
+"story": "You want to update an asset. The current asset name is A100. You want to update it to Macbook.",
+}
+<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>
+{"starting_sentence": "I want to update an asset"}
+<|eot_id|>
+
+<|start_header_id|>user<|end_header_id|>
+Input:
+{
+"agent": "sales_research",
+"goals": {
+"search_company_by_typeahead": [
+"get_news_and_media"
+],
+"get_news_and_media": [
+"summarize"
+]
+},
+"goal_details": [
+{
+"type": "tool_call",
+"name": "search_company_by_typeahead",
+"tool_name": "search_company_by_typeahead",
+"args": {
+"query": "tech"
+}
+},
+{
+"type": "tool_call",
+"name": "get_news_and_media",
+"tool_name": "get_news_and_media",
+"args": {
+"entity_name": "TechNova"
+}
+},
+{
+"name": "summarize",
+"type": "text",
+"response": "Here is the latest news on the tech company:\n\n| Company Name | Headline | Source | Date | Summary |\n| --- | --- | --- | --- | --- |\n| TechNova | TechNova Launches AI-driven Sales Platform | TechCrunch | 2025-06-15 | TechNova announced its new AI platform aimed at transforming enterprise sales operations. |",
+"keywords": [
+"TechNova",
+"AI-driven Sales Platform",
+"TechCrunch",
+"2025-06-15"
+]
+}
+],
+"story": "You need the latest news on a company. The company name is TechNova.",
+}
+<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>
+{"starting_sentence": "I need the latest news on a company"}
+<|eot_id|>
+
+<|start_header_id|>user<|end_header_id|>
+Input:
+{
+"agent": "question_answer_agent",
+"goals": {
+"summarize": []
+},
+"goal_details": [
+{
+"name": "summarize",
+"type": "text",
+"response": "The current PM of England is Keir Starmer",
+"keywords": [
+"Keir Starmer"
+]
+}
+],
+"story": "You want to know who the PM of England is.",
+}
+<|eot_id|>
+
+<|start_header_id|>assistant<|end_header_id|>
+{"starting_sentence": "Who is the PM of the England?"}
+<|eot_id|>
+
+---
+
+<|start_header_id|>assistant<|end_header_id|>
+# Now, evaluate the following:
+Input:
+{{ input_data }}
+Starting sentence:
+
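The template above ends with a trailing "Starting sentence:" cue and expects the model to answer with a JSON object holding a single starting_sentence field. Below is a minimal sketch of rendering such a template and parsing the reply with plain jinja2, outside the package's own renderer classes; the prompt path and both helper functions are hypothetical illustrations, not part of this release:

    import json
    from pathlib import Path

    from jinja2 import Template  # plain jinja2; the package wraps this in JinjaTemplateRenderer

    # Hypothetical location; point this at the installed package's prompt directory.
    PROMPT_PATH = Path("wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2")

    def build_prompt(test_case: dict) -> str:
        # The template exposes a single {{ input_data }} slot at the end.
        return Template(PROMPT_PATH.read_text()).render(input_data=json.dumps(test_case, indent=2))

    def parse_reply(reply: str) -> str:
        # The prompt asks for {"starting_sentence": "..."}; fall back to the raw text if parsing fails.
        try:
            return json.loads(reply)["starting_sentence"]
        except (json.JSONDecodeError, KeyError):
            return reply.strip()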
wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2

@@ -0,0 +1,154 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+Your task is to generate a user story based on the provided information, focusing solely on the user's end goal and the essential context. Write a clear, concise narrative that:
+
+- Centers on the user's primary objective and the key actions they take to achieve it.
+- Includes only information directly relevant to understanding the user's goals and context.
+- Excludes intermediate steps or process details that do not directly impact the user's outcome.
+- Avoids repetition and omits any duplicate or unnecessary details.
+- Captures the essence of the user's intent and required actions without commentary or extraneous explanation.
+
+The final story should be brief, direct, and goal-oriented.
+<|eot_id|>
+
+<|start_header_id|>user<|end_header_id|>
+Here area a few examples of how to generate a user story based on the provided input data:
+# Example 1:
+Input data:
+{
+"agent": "workday_employee_support_manager",
+"goals": {
+"get_user_workday_ids": [
+"get_payslips"
+],
+"get_payslips": [
+"summarize"
+]
+},
+"goal_details": [
+{
+"type": "tool_call",
+"name": "get_user_workday_ids",
+"tool_name": "get_user_workday_ids",
+"args": {
+"email": "abrennan@workday.net"
+}
+},
+{
+"type": "tool_call",
+"name": "get_payslips",
+"tool_name": "get_payslips",
+"args": {
+"user_id": "6dcb8106e8b74b5aabb1fc3ab8ef2b92"
+}
+},
+{
+"type": "text",
+"name": "summarize",
+"response": "No payslips found for user with ID 6dcb8106e8b74b5aabb1fc3ab8ef2b92.",
+"keywords": [
+"payslips"
+]
+}
+],
+"story": "",
+"starting_sentence": "I work at Workday, I want to get my payslips. My email id is abrennan@workday.net"
+}
+Story: Your email id is abrennan@workday.net. You want to get your payslips
+
+# Example 2:
+Input data:
+{
+"agent": "servicenow_asset_management_agent",
+"goals": {
+"get_assets": [
+"update_an_asset"
+],
+"update_an_asset": [
+"summarize"
+]
+},
+"goal_details": [
+{
+"type": "tool_call",
+"name": "get_assets",
+"tool_name": "get_assets",
+"args": {}
+},
+{
+"type": "tool_call",
+"name": "update_an_asset",
+"tool_name": "update_an_asset",
+"args": {
+"args": {
+"current_name": "A100",
+"new_name": "Macbook"
+}
+}
+},
+{
+"name": "summarize",
+"type": "text",
+"response": "The asset A100 has been updated to Macbook.",
+"keywords": [
+"A100",
+"Macbook"
+]
+}
+],
+"story": "",
+"starting_sentence": "I want to update an asset"
+}
+Story: You want to update an asset. The current asset name is A100. You want to update it to Macbook.
+
+# Example 3:
+Input data:
+{
+"agent": "sales_research",
+"goals": {
+"search_company_by_typeahead": [
+"get_news_and_media"
+],
+"get_news_and_media": [
+"summarize"
+]
+},
+"goal_details": [
+{
+"type": "tool_call",
+"name": "search_company_by_typeahead",
+"tool_name": "search_company_by_typeahead",
+"args": {
+"query": "tech"
+}
+},
+{
+"type": "tool_call",
+"name": "get_news_and_media",
+"tool_name": "get_news_and_media",
+"args": {
+"entity_name": "TechNova"
+}
+},
+{
+"name": "summarize",
+"type": "text",
+"response": "Here is the latest news on the tech company:\n\n| Company Name | Headline | Source | Date | Summary |\n| --- | --- | --- | --- | --- |\n| TechNova | TechNova Launches AI-driven Sales Platform | TechCrunch | 2025-06-15 | TechNova announced its new AI platform aimed at transforming enterprise sales operations. |",
+"keywords": [
+"TechNova",
+"AI-driven Sales Platform",
+"TechCrunch",
+"2025-06-15"
+]
+}
+],
+"story": "",
+"starting_sentence": "I need the latest news on a company"
+}
+Story: You need the latest news on a company. The company name is TechNova.
+
+<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>
+# Now, evaluate the following:
+Input data:
+{{ input_data }}
+Story:
wxo_agentic_evaluation/prompt/template_render.py

@@ -61,6 +61,14 @@ class ToolPlannerTemplateRenderer(JinjaTemplateRenderer):
             agent_name=agent_name,
             available_tools=available_tools,
         )
+
+class ArgsExtractorTemplateRenderer(JinjaTemplateRenderer):
+    def render(self, tool_signature: str, step: dict, inputs: dict) -> str:
+        return super().render(
+            tool_signature=tool_signature,
+            step=step,
+            inputs=inputs,
+        )
 
 class ToolChainAgentTemplateRenderer(JinjaTemplateRenderer):
     def render(self, tool_call_history: List, available_tools:str) -> str:
@@ -88,3 +96,12 @@ class BatchTestCaseGeneratorTemplateRenderer(JinjaTemplateRenderer):
             num_variants=num_variants,
             example_str=example_str,
         )
+
+class StoryGenerationTemplateRenderer(JinjaTemplateRenderer):
+    def render(
+        self,
+        input_data: dict,
+    ) -> str:
+        return super().render(
+            input_data=input_data,
+        )
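Both new renderers keep the existing pattern of forwarding keyword arguments straight to JinjaTemplateRenderer.render. A short usage sketch for the story renderer, mirroring how record_chat.py wires it up later in this diff; the annotated_data value is a placeholder:

    import json
    import os

    from wxo_agentic_evaluation import __file__ as pkg_file
    from wxo_agentic_evaluation.prompt.template_render import StoryGenerationTemplateRenderer

    # Same path construction as record_chat.py uses further down in this diff.
    prompt_path = os.path.join(os.path.dirname(pkg_file), "prompt", "story_generation_prompt.jinja2")
    renderer = StoryGenerationTemplateRenderer(prompt_path)

    annotated_data = {"agent": "sales_research", "goals": {}, "goal_details": []}  # placeholder input
    prompt = renderer.render(input_data=json.dumps(annotated_data, indent=2))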
wxo_agentic_evaluation/prompt/tool_planner.jinja2

@@ -1,7 +1,7 @@
 <|begin_of_text|><|start_header_id|>system<|end_header_id|>
 You are a tool-planning assistant for an AI system.
 
-Your job is to
+Your job is to generate a sequence of tool calls based on user stories and available tools.
 
 Rules:
 - Use only the tools listed below.
@@ -11,12 +11,16 @@ Rules:
 - Output ONLY one valid JSON array.
 - DO NOT include extra text or wrap the output. Just return the JSON list.
 
-
-
+<|eot_id|>
+
+<|start_header_id|>user<|end_header_id|>
+Here area a few examples of how to generate a squence of tool calls based on user stories and available tools.
 
 Example:
 Story: "Your username is nwaters. You want to find out your time-off schedule from: 2025-01-01 to: 2025-12-31."
-
+Available Tools:
+[{'Function Name': '_is_valid_date', 'Arguments': ['date_str'], 'Docstring': 'Check if the provided date string is in YYYY-MM-DD format.'}, {'Function Name': 'fetch_assignment_id', 'Arguments': ['username'], 'Docstring': "Return the assignment ID for a given employee username.\n\n:param username: Employee's username\n:return: Assignment ID as a string or 'not found'"}, {'Function Name': 'retrieve_timeoff_schedule', 'Arguments': ['assignment_id', 'start_date', 'end_date'], 'Docstring': 'Get time-off schedule for an employee within a given date range.\n\n:param assignment_id: Assignment ID of the employee\n:param start_date: Start date in YYYY-MM-DD format\n:param end_date: End date in YYYY-MM-DD format\n:return: JSON list of time-off dates or error message'}, {'Function Name': 'list_direct_reports', 'Arguments': ['manager_assignment_id'], 'Docstring': "Retrieve the list of direct report Employee's username for a specified manager's assignment ID.\n\n:param manager_assignment_id: Assignment ID of the manager as a string\n:return: JSON-encoded list of Employee's username who report to the manager"}, {'Function Name': 'get_address_type', 'Arguments': ['address_type_name'], 'Docstring': 'Retrieve a string address type ID based on a given address type name.\n\n:param address_type_name: Address type as a string\n:return: Corresponding string address_type_id or -1 if not found'}, {'Function Name': 'update_address', 'Arguments': ['address_type_id', 'assignment_id', 'new_address'], 'Docstring': "Update the address for an employee based on assignment ID and address type.\n\n:param address_type_id: String representing the address type\n:param assignment_id: Employee's assignment ID\n:param new_address: New address string\n:return: Success or error message"}]
+Result:
 [
 {
 "tool_name": "fetch_assignment_id",
@@ -34,7 +38,9 @@ Story: "Your username is nwaters. You want to find out your time-off schedule fr
 }
 ]
 <|eot_id|>
-<|start_header_id|>user<|end_header_id|>
-Story: "{{ user_story }}"
-<|eot_id|>
 <|start_header_id|>assistant<|end_header_id|>
+# Now, generate results for the following user story and available tools:
+Story: "{{ user_story }}"
+Available Tools:
+{{ available_tools }}
+Result:
wxo_agentic_evaluation/record_chat.py

@@ -11,19 +11,25 @@ from wxo_agentic_evaluation.inference_backend import (
 from wxo_agentic_evaluation.data_annotator import DataAnnotator
 from wxo_agentic_evaluation.utils.utils import is_saas_url
 from wxo_agentic_evaluation.service_instance import tenant_setup
+from wxo_agentic_evaluation.prompt.template_render import StoryGenerationTemplateRenderer
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation import __file__
 
 import json
 import os
 import rich
 from datetime import datetime
 import time
-from typing import List
+from typing import List, Dict
+import hashlib
 from jsonargparse import CLI
 import warnings
 
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 warnings.filterwarnings("ignore", category=FutureWarning)
 
+root_dir = os.path.dirname(__file__)
+STORY_GENERATION_PROMPT_PATH = os.path.join(root_dir, "prompt", "story_generation_prompt.jinja2")
 
 def get_all_runs(wxo_client: WXOClient):
     limit = 20  # Maximum allowed limit per request
@@ -31,9 +37,11 @@ def get_all_runs(wxo_client: WXOClient):
     all_runs = []
 
     if is_saas_url(wxo_client.service_url):
-
+        # TO-DO: this is not validated after the v1 prefix change
+        # need additional validation
+        path = "v1/orchestrate/runs"
     else:
-        path = "/orchestrate/runs"
+        path = "v1/orchestrate/runs"
 
     initial_response = wxo_client.get(
         path, {"limit": limit, "offset": 0}
@@ -62,34 +70,60 @@ def get_all_runs(wxo_client: WXOClient):
     return all_runs
 
 
-def
-
-
-
+def generate_story(annotated_data: dict):
+    renderer = StoryGenerationTemplateRenderer(STORY_GENERATION_PROMPT_PATH)
+    provider = get_provider(
+        model_id="meta-llama/llama-3-405b-instruct",
+        params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 256},
+    )
+    prompt = renderer.render(input_data=json.dumps(annotated_data, indent=2))
+    res = provider.query(prompt)
+    return res.strip()
 
 
 def annotate_messages(
-    messages: List[Message], keywords_generation_config: KeywordsGenerationConfig
+    agent_name: str, messages: List[Message], keywords_generation_config: KeywordsGenerationConfig
 ):
     annotator = DataAnnotator(
         messages=messages, keywords_generation_config=keywords_generation_config
     )
-
+    annotated_data = annotator.generate()
+    if agent_name is not None:
+        annotated_data["agent"] = agent_name
+
+    annotated_data["story"] = generate_story(annotated_data)
+
+    return annotated_data
+
+def has_messages_changed(
+    thread_id: str,
+    messages: List[Message],
+    previous_hashes: Dict[str, str],
+) -> bool:
+    # serialize just the message content
+    payload = [msg.model_dump() for msg in messages]
+    sig = json.dumps(payload, sort_keys=True, default=str)
+    h = hashlib.sha256(sig.encode()).hexdigest()
+
+    if previous_hashes.get(thread_id) != h:
+        previous_hashes[thread_id] = h
+        return True
+    return False
 
 
 def record_chats(config: ChatRecordingConfig):
     """Record chats in background mode"""
     start_time = datetime.utcnow()
     processed_threads = set()
+    previous_input_hash: dict[str, str] = {}
 
     rich.print(
         f"[green]INFO:[/green] Starting chat recording at {start_time}. Press Ctrl+C to stop."
     )
     if config.token is None:
-        token = tenant_setup(config.service_url, config.tenant_name)
-
-
-    wxo_client = get_wxo_client(config.service_url, token)
+        config.token = tenant_setup(config.service_url, config.tenant_name)
+    wxo_client = get_wxo_client(config.service_url, config.tenant_name, config.token)
+    inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
     try:
         while True:
             all_runs = get_all_runs(wxo_client)
@@ -98,7 +132,12 @@ def record_chats(config: ChatRecordingConfig):
             # Process only new runs that started after our recording began
             for run in all_runs:
                 thread_id = run.get("thread_id")
-
+                try:
+                    agent_name = inference_backend.get_agent_name_from_thread_id(thread_id)
+                except Exception as e:
+                    rich.print(f"[yellow]WARNING:[/yellow]Failure in getting thread id {thread_id}")
+                    continue
+                if thread_id in seen_threads or agent_name is None:
                     continue
                 seen_threads.add(thread_id)
                 started_at = run.get("started_at")
@@ -119,33 +158,42 @@ def record_chats(config: ChatRecordingConfig):
                 rich.print(
                     f"[green]INFO:[/green] Messages saved to: {os.path.join(config.output_dir, f'{thread_id}_messages.json')}"
                 )
-                rich.print(
-
-                )
+                # rich.print(
+                #     f"[green]INFO:[/green] Annotations saved to: {os.path.join(config.output_dir, f'{thread_id}_annotated_data.json')}"
+                # )
                 processed_threads.add(thread_id)
 
                 try:
-                    messages =
-
-
+                    messages = inference_backend.get_messages(thread_id)
+
+                    if not has_messages_changed(
+                        thread_id,
+                        messages,
+                        previous_input_hash,
+                    ):
+                        continue
+
                     annotated_data = annotate_messages(
-                        messages, config.keywords_generation_config
+                        agent_name, messages, config.keywords_generation_config
                     )
 
                     messages_filename = os.path.join(
                         config.output_dir, f"{thread_id}_messages.json"
                     )
-                    annotation_filename = os.path.join(
-                        config.output_dir, f"{thread_id}_annotated_data.json"
-                    )
 
                     with open(messages_filename, "w") as f:
                         json.dump(
                             [msg.model_dump() for msg in messages], f, indent=4
                         )
 
-
-
+                    # TO-DO: we want some tracing but we also do not want to persist the file
+                    # in the same folder.
+                    # annotation_filename = os.path.join(
+                    #     config.output_dir, f"{thread_id}_annotated_data.json"
+                    # )
+
+                    # with open(annotation_filename, "w") as f:
+                    #     json.dump(annotated_data, f, indent=4)
                 except Exception as e:
                     rich.print(
                         f"[red]ERROR:[/red] Failed to process thread {thread_id}: {str(e)}"
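The new has_messages_changed helper lets the polling loop skip threads whose transcript has not moved since the last pass, with the caller owning the per-thread hash cache. A self-contained sketch of the same change-detection idea on plain dicts (the real helper serializes pydantic Message objects via model_dump):

    import hashlib
    import json

    previous_hashes: dict[str, str] = {}

    def transcript_changed(thread_id: str, messages: list[dict]) -> bool:
        # Same idea as has_messages_changed above: hash the serialized messages per thread
        # and only report a change when the digest moves.
        sig = json.dumps(messages, sort_keys=True, default=str)
        digest = hashlib.sha256(sig.encode()).hexdigest()
        if previous_hashes.get(thread_id) != digest:
            previous_hashes[thread_id] = digest
            return True
        return False

    print(transcript_changed("t1", [{"role": "user", "content": "hi"}]))  # True: first time we see t1
    print(transcript_changed("t1", [{"role": "user", "content": "hi"}]))  # False: transcript unchanged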
wxo_agentic_evaluation/resource_map.py

@@ -0,0 +1,47 @@
+from collections import defaultdict
+from wxo_agentic_evaluation.inference_backend import WXOClient, is_saas_url
+
+
+class ResourceMap:
+    def __init__(self, wxo_client: WXOClient):
+        self.wxo_client = wxo_client
+        self.agent2tools, self.tools2agents = self.init_mapping()
+        self.all_agents = list(self.agent2tools.keys())
+
+    def init_mapping(self):
+        agent2tools = defaultdict(set)
+        tools2agents = defaultdict(set)
+        if is_saas_url(self.wxo_client.service_url):
+            # TO-DO: this is not validated after the v1 prefix change
+            # need additional validation
+            tools_path = "v1/orchestrate/tools/"
+            agents_path = "v1/orchestrate/agents"
+        else:
+            tools_path = "v1/tools/"
+            agents_path = "v1/orchestrate/agents/"
+
+        tool_map = {}
+
+        resp = self.wxo_client.get(tools_path)
+        if resp.status_code == 200:
+            tools = resp.json()
+            tool_map = {tool["id"]: tool["name"] for tool in tools}
+        else:
+            resp.raise_for_status()
+
+        resp = self.wxo_client.get(agents_path)
+
+        if resp.status_code == 200:
+            agents = resp.json()
+            for agent in agents:
+                agent_name = agent["name"]
+                tools = [tool_map[id] for id in agent["tools"]]
+                for tool in tools:
+                    agent2tools[agent_name].add(tool)
+                    tools2agents[tool].add(agent_name)
+        else:
+            resp.raise_for_status()
+
+        agent2tools = dict(agent2tools)
+        tools2agents = dict(tools2agents)
+        return agent2tools, tools2agents
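ResourceMap builds two lookups at construction time, agent name to tool names and tool name to agent names, by calling the tools and agents endpoints once. A usage sketch; the connection values are placeholders, and the get_wxo_client import and argument order are assumed from how record_chat.py calls it earlier in this diff:

    from wxo_agentic_evaluation.inference_backend import get_wxo_client
    from wxo_agentic_evaluation.resource_map import ResourceMap

    # Placeholder connection details; record_chat.py above shows how the token is normally obtained.
    wxo_client = get_wxo_client("https://example.orchestrate.ibm.com", "my-tenant", "my-token")

    resources = ResourceMap(wxo_client)
    print(resources.all_agents)                          # every agent name on the instance
    print(resources.agent2tools.get("sales_research"))   # set of tool names wired to that agent
    print(resources.tools2agents.get("get_assets"))      # set of agents that can call that tool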
wxo_agentic_evaluation/service_provider/__init__.py

@@ -0,0 +1,35 @@
+from wxo_agentic_evaluation.service_provider.ollama_provider import OllamaProvider
+from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
+from wxo_agentic_evaluation.service_provider.model_proxy_provider import ModelProxyProvider
+from wxo_agentic_evaluation.arg_configs import ProviderConfig
+
+import os
+
+def _instantiate_provider(config: ProviderConfig, **kwargs):
+    if config.provider == "watsonx":
+        return WatsonXProvider(model_id=config.model_id, **kwargs)
+    elif config.provider == "ollama":
+        return OllamaProvider(model_id=config.model_id, **kwargs)
+    elif config.provider == "model_proxy":
+        return ModelProxyProvider(model_id=config.model_id, **kwargs)
+    else:
+        raise RuntimeError(f"target provider is not supported {config.provider}")
+
+def get_provider(config: ProviderConfig = None, model_id: str = None, **kwargs):
+    if config:
+        return _instantiate_provider(config, **kwargs)
+
+    if not model_id:
+        raise ValueError("model_id must be provided if config is not supplied")
+
+    if "WATSONX_APIKEY" in os.environ and "WATSONX_SPACE_ID" in os.environ:
+        config = ProviderConfig(provider="watsonx", model_id=model_id)
+        return _instantiate_provider(config, **kwargs)
+
+    if "WO_API_KEY" in os.environ and "WO_INSTANCE" in os.environ:
+        config = ProviderConfig(provider="model_proxy", model_id=model_id)
+        return _instantiate_provider(config, **kwargs)
+
+    raise RuntimeError(
+        "No provider found. Please either provide a config or set the required environment variables."
+    )
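get_provider resolves a backend either from an explicit ProviderConfig or, failing that, from environment variables, preferring watsonx.ai credentials and falling back to the Orchestrate model proxy. A usage sketch mirroring the generate_story call in record_chat.py above; credentials must already be present in the environment:

    from wxo_agentic_evaluation.service_provider import get_provider

    # Environment-based resolution: WATSONX_APIKEY / WATSONX_SPACE_ID selects the watsonx
    # provider, WO_API_KEY / WO_INSTANCE selects the model proxy provider.
    provider = get_provider(
        model_id="meta-llama/llama-3-405b-instruct",
        params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 256},
    )
    print(provider.query("Summarize the purpose of this evaluation framework in one sentence."))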