ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA +322 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD +46 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL +5 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE +22 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt +1 -0
- wxo_agentic_evaluation/__init__.py +0 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +405 -0
- wxo_agentic_evaluation/analytics/tools/main.py +163 -0
- wxo_agentic_evaluation/analytics/tools/types.py +130 -0
- wxo_agentic_evaluation/analytics/tools/ux.py +428 -0
- wxo_agentic_evaluation/analyze_run.py +123 -0
- wxo_agentic_evaluation/annotate.py +40 -0
- wxo_agentic_evaluation/arg_configs.py +78 -0
- wxo_agentic_evaluation/batch_annotate.py +181 -0
- wxo_agentic_evaluation/data_annotator.py +253 -0
- wxo_agentic_evaluation/evaluation_package.py +518 -0
- wxo_agentic_evaluation/external_agent/external_validate.py +69 -0
- wxo_agentic_evaluation/external_agent/types.py +65 -0
- wxo_agentic_evaluation/inference_backend.py +601 -0
- wxo_agentic_evaluation/llm_matching.py +39 -0
- wxo_agentic_evaluation/llm_rag_eval.py +47 -0
- wxo_agentic_evaluation/llm_user.py +38 -0
- wxo_agentic_evaluation/main.py +231 -0
- wxo_agentic_evaluation/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +46 -0
- wxo_agentic_evaluation/metrics/metrics.py +101 -0
- wxo_agentic_evaluation/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2 +120 -0
- wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +51 -0
- wxo_agentic_evaluation/prompt/examples/__init__.py +0 -0
- wxo_agentic_evaluation/prompt/examples/data_simple.json +93 -0
- wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 +59 -0
- wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 +75 -0
- wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 +20 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +22 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +114 -0
- wxo_agentic_evaluation/prompt/template_render.py +90 -0
- wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 +11 -0
- wxo_agentic_evaluation/prompt/tool_planner.jinja2 +40 -0
- wxo_agentic_evaluation/record_chat.py +165 -0
- wxo_agentic_evaluation/service_instance.py +179 -0
- wxo_agentic_evaluation/tool_planner.py +228 -0
- wxo_agentic_evaluation/type.py +176 -0
- wxo_agentic_evaluation/utils/__init__.py +6 -0
- wxo_agentic_evaluation/utils/utils.py +233 -0
- wxo_agentic_evaluation/watsonx_provider.py +175 -0
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import yaml
|
|
3
|
+
import os
|
|
4
|
+
import requests
|
|
5
|
+
from wxo_agentic_evaluation.utils.utils import is_saas_url, is_ibm_cloud_url
|
|
6
|
+
|
|
7
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
# Credentials posted as form data to the non-SaaS token endpoints in this module.
# NOTE(review): hard-coded username/password — presumably the default local
# developer account; confirm these are not real secrets.
USER = {"username": "wxo.archer@ibm.com", "password": "watsonx"}
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ServiceInstance:
    """Authenticate against an orchestrate service and manage its evaluation tenant.

    In SaaS mode the token is requested from an IAM endpoint using the
    ``WATSONX_IAM_SAAS_APIKEY`` environment variable and tenant handling is
    skipped. Otherwise the service's own ``/api/v1/auth/token`` endpoint is
    called with the module-level ``USER`` credentials, and tenants are listed
    and created under ``{service_url}/tenants``.

    Note: constructing an instance performs a network request, because the
    user token is fetched inside ``__init__``.
    """

    # Token endpoints used in SaaS mode.
    STAGING_AUTH_ENDPOINT = (
        "https://iam.platform.test.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
    )
    PROD_AUTH_ENDPOINT = (
        "https://iam.platform.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
    )
    PROD_AUTH_ENDPOINT_IBMCLOUD = "https://iam.cloud.ibm.com/identity/token"

    def __init__(
        self,
        service_url,
        tenant_name,
        is_saas: bool | None = None,
        is_ibm_cloud: bool | None = None,
    ) -> None:
        """Resolve the auth endpoints for ``service_url`` and fetch a user token.

        Args:
            service_url: Base URL of the service.
            tenant_name: Name of the tenant to look up or create.
            is_saas: Force SaaS mode on/off; ``None`` derives it from the URL
                via ``is_saas_url``.
            is_ibm_cloud: Force IBM Cloud mode on/off; ``None`` derives it from
                the URL via ``is_ibm_cloud_url``.
        """
        self.service_url = service_url
        self.tenant_name = tenant_name

        self.is_saas = is_saas_url(service_url) if is_saas is None else is_saas
        self.is_ibm_cloud = (
            is_ibm_cloud_url(service_url) if is_ibm_cloud is None else is_ibm_cloud
        )

        if self.is_saas:
            if self.is_ibm_cloud:
                self.auth_endpoint = self.PROD_AUTH_ENDPOINT_IBMCLOUD
            else:
                # URLs containing "staging" authenticate against the test IAM.
                self.auth_endpoint = (
                    self.STAGING_AUTH_ENDPOINT
                    if "staging" in service_url
                    else self.PROD_AUTH_ENDPOINT
                )
            self.tenant_url = None  # Not used in SaaS
            self.tenant_auth_endpoint = None
        else:
            self.auth_endpoint = f"{service_url}/api/v1/auth/token"
            self.tenant_url = f"{service_url}/tenants"
            # Filled in later with (service_url, tenant_id).
            self.tenant_auth_endpoint = "{}/api/v1/auth/token?tenant_id={}"

        self.global_token = self.get_user_token()

    def get_user_token(self):
        """Request a user-level token from ``self.auth_endpoint``.

        Returns:
            The token string taken from the JSON response (``"token"`` for the
            non-IBM-Cloud SaaS endpoint, ``"access_token"`` otherwise).

        Raises:
            RuntimeError: SaaS mode without ``WATSONX_IAM_SAAS_APIKEY`` set.
            KeyError: The response JSON lacks the expected token key.
            requests.RequestException: The HTTP request itself failed.
        """
        try:
            if self.is_saas:
                apikey = os.environ.get("WATSONX_IAM_SAAS_APIKEY")
                if not apikey:
                    raise RuntimeError(
                        "WATSONX_IAM_SAAS_APIKEY not set in environment for SaaS mode"
                    )
                if self.is_ibm_cloud:
                    data = {
                        "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
                        "apikey": apikey,
                    }
                    response = requests.post(self.auth_endpoint, data=data)
                    token_key = "access_token"
                else:
                    headers = {
                        "Accept": "application/json",
                        "Content-Type": "application/json",
                    }
                    payload = {"apikey": apikey}
                    response = requests.post(
                        self.auth_endpoint, headers=headers, json=payload
                    )
                    token_key = "token"
            else:
                response = requests.post(self.auth_endpoint, data=USER)
                token_key = "access_token"

            return response.json()[token_key]

        except KeyError as e:
            # Only the final lookup can raise KeyError, so ``response`` is bound.
            logger.error(
                f"[ERROR] Missing key '{e}' in response. SaaS mode: {self.is_saas}. Full response: {response.text}"
            )
            raise
        except requests.RequestException as e:
            logger.error(f"[ERROR] Request failed: {e}")
            raise

    def _get_tenant_token(self, tenant_id: str):
        """Request a token scoped to ``tenant_id`` using the ``USER`` credentials.

        Returns the ``access_token`` on HTTP 200; raises ``requests.HTTPError``
        for 4xx/5xx; returns ``None`` for any other status.
        """
        resp = requests.post(
            self.tenant_auth_endpoint.format(self.service_url, tenant_id), data=USER
        )
        if resp.status_code == 200:
            return resp.json()["access_token"]
        resp.raise_for_status()
        return None

    def get_default_tenant(self, apikey):
        """Return the tenant entry whose ``name`` equals ``self.tenant_name``.

        Args:
            apikey: Bearer token sent in the ``Authorization`` header.

        Returns:
            The matching tenant dict, ``{}`` when no tenant matches, or ``None``
            when the status is neither 200 nor an HTTP error.

        Raises:
            requests.HTTPError: The tenant listing returned 4xx/5xx.
        """
        headers = {
            "Authorization": f"Bearer {apikey}",
            "Content-Type": "application/json",
        }
        resp = requests.get(self.tenant_url, headers=headers)
        if resp.status_code == 200:
            for tenant in resp.json():
                if tenant["name"] == self.tenant_name:
                    return tenant
            return {}
        resp.raise_for_status()
        return None

    def create_eval_tenant(self, apikey):
        """Create a tenant named ``self.tenant_name``.

        Returns ``True`` on HTTP 201; raises ``requests.HTTPError`` for
        4xx/5xx; returns ``None`` for any other status.
        """
        headers = {
            "Authorization": f"Bearer {apikey}",
            "Content-Type": "application/json",
        }

        tenant_config = {
            "name": self.tenant_name,
            "title": "WatsonX Orchestrate Development",
            "tags": ["test"],
        }

        resp = requests.post(self.tenant_url, headers=headers, json=tenant_config)
        if resp.status_code == 201:
            return True
        resp.raise_for_status()
        return None

    def create_tenant_if_not_exist(self) -> str | None:
        """Return the id of the configured tenant, creating the tenant if needed.

        Returns:
            The tenant ``id``, or ``None`` in SaaS mode where tenant creation
            is skipped.

        Raises:
            KeyError: The tenant still cannot be found after creating it.
        """
        if self.is_saas:
            logger.info(
                "SaaS mode: running against Remote Service and skipping tenant creation"
            )
            return None

        user_auth_token = self.global_token
        default_tenant = self.get_default_tenant(user_auth_token)

        if not default_tenant:
            logger.info("no local tenant found. A default tenant is created")
            self.create_eval_tenant(user_auth_token)
            default_tenant = self.get_default_tenant(user_auth_token)
            if not default_tenant:
                # Fail with context instead of indexing an empty/None result.
                raise KeyError(
                    f"tenant '{self.tenant_name}' was not found after attempting to create it"
                )
        else:
            logger.info("local tenant found")

        return default_tenant["id"]
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def tenant_setup(service_url: str, tenant_name: str):
    """Make ``tenant_name`` the active orchestrate environment and return its token.

    Reads the credentials and environment YAML files under the user's home
    directory, registers ``service_url`` under ``tenant_name``, marks that
    environment as active, and writes both files back.

    NOTE: token provisioning through ``ServiceInstance`` is currently disabled
    here, so the ``wxo_mcsp_token`` for ``tenant_name`` must already be present
    in the credentials file.

    Returns:
        The ``wxo_mcsp_token`` stored for ``tenant_name``.
    """
    home_dir = os.path.expanduser('~')
    auth_config_path = f"{home_dir}/.cache/orchestrate/credentials.yaml"
    env_config_path = f"{home_dir}/.config/orchestrate/config.yaml"

    # TO-DO: update SDK and use SDK to manage this
    with open(auth_config_path, "r") as auth_in:
        auth_config = yaml.safe_load(auth_in)

    with open(env_config_path, "r") as env_in:
        env_config = yaml.safe_load(env_in)

    # Point the named environment at the service and select it.
    env_config["environments"][tenant_name] = {"wxo_url": service_url}
    env_config["context"]["active_environment"] = tenant_name

    with open(auth_config_path, "w") as auth_out:
        yaml.dump(auth_config, auth_out)
    with open(env_config_path, "w") as env_out:
        yaml.dump(env_config, env_out)

    return auth_config["auth"][tenant_name]["wxo_mcsp_token"]
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import ast
|
|
3
|
+
import csv
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
import importlib.util
|
|
6
|
+
import re
|
|
7
|
+
from jsonargparse import CLI
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
|
|
11
|
+
from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
|
|
12
|
+
from wxo_agentic_evaluation.prompt.template_render import ToolPlannerTemplateRenderer, ToolChainAgentTemplateRenderer
|
|
13
|
+
from wxo_agentic_evaluation import __file__
|
|
14
|
+
|
|
15
|
+
# Package directory: ``__file__`` here is the name imported from
# ``wxo_agentic_evaluation`` above, not this module's own path.
root_dir = os.path.dirname(__file__)
|
|
16
|
+
# Jinja2 template passed to ToolPlannerTemplateRenderer when planning tool calls.
TOOL_PLANNER_PROMPT_PATH = os.path.join(root_dir, "prompt", "tool_planner.jinja2")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def extract_first_json_list(raw: str) -> list:
    """Return the first JSON list of tool-call steps embedded in ``raw``.

    Every ``[`` in the text is tried as the start of a JSON value using the
    real JSON decoder, so plans whose inputs contain nested lists or objects
    are recovered intact. A candidate is accepted only when it is a non-empty
    list of objects that each carry a ``"tool_name"`` key.

    Args:
        raw: Free-form text (typically LLM output) that may contain a plan.

    Returns:
        The parsed list of step dicts, or ``[]`` (after printing the raw text)
        when no valid plan is found.
    """
    decoder = json.JSONDecoder()
    start = raw.find("[")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(raw, start)
        except ValueError:
            # Not valid JSON from this bracket; try the next one.
            candidate = None
        if (
            isinstance(candidate, list)
            and candidate
            and all(isinstance(step, dict) and "tool_name" in step for step in candidate)
        ):
            return candidate
        start = raw.find("[", start + 1)

    print("⚠️ Could not parse tool call plan. Raw output:")
    print(raw)
    return []
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def load_tools_module(tools_path: Path) -> dict:
    """Import the tool file(s) at ``tools_path`` and collect their public callables.

    Args:
        tools_path: A single Python file, or a directory searched recursively
            for ``*.py`` files.

    Returns:
        Mapping of attribute name to callable for every callable module
        attribute whose name does not start with ``_``. Files that fail to
        load are reported with a printed warning and skipped.

    Raises:
        ValueError: ``tools_path`` is neither a file nor a directory.
    """
    if tools_path.is_file():
        candidate_files = [tools_path]
    elif tools_path.is_dir():
        candidate_files = list(tools_path.glob("**/*.py"))
    else:
        raise ValueError(f"Tools path {tools_path} is neither a file nor directory")

    discovered = {}
    for source_file in candidate_files:
        try:
            spec = importlib.util.spec_from_file_location(source_file.stem, source_file)
            loaded = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(loaded)

            # Keep every public callable exposed by the module.
            for public_name in (n for n in dir(loaded) if not n.startswith('_')):
                member = getattr(loaded, public_name)
                if callable(member):
                    discovered[public_name] = member
        except Exception as e:
            print(f"Warning: Failed to load {source_file}: {str(e)}")

    return discovered
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def extract_tool_signatures(tools_path: Path) -> list:
    """Statically describe the public top-level functions under ``tools_path``.

    Only module-level ``def`` functions are reported. Names starting with
    ``_`` are skipped so the result matches what ``load_tools_module`` exposes.
    Keyword-only parameters are included because tools are invoked with
    keyword arguments.

    Args:
        tools_path: A single Python file, or a directory searched recursively
            for ``*.py`` files.

    Returns:
        A list of dicts with the keys ``"Function Name"``, ``"Arguments"`` and
        ``"Docstring"``. Files that cannot be read or parsed are reported with
        a printed warning and skipped.

    Raises:
        ValueError: ``tools_path`` is neither a file nor a directory.
    """
    # Handle both single file and directory cases
    if tools_path.is_file():
        files_to_parse = [tools_path]
    elif tools_path.is_dir():
        files_to_parse = list(tools_path.glob("**/*.py"))
    else:
        raise ValueError(f"Tools path {tools_path} is neither a file nor directory")

    tool_data = []
    for file_path in files_to_parse:
        try:
            with file_path.open("r", encoding="utf-8") as f:
                parsed_code = ast.parse(f.read())
        except Exception as e:
            print(f"Warning: Failed to parse {file_path}: {str(e)}")
            continue

        for node in parsed_code.body:
            if not isinstance(node, ast.FunctionDef) or node.name.startswith("_"):
                continue
            # Positional-or-keyword and keyword-only parameters can both be
            # supplied by name.
            params = [*node.args.args, *node.args.kwonlyargs]
            tool_data.append({
                "Function Name": node.name,
                "Arguments": [p.arg for p in params if p.arg != "self"],
                "Docstring": ast.get_docstring(node) or "No description available",
            })

    return tool_data
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def ensure_data_available(tool_name: str, inputs: dict, snapshot: dict, tools_module: dict) -> dict:
    """Return the output of ``tool_name`` for ``inputs``, recording it in ``snapshot``.

    Previously recorded examples under
    ``snapshot["input_output_examples"][tool_name]`` are reused when their
    inputs are equal; otherwise the tool is called with ``**inputs`` and the
    new input/output pair is appended.

    Raises:
        ValueError: No recorded example matches and ``tool_name`` is not in
            ``tools_module``.
    """
    examples = snapshot.setdefault("input_output_examples", {})
    recorded = examples.setdefault(tool_name, [])

    # Reuse the first recorded example with identical inputs.
    previous = next((item for item in recorded if item["inputs"] == inputs), None)
    if previous is not None:
        return previous["output"]

    if tool_name not in tools_module:
        raise ValueError(f"Tool '{tool_name}' not found")

    result = tools_module[tool_name](**inputs)
    recorded.append({"inputs": inputs, "output": result})
    if not isinstance(result, dict):
        print(f" Tool {tool_name} returned non-dict output: {result}")
    return result
|
|
111
|
+
|
|
112
|
+
def plan_tool_calls_with_llm(story: str, agent_name: str, tool_signatures_str: str, provider) -> list:
    """Ask the LLM for a tool-call plan for ``story`` and return the parsed steps.

    The tool-planner template is rendered with the story, agent name and
    available tools, sent through ``provider.query``, and the
    ``"generated_text"`` of the response is parsed with
    ``extract_first_json_list``. The parsed plan is also printed.

    Returns:
        The list of planned steps; empty when nothing could be parsed.
    """
    prompt = ToolPlannerTemplateRenderer(TOOL_PLANNER_PROMPT_PATH).render(
        user_story=story,
        agent_name=agent_name,
        available_tools=tool_signatures_str,
    )
    generated_text = provider.query(prompt).get("generated_text", "")
    tool_plan = extract_first_json_list(generated_text)

    print("\n LLM Tool Plan:")
    print(json.dumps(tool_plan, indent=2))
    return tool_plan
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# --- Tool Execution Logic ---
|
|
130
|
+
def run_tool_chain(tool_plan: list, snapshot: dict, tools_module) -> None:
    """Execute the planned tool calls in order, recording outputs in ``snapshot``.

    Outputs are kept in a per-run ``memory`` dict keyed by tool name. String
    inputs starting with ``$`` are treated as Python expressions evaluated
    against ``memory``. When exactly one resolved input is a list, the tool is
    called once per element; results are stored under ``name`` (the list) and
    ``name_<index>`` (each element).

    Args:
        tool_plan: List of step dicts with ``"tool_name"`` and optional ``"inputs"``.
        snapshot: Snapshot dict updated through ``ensure_data_available``.
        tools_module: Mapping of tool name to callable.

    Raises:
        ValueError: A ``$`` placeholder cannot be resolved, or more than one
            input resolves to a list.
    """
    memory = {}

    for step in tool_plan:
        name = step["tool_name"]
        # A step may omit "inputs" (or carry null) for tools without arguments.
        raw_inputs = step.get("inputs") or {}
        print(f"\n🔧 Tool: {name}")
        print(f" Raw inputs: {raw_inputs}")

        resolved_inputs = {}
        list_keys = []

        for k, v in raw_inputs.items():
            if isinstance(v, str) and v.startswith("$"):
                expr = v[1:]
                try:
                    # SECURITY NOTE(review): ``expr`` comes from the tool plan
                    # (LLM output) and eval() still exposes builtins here, so
                    # this can run arbitrary code. Only use with trusted plans.
                    resolved_value = eval(expr, {}, memory)
                except Exception as e:
                    print(f" ❌ Failed to resolve {v} in memory: {memory}")
                    raise ValueError(f"Failed to resolve placeholder {v}: {e}") from e
                resolved_inputs[k] = resolved_value
                if isinstance(resolved_value, list):
                    list_keys.append(k)
            else:
                resolved_inputs[k] = v

        print(f" Resolved inputs: {resolved_inputs}")

        if list_keys:
            if len(list_keys) > 1:
                raise ValueError(f"Tool '{name}' received multiple list inputs. Only one supported for now.")
            list_key = list_keys[0]
            value_list = resolved_inputs[list_key]

            # Fan out: one call per element of the single list-valued input.
            results = []
            for idx, val in enumerate(value_list):
                item_inputs = resolved_inputs.copy()
                item_inputs[list_key] = val
                print(f" ⚙️ Running {name} with {list_key} = {val}")
                output = ensure_data_available(name, item_inputs, snapshot, tools_module)
                results.append(output)
                memory[f"{name}_{idx}"] = output

            memory[name] = results
            print(f"Stored {len(results)} outputs under '{name}' and indexed as '{name}_i'")
        else:
            output = ensure_data_available(name, resolved_inputs, snapshot, tools_module)
            memory[name] = output
            print(f"Stored output under tool name: {name} = {output}")
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
# --- Main Snapshot Builder ---
|
|
182
|
+
def build_snapshot(agent_name: str, tools_path: Path, stories: list, output_path: Path):
    """Plan and run tool calls for every story, then save the snapshot as JSON.

    Args:
        agent_name: Name stored under ``snapshot["agent"]`` and given to the planner.
        tools_path: File or directory containing the tool implementations.
        stories: User stories to plan tool calls for.
        output_path: Destination JSON file for the snapshot.
    """
    agent = {"name": agent_name}
    tools_module = load_tools_module(tools_path)
    tool_signatures = extract_tool_signatures(tools_path)

    decode_params = {
        "min_new_tokens": 50,
        "decoding_method": "greedy",
        "max_new_tokens": 200
    }
    provider = WatsonXProvider(
        model_id="meta-llama/llama-3-405b-instruct",
        llm_decode_parameter=decode_params,
    )

    snapshot = {"agent": agent, "tools": tool_signatures, "input_output_examples": {}}

    for story in stories:
        print(f"\n📘 Planning tool calls for story: {story}")
        # NOTE(review): the signatures are passed as a list although the
        # planner's parameter is annotated ``str`` — confirm the template's expectation.
        planned_steps = plan_tool_calls_with_llm(story, agent["name"], tool_signatures, provider)
        run_tool_chain(planned_steps, snapshot, tools_module)

    with output_path.open("w", encoding="utf-8") as out_file:
        json.dump(snapshot, out_file, indent=2)
    print(f"\n✅ Snapshot saved to {output_path}")
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
if __name__ == "__main__":
    # Script entry point: read stories from the configured CSV and build a
    # snapshot next to it, named after the agent.
    config = CLI(BatchAnnotateConfig, as_positional=False)
    tools_path = Path(config.tools_path)
    stories_path = Path(config.stories_path)

    stories = []
    agent_name = None
    with stories_path.open("r", encoding="utf-8", newline='') as f:
        for row in csv.DictReader(f):
            stories.append(row["story"])
            # The first row that provides an agent decides the agent name.
            if agent_name is None:
                agent_name = row["agent"]

    # Without this guard an empty CSV produced a "None_snapshot_llm.json"
    # snapshot for an agent named None.
    if not stories or agent_name is None:
        raise SystemExit(f"No stories with an agent name found in {stories_path}")

    snapshot_path = stories_path.parent / f"{agent_name}_snapshot_llm.json"

    build_snapshot(agent_name, tools_path, stories, snapshot_path)
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
from typing import Dict, List, Union, Any, Optional
|
|
2
|
+
from pydantic import BaseModel, computed_field, ConfigDict
|
|
3
|
+
from enum import StrEnum
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class EventTypes(StrEnum):
|
|
7
|
+
run_started = "run.started"
|
|
8
|
+
run_step_delta = "run.step.delta"
|
|
9
|
+
message_started = "message.started"
|
|
10
|
+
message_delta = "message.delta"
|
|
11
|
+
message_created = "message.created"
|
|
12
|
+
run_completed = "run.completed"
|
|
13
|
+
done = "done"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ContentType(StrEnum):
|
|
17
|
+
text = "text"
|
|
18
|
+
tool_call = "tool_call"
|
|
19
|
+
tool_response = "tool_response"
|
|
20
|
+
conversational_search = "conversational_search"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ConversationalSearchCitations(BaseModel):
    """One citation entry of a ``ConversationalSearch`` response."""

    url: str
    body: str
    text: str
    title: str
    # NOTE(review): presumably the span of the response text this citation
    # covers — confirm against the API.
    range_end: int
    range_start: int
    # NOTE(review): presumably an index into the search results list — confirm.
    search_result_idx: int
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ConversationalSearchResultMetadata(BaseModel):
    """Metadata attached to a single ``ConversationalSearchResults`` entry."""

    score: float
    document_retrieval_source: str
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ConversationalSearchResults(BaseModel):
    """One search result entry of a ``ConversationalSearch`` response."""

    url: str
    body: str
    title: str
    result_metadata: ConversationalSearchResultMetadata
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class ConversationalConfidenceThresholdScore(BaseModel):
    """Response and retrieval confidence values together with their thresholds."""

    response_confidence: float
    response_confidence_threshold: float
    retrieval_confidence: float
    retrieval_confidence_threshold: float

    def table(self):
        """Return the four scores as a dict of strings, keyed by field name."""
        columns = (
            "response_confidence",
            "response_confidence_threshold",
            "retrieval_confidence",
            "retrieval_confidence_threshold",
        )
        return {column: str(getattr(self, column)) for column in columns}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class ConversationSearchMetadata(BaseModel):
    """Extra information about a conversational search response that the API response does not carry.

    For example, the tool call that produced the conversational search
    response is not part of the API response, but during evaluation we want to
    refer back to that tool call.
    """

    # Frozen: instances cannot be modified after creation.
    model_config = ConfigDict(frozen=True)

    tool_call_id: str
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class ConversationalSearch(BaseModel):
    """A conversational search response with its citations, results and confidence scores."""

    metadata: ConversationSearchMetadata
    response_type: str
    # Same as `content` in Message; this field can be removed if necessary.
    text: str
    citations: List[ConversationalSearchCitations]
    search_results: List[ConversationalSearchResults]
    citations_title: str
    confidence_scores: ConversationalConfidenceThresholdScore
    response_length_option: str
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class Message(BaseModel):
    """An immutable (frozen) conversation message."""

    # Frozen: instances cannot be modified after creation.
    model_config = ConfigDict(frozen=True)

    role: str
    content: Union[str, Dict[str, Any]]
    type: ContentType
    # Event that produced the message.
    event: Optional[str] = None
    # Correlates the Message with its retrieval context (ConversationalSearch).
    conversational_search_metadata: Optional[ConversationSearchMetadata] = None
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class ExtendedMessage(BaseModel):
    """A ``Message`` paired with an optional ``reason`` dict."""

    message: Message
    # NOTE(review): presumably explains an evaluation outcome for the message — confirm with callers.
    reason: dict | None = None
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class KnowledgeBaseGoalDetail(BaseModel):
    """Knowledge-base settings of a goal: an enabled flag and a list of metrics."""

    # Disabled unless set explicitly.
    enabled: bool = False
    metrics: list = []
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class GoalDetail(BaseModel):
    """Details of a single evaluation goal.

    The fields that default to ``None`` are declared ``Optional`` so that an
    explicit ``None`` (for example a JSON ``null``) passes validation instead
    of being rejected.
    """

    name: str
    tool_name: Optional[str] = None
    type: ContentType
    args: Optional[Dict] = None
    response: Optional[str] = None
    keywords: Optional[List] = None
    knowledge_base: KnowledgeBaseGoalDetail = KnowledgeBaseGoalDetail()
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class MineField(BaseModel):
    """A content type and name pair, listed under ``EvaluationData.mine_fields``."""

    type: ContentType
    name: str
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class EvaluationData(BaseModel):
    """One evaluation test case: agent, story, goals and their details.

    ``starting_sentence`` is declared ``Optional`` so that an explicit ``None``
    (for example a JSON ``null``) passes validation.
    """

    agent: str
    goals: Dict
    story: str
    mine_fields: List[MineField]
    goal_details: List[GoalDetail]
    starting_sentence: Optional[str] = None
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class ToolCallAndRoutingMetrics(BaseModel):
    """Raw tool-call and routing counts plus ratios derived from them.

    Every ratio is rounded to two decimals and is ``0.0`` when its
    denominator is not positive.
    """

    total_tool_calls: int
    expected_tool_calls: int
    relevant_tool_calls: int
    correct_tool_calls: int
    total_routing_calls: int
    expected_routing_calls: int

    @computed_field
    @property
    def non_transfer_tool_calls(self) -> int:
        """Tool calls that are not routing calls."""
        return self.total_tool_calls - self.total_routing_calls

    @computed_field
    @property
    def tool_call_accuracy(self) -> float:
        """Correct tool calls divided by non-transfer tool calls."""
        denominator = self.non_transfer_tool_calls
        if denominator <= 0:
            return 0.0
        return round(self.correct_tool_calls / denominator, 2)

    @computed_field
    @property
    def tool_call_relevancy(self) -> float:
        """Relevant tool calls, minus expected routing calls, divided by non-transfer tool calls."""
        denominator = self.non_transfer_tool_calls
        if denominator <= 0:
            return 0.0
        relevant_non_routing = self.relevant_tool_calls - self.expected_routing_calls
        return round(relevant_non_routing / denominator, 2)

    @computed_field
    @property
    def agent_routing_accuracy(self) -> float:
        """Expected routing calls divided by total routing calls."""
        denominator = self.total_routing_calls
        if denominator <= 0:
            return 0.0
        return round(self.expected_routing_calls / denominator, 2)
|