ibm-watsonx-orchestrate-evaluation-framework 1.0.6__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info}/METADATA +12 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info}/RECORD +10 -10
- wxo_agentic_evaluation/arg_configs.py +1 -0
- wxo_agentic_evaluation/data_annotator.py +7 -4
- wxo_agentic_evaluation/record_chat.py +49 -33
- wxo_agentic_evaluation/resource_map.py +1 -1
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +11 -2
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ibm-watsonx-orchestrate-evaluation-framework
|
|
3
|
-
Version: 1.0.6
|
|
3
|
+
Version: 1.0.7
|
|
4
4
|
Summary: The WxO evaluation framework
|
|
5
5
|
Author-email: Haode Qi <Haode.Qi@ibm.com>
|
|
6
6
|
License: MIT
|
|
@@ -53,6 +53,17 @@ Run the following command to install evaluation framework in the same env:
|
|
|
53
53
|
pip install -e .
|
|
54
54
|
```
|
|
55
55
|
|
|
56
|
+
## contribution guide
|
|
57
|
+
### secret resolution
|
|
58
|
+
install detect secret utilities:
|
|
59
|
+
```
|
|
60
|
+
pip install --upgrade git+https://github.com/ibm/detect-secrets.git@master#egg=detect-secrets
|
|
61
|
+
```
|
|
62
|
+
run the scan & resolve detections:
|
|
63
|
+
```
|
|
64
|
+
detect-secrets scan --exclude-files "benchmark|results" --update .secrets.baseline && detect-secrets audit .secrets.baseline && git add .secrets.baseline
|
|
65
|
+
```
|
|
66
|
+
|
|
56
67
|
|
|
57
68
|
## quick experiment against the default wxo-dev env
|
|
58
69
|
```bash
|
|
@@ -1,20 +1,20 @@
|
|
|
1
1
|
wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
wxo_agentic_evaluation/analyze_run.py,sha256=C4HowEukNMM-H8FkRcHRqkiNYIQVCoTKbBLiqr1cFRM,4332
|
|
3
3
|
wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
|
|
4
|
-
wxo_agentic_evaluation/arg_configs.py,sha256=
|
|
4
|
+
wxo_agentic_evaluation/arg_configs.py,sha256=Nc-Z9hG5ZgHAJIdLqUDv-Ct7Wkxvs_VGy-A3JwkC-PI,2265
|
|
5
5
|
wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
|
|
6
|
-
wxo_agentic_evaluation/data_annotator.py,sha256=
|
|
6
|
+
wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
|
|
7
7
|
wxo_agentic_evaluation/evaluation_package.py,sha256=jOSe-TCJdAWCk1sWpRYfi_EMkZERrVf5swm-bxfozzc,21333
|
|
8
8
|
wxo_agentic_evaluation/inference_backend.py,sha256=fhEB1kaNN-A08RtJglBiv3QL_8nq8m-g7xbF4WbHAvU,25691
|
|
9
9
|
wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
|
|
10
10
|
wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
|
|
11
11
|
wxo_agentic_evaluation/llm_user.py,sha256=0zSsyEM7pYQtLcfbnu0gEIkosHDwntOZY84Ito6__SM,1407
|
|
12
12
|
wxo_agentic_evaluation/main.py,sha256=tRXVle2o1JhwJZOTpqdsOzBOpxPYxAH5ziZkbCmzfyU,11470
|
|
13
|
-
wxo_agentic_evaluation/record_chat.py,sha256=
|
|
14
|
-
wxo_agentic_evaluation/resource_map.py,sha256
|
|
13
|
+
wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
|
|
14
|
+
wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
|
|
15
15
|
wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
|
|
16
16
|
wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
|
|
17
|
-
wxo_agentic_evaluation/tool_planner.py,sha256=
|
|
17
|
+
wxo_agentic_evaluation/tool_planner.py,sha256=JW5o0VYaaUorB3FBcrwLzgG3-iqEWrqjVhh82u7x8YM,12960
|
|
18
18
|
wxo_agentic_evaluation/type.py,sha256=uVKim70XgPW-3L7Z0yRO07wAH9xa-NcjfaiIyPhYMR0,3413
|
|
19
19
|
wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
|
|
20
20
|
wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
|
|
@@ -44,13 +44,13 @@ wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TC
|
|
|
44
44
|
wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
45
45
|
wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
|
|
46
46
|
wxo_agentic_evaluation/service_provider/__init__.py,sha256=EaY4jjKp58M3W8N3b3a8PNC2S81xA7YV2_QkTIy9DfI,1600
|
|
47
|
-
wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=
|
|
47
|
+
wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=Y36Ryv4nPG8RdVP_zsQsRlEWv8F_hGi7-wOppWPQTwc,4026
|
|
48
48
|
wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
|
|
49
49
|
wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
|
|
50
50
|
wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=iKVkWs4PRTM_S0TIdPgQ9NFQWPlDvcEvuHpQlIPzO10,6216
|
|
51
51
|
wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
|
|
52
52
|
wxo_agentic_evaluation/utils/utils.py,sha256=JYZQZ-OBy43gAWg9S7duJi9StRApGJATs2JUsW1l30M,6057
|
|
53
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.
|
|
54
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
55
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
|
|
56
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info/RECORD,,
|
|
53
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/METADATA,sha256=wz60je0UK3ogKLH9qiDLS808j57cfWOosONyCuQR95g,18051
|
|
54
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
55
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
|
|
56
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/RECORD,,
|
|
@@ -247,11 +247,14 @@ class DataAnnotator:
|
|
|
247
247
|
}
|
|
248
248
|
goal_details.append(summarize_step)
|
|
249
249
|
break
|
|
250
|
-
|
|
251
|
-
if
|
|
252
|
-
goals[
|
|
253
|
-
|
|
250
|
+
|
|
251
|
+
if previous is None:
|
|
252
|
+
goals["summarize"] = []
|
|
253
|
+
elif summarize_step is None:
|
|
254
254
|
goals[previous] = []
|
|
255
|
+
else:
|
|
256
|
+
goals[previous] = ["summarize"]
|
|
257
|
+
|
|
255
258
|
|
|
256
259
|
def generate(self) -> Dict:
|
|
257
260
|
"""Generate the final dataset"""
|
|
@@ -43,17 +43,13 @@ def get_all_runs(wxo_client: WXOClient):
|
|
|
43
43
|
else:
|
|
44
44
|
path = "v1/orchestrate/runs"
|
|
45
45
|
|
|
46
|
-
initial_response = wxo_client.get(
|
|
47
|
-
path, {"limit": limit, "offset": 0}
|
|
48
|
-
).json()
|
|
46
|
+
initial_response = wxo_client.get(path, {"limit": limit, "offset": 0}).json()
|
|
49
47
|
total_runs = initial_response["total"]
|
|
50
48
|
all_runs.extend(initial_response["data"])
|
|
51
49
|
|
|
52
50
|
while len(all_runs) < total_runs:
|
|
53
51
|
offset += limit
|
|
54
|
-
response = wxo_client.get(
|
|
55
|
-
path, {"limit": limit, "offset": offset}
|
|
56
|
-
).json()
|
|
52
|
+
response = wxo_client.get(path, {"limit": limit, "offset": offset}).json()
|
|
57
53
|
all_runs.extend(response["data"])
|
|
58
54
|
|
|
59
55
|
# Sort runs by completed_at in descending order (most recent first)
|
|
@@ -92,9 +88,10 @@ def annotate_messages(
|
|
|
92
88
|
annotated_data["agent"] = agent_name
|
|
93
89
|
|
|
94
90
|
annotated_data["story"] = generate_story(annotated_data)
|
|
95
|
-
|
|
91
|
+
|
|
96
92
|
return annotated_data
|
|
97
93
|
|
|
94
|
+
|
|
98
95
|
def has_messages_changed(
|
|
99
96
|
thread_id: str,
|
|
100
97
|
messages: List[Message],
|
|
@@ -111,32 +108,27 @@ def has_messages_changed(
|
|
|
111
108
|
return False
|
|
112
109
|
|
|
113
110
|
|
|
114
|
-
def record_chats(config: ChatRecordingConfig):
|
|
111
|
+
def _record(config: ChatRecordingConfig, bad_threads: set):
|
|
115
112
|
"""Record chats in background mode"""
|
|
116
113
|
start_time = datetime.utcnow()
|
|
117
114
|
processed_threads = set()
|
|
118
115
|
previous_input_hash: dict[str, str] = {}
|
|
119
116
|
|
|
120
|
-
rich.print(
|
|
121
|
-
f"[green]INFO:[/green] Starting chat recording at {start_time}. Press Ctrl+C to stop."
|
|
122
|
-
)
|
|
123
117
|
if config.token is None:
|
|
124
118
|
config.token = tenant_setup(config.service_url, config.tenant_name)
|
|
125
119
|
wxo_client = get_wxo_client(config.service_url, config.tenant_name, config.token)
|
|
126
120
|
inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
|
|
127
|
-
|
|
128
|
-
|
|
121
|
+
|
|
122
|
+
retry_count = 0
|
|
123
|
+
while retry_count < config.max_retries:
|
|
124
|
+
thread_id = None
|
|
125
|
+
try:
|
|
129
126
|
all_runs = get_all_runs(wxo_client)
|
|
130
127
|
seen_threads = set()
|
|
131
128
|
# Process only new runs that started after our recording began
|
|
132
129
|
for run in all_runs:
|
|
133
130
|
thread_id = run.get("thread_id")
|
|
134
|
-
|
|
135
|
-
agent_name = inference_backend.get_agent_name_from_thread_id(thread_id)
|
|
136
|
-
except Exception as e:
|
|
137
|
-
rich.print(f"[yellow]WARNING:[/yellow]Failure in getting thread id {thread_id}")
|
|
138
|
-
continue
|
|
139
|
-
if thread_id in seen_threads or agent_name is None:
|
|
131
|
+
if (thread_id in bad_threads) or (thread_id in seen_threads):
|
|
140
132
|
continue
|
|
141
133
|
seen_threads.add(thread_id)
|
|
142
134
|
started_at = run.get("started_at")
|
|
@@ -162,11 +154,17 @@ def record_chats(config: ChatRecordingConfig):
|
|
|
162
154
|
try:
|
|
163
155
|
messages = inference_backend.get_messages(thread_id)
|
|
164
156
|
|
|
165
|
-
if not has_messages_changed(
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
157
|
+
if not has_messages_changed(thread_id, messages, previous_input_hash):
|
|
158
|
+
continue
|
|
159
|
+
|
|
160
|
+
try:
|
|
161
|
+
agent_name = inference_backend.get_agent_name_from_thread_id(thread_id)
|
|
162
|
+
except Exception as e:
|
|
163
|
+
rich.print(f"[yellow]WARNING:[/yellow] Failure getting agent name for thread_id {thread_id}: {e}")
|
|
164
|
+
raise
|
|
165
|
+
|
|
166
|
+
if agent_name is None:
|
|
167
|
+
rich.print(f"[yellow]WARNING:[/yellow] No agent name found for thread_id {thread_id}. Skipping ...")
|
|
170
168
|
continue
|
|
171
169
|
|
|
172
170
|
annotated_data = annotate_messages(
|
|
@@ -180,19 +178,37 @@ def record_chats(config: ChatRecordingConfig):
|
|
|
180
178
|
with open(annotation_filename, "w") as f:
|
|
181
179
|
json.dump(annotated_data, f, indent=4)
|
|
182
180
|
except Exception as e:
|
|
183
|
-
rich.print(
|
|
184
|
-
|
|
185
|
-
)
|
|
181
|
+
rich.print(f"[yellow]WARNING:[/yellow] Failed to process thread {thread_id}: {e}")
|
|
182
|
+
raise
|
|
186
183
|
except (ValueError, TypeError) as e:
|
|
187
|
-
rich.print(
|
|
188
|
-
|
|
189
|
-
|
|
184
|
+
rich.print(f"[yellow]WARNING:[/yellow] Invalid timestamp for thread {thread_id}: {e}")
|
|
185
|
+
raise
|
|
186
|
+
|
|
187
|
+
retry_count = 0
|
|
188
|
+
time.sleep(2)
|
|
190
189
|
|
|
191
|
-
|
|
190
|
+
except KeyboardInterrupt:
|
|
191
|
+
rich.print("\n[yellow]Recording stopped by user[/yellow]")
|
|
192
|
+
break
|
|
192
193
|
|
|
193
|
-
|
|
194
|
-
|
|
194
|
+
except Exception as e:
|
|
195
|
+
if thread_id is None:
|
|
196
|
+
rich.print(f"[red]ERROR:[/red] {e}")
|
|
197
|
+
break
|
|
195
198
|
|
|
199
|
+
time.sleep(1)
|
|
200
|
+
retry_count += 1
|
|
201
|
+
if retry_count >= config.max_retries:
|
|
202
|
+
rich.print(f"[red]ERROR:[/red] Maximum retries reached. Skipping thread {thread_id}")
|
|
203
|
+
bad_threads.add(thread_id)
|
|
204
|
+
_record(config, bad_threads)
|
|
205
|
+
|
|
206
|
+
def record_chats(config: ChatRecordingConfig):
|
|
207
|
+
rich.print(
|
|
208
|
+
f"[green]INFO:[/green] Chat recording started. Press Ctrl+C to stop."
|
|
209
|
+
)
|
|
210
|
+
bad_threads = set()
|
|
211
|
+
_record(config, bad_threads)
|
|
196
212
|
|
|
197
213
|
if __name__ == "__main__":
|
|
198
214
|
record_chats(CLI(ChatRecordingConfig, as_positional=False))
|
|
@@ -14,7 +14,7 @@ class ResourceMap:
|
|
|
14
14
|
if is_saas_url(self.wxo_client.service_url):
|
|
15
15
|
# TO-DO: this is not validated after the v1 prefix change
|
|
16
16
|
# need additional validation
|
|
17
|
-
tools_path = "v1/orchestrate/tools
|
|
17
|
+
tools_path = "v1/orchestrate/tools"
|
|
18
18
|
agents_path = "v1/orchestrate/agents"
|
|
19
19
|
else:
|
|
20
20
|
tools_path = "v1/tools/"
|
|
@@ -10,8 +10,6 @@ from wxo_agentic_evaluation.utils.utils import is_ibm_cloud_url
|
|
|
10
10
|
|
|
11
11
|
AUTH_ENDPOINT_AWS = "https://iam.platform.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
|
|
12
12
|
AUTH_ENDPOINT_IBM_CLOUD = "https://iam.cloud.ibm.com/identity/token"
|
|
13
|
-
WO_INSTANCE = os.environ.get("WO_INSTANCE")
|
|
14
|
-
WO_API_KEY = os.environ.get("WO_API_KEY")
|
|
15
13
|
DEFAULT_PARAM = {"min_new_tokens": 1, "decoding_method": "greedy", "max_new_tokens": 400}
|
|
16
14
|
|
|
17
15
|
|
|
@@ -19,14 +17,16 @@ class ModelProxyProvider(Provider):
|
|
|
19
17
|
def __init__(
|
|
20
18
|
self,
|
|
21
19
|
model_id=None,
|
|
22
|
-
api_key=WO_API_KEY,
|
|
23
|
-
instance_url=WO_INSTANCE,
|
|
20
|
+
api_key=None,
|
|
21
|
+
instance_url=None,
|
|
24
22
|
timeout=300,
|
|
25
23
|
embedding_model_id=None,
|
|
26
24
|
params=None
|
|
27
25
|
):
|
|
28
26
|
super().__init__()
|
|
29
27
|
|
|
28
|
+
instance_url = os.environ.get("WO_INSTANCE", instance_url)
|
|
29
|
+
api_key = os.environ.get("WO_API_KEY", api_key)
|
|
30
30
|
if not instance_url or not api_key:
|
|
31
31
|
raise RuntimeError("instance url and WO apikey must be specified to use WO model proxy")
|
|
32
32
|
|
|
@@ -6,6 +6,7 @@ import importlib.util
|
|
|
6
6
|
import re
|
|
7
7
|
from jsonargparse import CLI
|
|
8
8
|
import os
|
|
9
|
+
import sys
|
|
9
10
|
import textwrap
|
|
10
11
|
from dataclasses import is_dataclass, asdict
|
|
11
12
|
|
|
@@ -83,8 +84,16 @@ def load_tools_module(tools_path: Path) -> dict:
|
|
|
83
84
|
module_name = file_path.stem
|
|
84
85
|
spec = importlib.util.spec_from_file_location(module_name, file_path)
|
|
85
86
|
module = importlib.util.module_from_spec(spec)
|
|
86
|
-
|
|
87
|
-
|
|
87
|
+
parent_dir = str(file_path.parent)
|
|
88
|
+
sys_path_modified = False
|
|
89
|
+
if parent_dir not in sys.path:
|
|
90
|
+
sys.path.append(parent_dir)
|
|
91
|
+
sys_path_modified = True
|
|
92
|
+
try:
|
|
93
|
+
spec.loader.exec_module(module)
|
|
94
|
+
finally:
|
|
95
|
+
if sys_path_modified:
|
|
96
|
+
sys.path.pop()
|
|
88
97
|
# Add all module's non-private functions to tools_dict
|
|
89
98
|
for attr_name in dir(module):
|
|
90
99
|
attr = getattr(module, attr_name)
|
|
File without changes
|