ibm-watsonx-orchestrate-evaluation-framework 1.0.5__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info}/METADATA +12 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info}/RECORD +10 -10
- wxo_agentic_evaluation/arg_configs.py +1 -0
- wxo_agentic_evaluation/data_annotator.py +7 -4
- wxo_agentic_evaluation/record_chat.py +54 -53
- wxo_agentic_evaluation/resource_map.py +1 -1
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +11 -2
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ibm-watsonx-orchestrate-evaluation-framework
|
|
3
|
-
Version: 1.0.5
|
|
3
|
+
Version: 1.0.7
|
|
4
4
|
Summary: The WxO evaluation framework
|
|
5
5
|
Author-email: Haode Qi <Haode.Qi@ibm.com>
|
|
6
6
|
License: MIT
|
|
@@ -53,6 +53,17 @@ Run the following command to install evaluation framework in the same env:
|
|
|
53
53
|
pip install -e .
|
|
54
54
|
```
|
|
55
55
|
|
|
56
|
+
## contribution guide
|
|
57
|
+
### secret resolution
|
|
58
|
+
install detect secret utilities:
|
|
59
|
+
```
|
|
60
|
+
pip install --upgrade git+https://github.com/ibm/detect-secrets.git@master#egg=detect-secrets
|
|
61
|
+
```
|
|
62
|
+
run the scan & resolve detections:
|
|
63
|
+
```
|
|
64
|
+
detect-secrets scan --exclude-files "benchmark|results" --update .secrets.baseline && detect-secrets audit .secrets.baseline && git add .secrets.baseline
|
|
65
|
+
```
|
|
66
|
+
|
|
56
67
|
|
|
57
68
|
## quick experiment against the default wxo-dev env
|
|
58
69
|
```bash
|
|
@@ -1,20 +1,20 @@
|
|
|
1
1
|
wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
wxo_agentic_evaluation/analyze_run.py,sha256=C4HowEukNMM-H8FkRcHRqkiNYIQVCoTKbBLiqr1cFRM,4332
|
|
3
3
|
wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
|
|
4
|
-
wxo_agentic_evaluation/arg_configs.py,sha256=
|
|
4
|
+
wxo_agentic_evaluation/arg_configs.py,sha256=Nc-Z9hG5ZgHAJIdLqUDv-Ct7Wkxvs_VGy-A3JwkC-PI,2265
|
|
5
5
|
wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
|
|
6
|
-
wxo_agentic_evaluation/data_annotator.py,sha256=
|
|
6
|
+
wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
|
|
7
7
|
wxo_agentic_evaluation/evaluation_package.py,sha256=jOSe-TCJdAWCk1sWpRYfi_EMkZERrVf5swm-bxfozzc,21333
|
|
8
8
|
wxo_agentic_evaluation/inference_backend.py,sha256=fhEB1kaNN-A08RtJglBiv3QL_8nq8m-g7xbF4WbHAvU,25691
|
|
9
9
|
wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
|
|
10
10
|
wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
|
|
11
11
|
wxo_agentic_evaluation/llm_user.py,sha256=0zSsyEM7pYQtLcfbnu0gEIkosHDwntOZY84Ito6__SM,1407
|
|
12
12
|
wxo_agentic_evaluation/main.py,sha256=tRXVle2o1JhwJZOTpqdsOzBOpxPYxAH5ziZkbCmzfyU,11470
|
|
13
|
-
wxo_agentic_evaluation/record_chat.py,sha256=
|
|
14
|
-
wxo_agentic_evaluation/resource_map.py,sha256
|
|
13
|
+
wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
|
|
14
|
+
wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
|
|
15
15
|
wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
|
|
16
16
|
wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
|
|
17
|
-
wxo_agentic_evaluation/tool_planner.py,sha256=
|
|
17
|
+
wxo_agentic_evaluation/tool_planner.py,sha256=JW5o0VYaaUorB3FBcrwLzgG3-iqEWrqjVhh82u7x8YM,12960
|
|
18
18
|
wxo_agentic_evaluation/type.py,sha256=uVKim70XgPW-3L7Z0yRO07wAH9xa-NcjfaiIyPhYMR0,3413
|
|
19
19
|
wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
|
|
20
20
|
wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
|
|
@@ -44,13 +44,13 @@ wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TC
|
|
|
44
44
|
wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
45
45
|
wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
|
|
46
46
|
wxo_agentic_evaluation/service_provider/__init__.py,sha256=EaY4jjKp58M3W8N3b3a8PNC2S81xA7YV2_QkTIy9DfI,1600
|
|
47
|
-
wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=
|
|
47
|
+
wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=Y36Ryv4nPG8RdVP_zsQsRlEWv8F_hGi7-wOppWPQTwc,4026
|
|
48
48
|
wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
|
|
49
49
|
wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
|
|
50
50
|
wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=iKVkWs4PRTM_S0TIdPgQ9NFQWPlDvcEvuHpQlIPzO10,6216
|
|
51
51
|
wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
|
|
52
52
|
wxo_agentic_evaluation/utils/utils.py,sha256=JYZQZ-OBy43gAWg9S7duJi9StRApGJATs2JUsW1l30M,6057
|
|
53
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.
|
|
54
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.
|
|
55
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.
|
|
56
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.
|
|
53
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/METADATA,sha256=wz60je0UK3ogKLH9qiDLS808j57cfWOosONyCuQR95g,18051
|
|
54
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
55
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
|
|
56
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/RECORD,,
|
|
@@ -247,11 +247,14 @@ class DataAnnotator:
|
|
|
247
247
|
}
|
|
248
248
|
goal_details.append(summarize_step)
|
|
249
249
|
break
|
|
250
|
-
|
|
251
|
-
if
|
|
252
|
-
goals[
|
|
253
|
-
|
|
250
|
+
|
|
251
|
+
if previous is None:
|
|
252
|
+
goals["summarize"] = []
|
|
253
|
+
elif summarize_step is None:
|
|
254
254
|
goals[previous] = []
|
|
255
|
+
else:
|
|
256
|
+
goals[previous] = ["summarize"]
|
|
257
|
+
|
|
255
258
|
|
|
256
259
|
def generate(self) -> Dict:
|
|
257
260
|
"""Generate the final dataset"""
|
|
@@ -43,17 +43,13 @@ def get_all_runs(wxo_client: WXOClient):
|
|
|
43
43
|
else:
|
|
44
44
|
path = "v1/orchestrate/runs"
|
|
45
45
|
|
|
46
|
-
initial_response = wxo_client.get(
|
|
47
|
-
path, {"limit": limit, "offset": 0}
|
|
48
|
-
).json()
|
|
46
|
+
initial_response = wxo_client.get(path, {"limit": limit, "offset": 0}).json()
|
|
49
47
|
total_runs = initial_response["total"]
|
|
50
48
|
all_runs.extend(initial_response["data"])
|
|
51
49
|
|
|
52
50
|
while len(all_runs) < total_runs:
|
|
53
51
|
offset += limit
|
|
54
|
-
response = wxo_client.get(
|
|
55
|
-
path, {"limit": limit, "offset": offset}
|
|
56
|
-
).json()
|
|
52
|
+
response = wxo_client.get(path, {"limit": limit, "offset": offset}).json()
|
|
57
53
|
all_runs.extend(response["data"])
|
|
58
54
|
|
|
59
55
|
# Sort runs by completed_at in descending order (most recent first)
|
|
@@ -92,9 +88,10 @@ def annotate_messages(
|
|
|
92
88
|
annotated_data["agent"] = agent_name
|
|
93
89
|
|
|
94
90
|
annotated_data["story"] = generate_story(annotated_data)
|
|
95
|
-
|
|
91
|
+
|
|
96
92
|
return annotated_data
|
|
97
93
|
|
|
94
|
+
|
|
98
95
|
def has_messages_changed(
|
|
99
96
|
thread_id: str,
|
|
100
97
|
messages: List[Message],
|
|
@@ -111,33 +108,27 @@ def has_messages_changed(
|
|
|
111
108
|
return False
|
|
112
109
|
|
|
113
110
|
|
|
114
|
-
def
|
|
111
|
+
def _record(config: ChatRecordingConfig, bad_threads: set):
|
|
115
112
|
"""Record chats in background mode"""
|
|
116
113
|
start_time = datetime.utcnow()
|
|
117
114
|
processed_threads = set()
|
|
118
115
|
previous_input_hash: dict[str, str] = {}
|
|
119
116
|
|
|
120
|
-
rich.print(
|
|
121
|
-
f"[green]INFO:[/green] Starting chat recording at {start_time}. Press Ctrl+C to stop."
|
|
122
|
-
)
|
|
123
117
|
if config.token is None:
|
|
124
118
|
config.token = tenant_setup(config.service_url, config.tenant_name)
|
|
125
119
|
wxo_client = get_wxo_client(config.service_url, config.tenant_name, config.token)
|
|
126
120
|
inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
|
|
127
|
-
|
|
128
|
-
|
|
121
|
+
|
|
122
|
+
retry_count = 0
|
|
123
|
+
while retry_count < config.max_retries:
|
|
124
|
+
thread_id = None
|
|
125
|
+
try:
|
|
129
126
|
all_runs = get_all_runs(wxo_client)
|
|
130
127
|
seen_threads = set()
|
|
131
|
-
|
|
132
128
|
# Process only new runs that started after our recording began
|
|
133
129
|
for run in all_runs:
|
|
134
130
|
thread_id = run.get("thread_id")
|
|
135
|
-
|
|
136
|
-
agent_name = inference_backend.get_agent_name_from_thread_id(thread_id)
|
|
137
|
-
except Exception as e:
|
|
138
|
-
rich.print(f"[yellow]WARNING:[/yellow]Failure in getting thread id {thread_id}")
|
|
139
|
-
continue
|
|
140
|
-
if thread_id in seen_threads or agent_name is None:
|
|
131
|
+
if (thread_id in bad_threads) or (thread_id in seen_threads):
|
|
141
132
|
continue
|
|
142
133
|
seen_threads.add(thread_id)
|
|
143
134
|
started_at = run.get("started_at")
|
|
@@ -156,58 +147,68 @@ def record_chats(config: ChatRecordingConfig):
|
|
|
156
147
|
f"\n[green]INFO:[/green] New recording started at {started_at}"
|
|
157
148
|
)
|
|
158
149
|
rich.print(
|
|
159
|
-
f"[green]INFO:[/green]
|
|
150
|
+
f"[green]INFO:[/green] Annotations saved to: {os.path.join(config.output_dir, f'{thread_id}_annotated_data.json')}"
|
|
160
151
|
)
|
|
161
|
-
# rich.print(
|
|
162
|
-
# f"[green]INFO:[/green] Annotations saved to: {os.path.join(config.output_dir, f'{thread_id}_annotated_data.json')}"
|
|
163
|
-
# )
|
|
164
152
|
processed_threads.add(thread_id)
|
|
165
153
|
|
|
166
154
|
try:
|
|
167
155
|
messages = inference_backend.get_messages(thread_id)
|
|
168
156
|
|
|
169
|
-
if not has_messages_changed(
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
157
|
+
if not has_messages_changed(thread_id, messages, previous_input_hash):
|
|
158
|
+
continue
|
|
159
|
+
|
|
160
|
+
try:
|
|
161
|
+
agent_name = inference_backend.get_agent_name_from_thread_id(thread_id)
|
|
162
|
+
except Exception as e:
|
|
163
|
+
rich.print(f"[yellow]WARNING:[/yellow] Failure getting agent name for thread_id {thread_id}: {e}")
|
|
164
|
+
raise
|
|
165
|
+
|
|
166
|
+
if agent_name is None:
|
|
167
|
+
rich.print(f"[yellow]WARNING:[/yellow] No agent name found for thread_id {thread_id}. Skipping ...")
|
|
174
168
|
continue
|
|
175
169
|
|
|
176
170
|
annotated_data = annotate_messages(
|
|
177
171
|
agent_name, messages, config.keywords_generation_config
|
|
178
172
|
)
|
|
179
173
|
|
|
180
|
-
|
|
181
|
-
config.output_dir, f"{thread_id}
|
|
174
|
+
annotation_filename = os.path.join(
|
|
175
|
+
config.output_dir, f"{thread_id}_annotated_data.json"
|
|
182
176
|
)
|
|
183
177
|
|
|
184
|
-
with open(
|
|
185
|
-
json.dump(
|
|
186
|
-
[msg.model_dump() for msg in messages], f, indent=4
|
|
187
|
-
)
|
|
188
|
-
|
|
189
|
-
# TO-DO: we want some tracing but we also do not want to persist the file
|
|
190
|
-
# in the same folder.
|
|
191
|
-
# annotation_filename = os.path.join(
|
|
192
|
-
# config.output_dir, f"{thread_id}_annotated_data.json"
|
|
193
|
-
# )
|
|
194
|
-
|
|
195
|
-
# with open(annotation_filename, "w") as f:
|
|
196
|
-
# json.dump(annotated_data, f, indent=4)
|
|
178
|
+
with open(annotation_filename, "w") as f:
|
|
179
|
+
json.dump(annotated_data, f, indent=4)
|
|
197
180
|
except Exception as e:
|
|
198
|
-
rich.print(
|
|
199
|
-
|
|
200
|
-
)
|
|
181
|
+
rich.print(f"[yellow]WARNING:[/yellow] Failed to process thread {thread_id}: {e}")
|
|
182
|
+
raise
|
|
201
183
|
except (ValueError, TypeError) as e:
|
|
202
|
-
rich.print(
|
|
203
|
-
|
|
204
|
-
)
|
|
184
|
+
rich.print(f"[yellow]WARNING:[/yellow] Invalid timestamp for thread {thread_id}: {e}")
|
|
185
|
+
raise
|
|
205
186
|
|
|
206
|
-
|
|
187
|
+
retry_count = 0
|
|
188
|
+
time.sleep(2)
|
|
207
189
|
|
|
208
|
-
|
|
209
|
-
|
|
190
|
+
except KeyboardInterrupt:
|
|
191
|
+
rich.print("\n[yellow]Recording stopped by user[/yellow]")
|
|
192
|
+
break
|
|
210
193
|
|
|
194
|
+
except Exception as e:
|
|
195
|
+
if thread_id is None:
|
|
196
|
+
rich.print(f"[red]ERROR:[/red] {e}")
|
|
197
|
+
break
|
|
198
|
+
|
|
199
|
+
time.sleep(1)
|
|
200
|
+
retry_count += 1
|
|
201
|
+
if retry_count >= config.max_retries:
|
|
202
|
+
rich.print(f"[red]ERROR:[/red] Maximum retries reached. Skipping thread {thread_id}")
|
|
203
|
+
bad_threads.add(thread_id)
|
|
204
|
+
_record(config, bad_threads)
|
|
205
|
+
|
|
206
|
+
def record_chats(config: ChatRecordingConfig):
|
|
207
|
+
rich.print(
|
|
208
|
+
f"[green]INFO:[/green] Chat recording started. Press Ctrl+C to stop."
|
|
209
|
+
)
|
|
210
|
+
bad_threads = set()
|
|
211
|
+
_record(config, bad_threads)
|
|
211
212
|
|
|
212
213
|
if __name__ == "__main__":
|
|
213
214
|
record_chats(CLI(ChatRecordingConfig, as_positional=False))
|
|
@@ -14,7 +14,7 @@ class ResourceMap:
|
|
|
14
14
|
if is_saas_url(self.wxo_client.service_url):
|
|
15
15
|
# TO-DO: this is not validated after the v1 prefix change
|
|
16
16
|
# need additional validation
|
|
17
|
-
tools_path = "v1/orchestrate/tools
|
|
17
|
+
tools_path = "v1/orchestrate/tools"
|
|
18
18
|
agents_path = "v1/orchestrate/agents"
|
|
19
19
|
else:
|
|
20
20
|
tools_path = "v1/tools/"
|
|
@@ -10,8 +10,6 @@ from wxo_agentic_evaluation.utils.utils import is_ibm_cloud_url
|
|
|
10
10
|
|
|
11
11
|
AUTH_ENDPOINT_AWS = "https://iam.platform.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
|
|
12
12
|
AUTH_ENDPOINT_IBM_CLOUD = "https://iam.cloud.ibm.com/identity/token"
|
|
13
|
-
WO_INSTANCE = os.environ.get("WO_INSTANCE")
|
|
14
|
-
WO_API_KEY = os.environ.get("WO_API_KEY")
|
|
15
13
|
DEFAULT_PARAM = {"min_new_tokens": 1, "decoding_method": "greedy", "max_new_tokens": 400}
|
|
16
14
|
|
|
17
15
|
|
|
@@ -19,14 +17,16 @@ class ModelProxyProvider(Provider):
|
|
|
19
17
|
def __init__(
|
|
20
18
|
self,
|
|
21
19
|
model_id=None,
|
|
22
|
-
api_key=
|
|
23
|
-
instance_url=
|
|
20
|
+
api_key=None,
|
|
21
|
+
instance_url=None,
|
|
24
22
|
timeout=300,
|
|
25
23
|
embedding_model_id=None,
|
|
26
24
|
params=None
|
|
27
25
|
):
|
|
28
26
|
super().__init__()
|
|
29
27
|
|
|
28
|
+
instance_url = os.environ.get("WO_INSTANCE", instance_url)
|
|
29
|
+
api_key = os.environ.get("WO_API_KEY", api_key)
|
|
30
30
|
if not instance_url or not api_key:
|
|
31
31
|
raise RuntimeError("instance url and WO apikey must be specified to use WO model proxy")
|
|
32
32
|
|
|
@@ -6,6 +6,7 @@ import importlib.util
|
|
|
6
6
|
import re
|
|
7
7
|
from jsonargparse import CLI
|
|
8
8
|
import os
|
|
9
|
+
import sys
|
|
9
10
|
import textwrap
|
|
10
11
|
from dataclasses import is_dataclass, asdict
|
|
11
12
|
|
|
@@ -83,8 +84,16 @@ def load_tools_module(tools_path: Path) -> dict:
|
|
|
83
84
|
module_name = file_path.stem
|
|
84
85
|
spec = importlib.util.spec_from_file_location(module_name, file_path)
|
|
85
86
|
module = importlib.util.module_from_spec(spec)
|
|
86
|
-
|
|
87
|
-
|
|
87
|
+
parent_dir = str(file_path.parent)
|
|
88
|
+
sys_path_modified = False
|
|
89
|
+
if parent_dir not in sys.path:
|
|
90
|
+
sys.path.append(parent_dir)
|
|
91
|
+
sys_path_modified = True
|
|
92
|
+
try:
|
|
93
|
+
spec.loader.exec_module(module)
|
|
94
|
+
finally:
|
|
95
|
+
if sys_path_modified:
|
|
96
|
+
sys.path.pop()
|
|
88
97
|
# Add all module's non-private functions to tools_dict
|
|
89
98
|
for attr_name in dir(module):
|
|
90
99
|
attr = getattr(module, attr_name)
|
|
File without changes
|