ibm-watsonx-orchestrate-evaluation-framework 1.0.6__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ibm-watsonx-orchestrate-evaluation-framework
3
- Version: 1.0.6
3
+ Version: 1.0.7
4
4
  Summary: The WxO evaluation framework
5
5
  Author-email: Haode Qi <Haode.Qi@ibm.com>
6
6
  License: MIT
@@ -53,6 +53,17 @@ Run the following command to install evaluation framework in the same env:
53
53
  pip install -e .
54
54
  ```
55
55
 
56
+ ## contribution guide
57
+ ### secret resolution
58
+ install the detect-secrets utility:
59
+ ```
60
+ pip install --upgrade git+https://github.com/ibm/detect-secrets.git@master#egg=detect-secrets
61
+ ```
62
+ run the scan & resolve detections:
63
+ ```
64
+ detect-secrets scan --exclude-files "benchmark|results" --update .secrets.baseline && detect-secrets audit .secrets.baseline && git add .secrets.baseline
65
+ ```
66
+
56
67
 
57
68
  ## quick experiment against the default wxo-dev env
58
69
  ```bash
@@ -1,20 +1,20 @@
1
1
  wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  wxo_agentic_evaluation/analyze_run.py,sha256=C4HowEukNMM-H8FkRcHRqkiNYIQVCoTKbBLiqr1cFRM,4332
3
3
  wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
4
- wxo_agentic_evaluation/arg_configs.py,sha256=UCrGcakFaAM3reFquMn03qNtKe7Pg8ScbOF0K7o8VDU,2240
4
+ wxo_agentic_evaluation/arg_configs.py,sha256=Nc-Z9hG5ZgHAJIdLqUDv-Ct7Wkxvs_VGy-A3JwkC-PI,2265
5
5
  wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
6
- wxo_agentic_evaluation/data_annotator.py,sha256=DJVG2CdhJRAJ3X1ARbrsn9bPjTuytCDGIBM4PEexfnk,8214
6
+ wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
7
7
  wxo_agentic_evaluation/evaluation_package.py,sha256=jOSe-TCJdAWCk1sWpRYfi_EMkZERrVf5swm-bxfozzc,21333
8
8
  wxo_agentic_evaluation/inference_backend.py,sha256=fhEB1kaNN-A08RtJglBiv3QL_8nq8m-g7xbF4WbHAvU,25691
9
9
  wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
10
10
  wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
11
11
  wxo_agentic_evaluation/llm_user.py,sha256=0zSsyEM7pYQtLcfbnu0gEIkosHDwntOZY84Ito6__SM,1407
12
12
  wxo_agentic_evaluation/main.py,sha256=tRXVle2o1JhwJZOTpqdsOzBOpxPYxAH5ziZkbCmzfyU,11470
13
- wxo_agentic_evaluation/record_chat.py,sha256=IAKCZ6Bc4natHA4SyNtC4tjo-0MDglwBcY5AWvXSgR0,7317
14
- wxo_agentic_evaluation/resource_map.py,sha256=-dIWQdpEpPeSCbDeYfRupG9KV1Q4NlHGb5KXywjkulM,1645
13
+ wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
14
+ wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
15
15
  wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
16
16
  wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
17
- wxo_agentic_evaluation/tool_planner.py,sha256=e-lBb4w1klT1HOL9BTwae3lkGv5VBuYC397mSJgOhus,12622
17
+ wxo_agentic_evaluation/tool_planner.py,sha256=JW5o0VYaaUorB3FBcrwLzgG3-iqEWrqjVhh82u7x8YM,12960
18
18
  wxo_agentic_evaluation/type.py,sha256=uVKim70XgPW-3L7Z0yRO07wAH9xa-NcjfaiIyPhYMR0,3413
19
19
  wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
20
20
  wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
@@ -44,13 +44,13 @@ wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TC
44
44
  wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
45
  wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
46
46
  wxo_agentic_evaluation/service_provider/__init__.py,sha256=EaY4jjKp58M3W8N3b3a8PNC2S81xA7YV2_QkTIy9DfI,1600
47
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=X5tiE0IKCR2CqhwEGm91LOdzFZQWSXzXQgLOtzi6ng0,4002
47
+ wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=Y36Ryv4nPG8RdVP_zsQsRlEWv8F_hGi7-wOppWPQTwc,4026
48
48
  wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
49
49
  wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
50
50
  wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=iKVkWs4PRTM_S0TIdPgQ9NFQWPlDvcEvuHpQlIPzO10,6216
51
51
  wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
52
52
  wxo_agentic_evaluation/utils/utils.py,sha256=JYZQZ-OBy43gAWg9S7duJi9StRApGJATs2JUsW1l30M,6057
53
- ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info/METADATA,sha256=BqQELgtuSVS6tHNQ5nGkgfwPBiAFgTnvgZbWG3hjCgM,17674
54
- ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
55
- ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
56
- ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info/RECORD,,
53
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/METADATA,sha256=wz60je0UK3ogKLH9qiDLS808j57cfWOosONyCuQR95g,18051
54
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
55
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
56
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/RECORD,,
@@ -74,6 +74,7 @@ class ChatRecordingConfig:
74
74
  service_url: str = "http://localhost:4321"
75
75
  tenant_name: str = "local"
76
76
  token: str = None
77
+ max_retries: int = 5
77
78
 
78
79
 
79
80
  @dataclass
@@ -247,11 +247,14 @@ class DataAnnotator:
247
247
  }
248
248
  goal_details.append(summarize_step)
249
249
  break
250
-
251
- if summarize_step:
252
- goals[previous] = ["summarize"]
253
- else:
250
+
251
+ if previous is None:
252
+ goals["summarize"] = []
253
+ elif summarize_step is None:
254
254
  goals[previous] = []
255
+ else:
256
+ goals[previous] = ["summarize"]
257
+
255
258
 
256
259
  def generate(self) -> Dict:
257
260
  """Generate the final dataset"""
@@ -43,17 +43,13 @@ def get_all_runs(wxo_client: WXOClient):
43
43
  else:
44
44
  path = "v1/orchestrate/runs"
45
45
 
46
- initial_response = wxo_client.get(
47
- path, {"limit": limit, "offset": 0}
48
- ).json()
46
+ initial_response = wxo_client.get(path, {"limit": limit, "offset": 0}).json()
49
47
  total_runs = initial_response["total"]
50
48
  all_runs.extend(initial_response["data"])
51
49
 
52
50
  while len(all_runs) < total_runs:
53
51
  offset += limit
54
- response = wxo_client.get(
55
- path, {"limit": limit, "offset": offset}
56
- ).json()
52
+ response = wxo_client.get(path, {"limit": limit, "offset": offset}).json()
57
53
  all_runs.extend(response["data"])
58
54
 
59
55
  # Sort runs by completed_at in descending order (most recent first)
@@ -92,9 +88,10 @@ def annotate_messages(
92
88
  annotated_data["agent"] = agent_name
93
89
 
94
90
  annotated_data["story"] = generate_story(annotated_data)
95
-
91
+
96
92
  return annotated_data
97
93
 
94
+
98
95
  def has_messages_changed(
99
96
  thread_id: str,
100
97
  messages: List[Message],
@@ -111,32 +108,27 @@ def has_messages_changed(
111
108
  return False
112
109
 
113
110
 
114
- def record_chats(config: ChatRecordingConfig):
111
+ def _record(config: ChatRecordingConfig, bad_threads: set):
115
112
  """Record chats in background mode"""
116
113
  start_time = datetime.utcnow()
117
114
  processed_threads = set()
118
115
  previous_input_hash: dict[str, str] = {}
119
116
 
120
- rich.print(
121
- f"[green]INFO:[/green] Starting chat recording at {start_time}. Press Ctrl+C to stop."
122
- )
123
117
  if config.token is None:
124
118
  config.token = tenant_setup(config.service_url, config.tenant_name)
125
119
  wxo_client = get_wxo_client(config.service_url, config.tenant_name, config.token)
126
120
  inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
127
- try:
128
- while True:
121
+
122
+ retry_count = 0
123
+ while retry_count < config.max_retries:
124
+ thread_id = None
125
+ try:
129
126
  all_runs = get_all_runs(wxo_client)
130
127
  seen_threads = set()
131
128
  # Process only new runs that started after our recording began
132
129
  for run in all_runs:
133
130
  thread_id = run.get("thread_id")
134
- try:
135
- agent_name = inference_backend.get_agent_name_from_thread_id(thread_id)
136
- except Exception as e:
137
- rich.print(f"[yellow]WARNING:[/yellow]Failure in getting thread id {thread_id}")
138
- continue
139
- if thread_id in seen_threads or agent_name is None:
131
+ if (thread_id in bad_threads) or (thread_id in seen_threads):
140
132
  continue
141
133
  seen_threads.add(thread_id)
142
134
  started_at = run.get("started_at")
@@ -162,11 +154,17 @@ def record_chats(config: ChatRecordingConfig):
162
154
  try:
163
155
  messages = inference_backend.get_messages(thread_id)
164
156
 
165
- if not has_messages_changed(
166
- thread_id,
167
- messages,
168
- previous_input_hash,
169
- ):
157
+ if not has_messages_changed(thread_id, messages, previous_input_hash):
158
+ continue
159
+
160
+ try:
161
+ agent_name = inference_backend.get_agent_name_from_thread_id(thread_id)
162
+ except Exception as e:
163
+ rich.print(f"[yellow]WARNING:[/yellow] Failure getting agent name for thread_id {thread_id}: {e}")
164
+ raise
165
+
166
+ if agent_name is None:
167
+ rich.print(f"[yellow]WARNING:[/yellow] No agent name found for thread_id {thread_id}. Skipping ...")
170
168
  continue
171
169
 
172
170
  annotated_data = annotate_messages(
@@ -180,19 +178,37 @@ def record_chats(config: ChatRecordingConfig):
180
178
  with open(annotation_filename, "w") as f:
181
179
  json.dump(annotated_data, f, indent=4)
182
180
  except Exception as e:
183
- rich.print(
184
- f"[red]ERROR:[/red] Failed to process thread {thread_id}: {str(e)}"
185
- )
181
+ rich.print(f"[yellow]WARNING:[/yellow] Failed to process thread {thread_id}: {e}")
182
+ raise
186
183
  except (ValueError, TypeError) as e:
187
- rich.print(
188
- f"[yellow]WARNING:[/yellow] Invalid timestamp format for thread {thread_id}: {str(e)}"
189
- )
184
+ rich.print(f"[yellow]WARNING:[/yellow] Invalid timestamp for thread {thread_id}: {e}")
185
+ raise
186
+
187
+ retry_count = 0
188
+ time.sleep(2)
190
189
 
191
- time.sleep(2) # Poll every 2 seconds
190
+ except KeyboardInterrupt:
191
+ rich.print("\n[yellow]Recording stopped by user[/yellow]")
192
+ break
192
193
 
193
- except KeyboardInterrupt:
194
- rich.print("\n[yellow]Recording stopped by user[/yellow]")
194
+ except Exception as e:
195
+ if thread_id is None:
196
+ rich.print(f"[red]ERROR:[/red] {e}")
197
+ break
195
198
 
199
+ time.sleep(1)
200
+ retry_count += 1
201
+ if retry_count >= config.max_retries:
202
+ rich.print(f"[red]ERROR:[/red] Maximum retries reached. Skipping thread {thread_id}")
203
+ bad_threads.add(thread_id)
204
+ _record(config, bad_threads)
205
+
206
+ def record_chats(config: ChatRecordingConfig):
207
+ rich.print(
208
+ f"[green]INFO:[/green] Chat recording started. Press Ctrl+C to stop."
209
+ )
210
+ bad_threads = set()
211
+ _record(config, bad_threads)
196
212
 
197
213
  if __name__ == "__main__":
198
214
  record_chats(CLI(ChatRecordingConfig, as_positional=False))
@@ -14,7 +14,7 @@ class ResourceMap:
14
14
  if is_saas_url(self.wxo_client.service_url):
15
15
  # TO-DO: this is not validated after the v1 prefix change
16
16
  # need additional validation
17
- tools_path = "v1/orchestrate/tools/"
17
+ tools_path = "v1/orchestrate/tools"
18
18
  agents_path = "v1/orchestrate/agents"
19
19
  else:
20
20
  tools_path = "v1/tools/"
@@ -10,8 +10,6 @@ from wxo_agentic_evaluation.utils.utils import is_ibm_cloud_url
10
10
 
11
11
  AUTH_ENDPOINT_AWS = "https://iam.platform.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
12
12
  AUTH_ENDPOINT_IBM_CLOUD = "https://iam.cloud.ibm.com/identity/token"
13
- WO_INSTANCE = os.environ.get("WO_INSTANCE")
14
- WO_API_KEY = os.environ.get("WO_API_KEY")
15
13
  DEFAULT_PARAM = {"min_new_tokens": 1, "decoding_method": "greedy", "max_new_tokens": 400}
16
14
 
17
15
 
@@ -19,14 +17,16 @@ class ModelProxyProvider(Provider):
19
17
  def __init__(
20
18
  self,
21
19
  model_id=None,
22
- api_key=WO_API_KEY,
23
- instance_url=WO_INSTANCE,
20
+ api_key=None,
21
+ instance_url=None,
24
22
  timeout=300,
25
23
  embedding_model_id=None,
26
24
  params=None
27
25
  ):
28
26
  super().__init__()
29
27
 
28
+ instance_url = os.environ.get("WO_INSTANCE", instance_url)
29
+ api_key = os.environ.get("WO_API_KEY", api_key)
30
30
  if not instance_url or not api_key:
31
31
  raise RuntimeError("instance url and WO apikey must be specified to use WO model proxy")
32
32
 
@@ -6,6 +6,7 @@ import importlib.util
6
6
  import re
7
7
  from jsonargparse import CLI
8
8
  import os
9
+ import sys
9
10
  import textwrap
10
11
  from dataclasses import is_dataclass, asdict
11
12
 
@@ -83,8 +84,16 @@ def load_tools_module(tools_path: Path) -> dict:
83
84
  module_name = file_path.stem
84
85
  spec = importlib.util.spec_from_file_location(module_name, file_path)
85
86
  module = importlib.util.module_from_spec(spec)
86
- spec.loader.exec_module(module)
87
-
87
+ parent_dir = str(file_path.parent)
88
+ sys_path_modified = False
89
+ if parent_dir not in sys.path:
90
+ sys.path.append(parent_dir)
91
+ sys_path_modified = True
92
+ try:
93
+ spec.loader.exec_module(module)
94
+ finally:
95
+ if sys_path_modified:
96
+ sys.path.pop()
88
97
  # Add all module's non-private functions to tools_dict
89
98
  for attr_name in dir(module):
90
99
  attr = getattr(module, attr_name)