ibm-watsonx-orchestrate-evaluation-framework 1.1.4__py3-none-any.whl → 1.1.5__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ibm-watsonx-orchestrate-evaluation-framework
3
- Version: 1.1.4
3
+ Version: 1.1.5
4
4
  Summary: The WxO evaluation framework
5
5
  Author-email: Haode Qi <Haode.Qi@ibm.com>
6
6
  License: MIT
@@ -15,7 +15,7 @@ wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7Me
15
15
  wxo_agentic_evaluation/main.py,sha256=5WDJN-cpK0Dt0niVKg7b_f9CNTDC54g1psN20MYyGEw,18100
16
16
  wxo_agentic_evaluation/main_v2.py,sha256=96pujdcfZJyDo1naGlLAk5mFrrSY0hIxrlH4qTdSCSs,14896
17
17
  wxo_agentic_evaluation/quick_eval.py,sha256=7JWBB4Q5psZi2O26wZkQgPudu3uNZZBFrZSYou2ivgw,12876
18
- wxo_agentic_evaluation/record_chat.py,sha256=uDnc0r5rZdy-KZv36ntBMZMzTiv2pcbHayakg_seZGg,8660
18
+ wxo_agentic_evaluation/record_chat.py,sha256=YQD35ZCPMz5N_u5C1wBEmvp1vptI_voUqz3RYZp8hEY,8488
19
19
  wxo_agentic_evaluation/resource_map.py,sha256=fmkLcQEx4_tpc46rkSoEOsvmd5WMkxaJpIfgqaR31Ms,1646
20
20
  wxo_agentic_evaluation/service_instance.py,sha256=Mgr4UjnwYts91J_iLyygubsZw3aLenPnIfKcqz8OrRU,8515
21
21
  wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
@@ -26,9 +26,9 @@ wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24H
26
26
  wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
27
27
  wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
28
28
  wxo_agentic_evaluation/external_agent/__init__.py,sha256=P1T0JYPIZeVyEYRqpEMKqGORQ1h_fVRvm9_lra9U0Q4,1570
29
- wxo_agentic_evaluation/external_agent/external_validate.py,sha256=gBnizwTIYRHjkVvomgY0hlS44N_n_7ld3YAQ5PFZdfU,4200
29
+ wxo_agentic_evaluation/external_agent/external_validate.py,sha256=eBN13OACh2Xk5-ph__bhaRK4rYUubyl3Mr_t4iYdICY,4184
30
30
  wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
31
- wxo_agentic_evaluation/external_agent/types.py,sha256=Fu_hPBk-vhJ5kAAi6nwVRTrnr0oaxcoV7aXHsJwxYlg,1653
31
+ wxo_agentic_evaluation/external_agent/types.py,sha256=56DRfrd_hCKnk3lk3lSJI4_Ga6ZNSezOK3EutowpCe4,1464
32
32
  wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
33
  wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=-JtRcCSYIafMRAL1W7mz0oLRySD1Thje8ankbFmCoMQ,1755
34
34
  wxo_agentic_evaluation/metrics/metrics.py,sha256=X2Bapjc7aGwU8--evEYOr2x-CNGsMMBTmMP1dXnURUo,6336
@@ -99,7 +99,7 @@ wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT
99
99
  wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
100
100
  wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
101
101
  wxo_agentic_evaluation/utils/utils.py,sha256=yDPF0hsd_ypMUanf4AZOQbbBh5KhjTc_VOgIcQ-6htI,12682
102
- ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/METADATA,sha256=3HNLootsqOLwTThaqwx33iaBDlbSh1UgoUEBRRra1LE,1728
103
- ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
104
- ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
105
- ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/RECORD,,
102
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info/METADATA,sha256=E47VTRmWgRVX63K0gBHQuX4EXaRv4MsEKyNtddRGMB4,1728
103
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
104
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
105
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info/RECORD,,
@@ -41,15 +41,15 @@ class ExternalAgentValidation:
41
41
  data = b""
42
42
  for chunk in resp:
43
43
  for line in chunk.splitlines(True):
44
- if line.startswith(b"data:"):
45
- line = line.replace(b"data:", b"")
46
- if line.strip() == b"[DONE]":
47
- return
44
+ if line.startswith(b"event:"):
45
+ continue
48
46
  data += line
49
47
  if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
50
48
  # NOTE: edge case, "data" can be sent in two different chunks
51
49
  if data.startswith(b"data:"):
52
50
  data = data.replace(b"data:", b"")
51
+ if data.strip() == b"[DONE]":
52
+ return
53
53
  yield data
54
54
  data = b""
55
55
  if data:
@@ -74,7 +74,7 @@ class ExternalAgentValidation:
74
74
  payload = {"stream": True}
75
75
  payload["messages"] = messages
76
76
  resp = requests.post(
77
- url=self.service_url, headers=self.header, json=payload
77
+ url=self.service_url, headers=self.header, json=payload,
78
78
  )
79
79
  success, logged_events = self._validate_streaming_response(resp)
80
80
 
@@ -1,4 +1,4 @@
1
- from typing import Any, List, Literal, Mapping, Union
1
+ from typing import Any, List, Literal, Mapping, Union, Optional
2
2
 
3
3
  from pydantic import BaseModel
4
4
 
@@ -46,7 +46,7 @@ class ThreadRunStepDeltaChoice(BaseModel):
46
46
  class BaseEventData(BaseModel):
47
47
  id: str
48
48
  object: str
49
- thread_id: str
49
+ thread_id: Optional[str] = None
50
50
  model: str | None = None
51
51
  created: int | None = None
52
52
 
@@ -62,13 +62,7 @@ class ThreadRunStepDeltaData(BaseEventData):
62
62
 
63
63
 
64
64
  class UniversalData(BaseEventData):
65
- object: Union[
66
- Literal["thread.message.delta"],
67
- Literal["thread.run.step.delta"],
68
- Literal["thread.run.step.created"],
69
- Literal["thread.run.step.completed"],
70
- ]
71
- choices: List[ThreadMessageDeltaChoice]
65
+ object: Optional[str]
72
66
  choices: List[Union[ThreadMessageDeltaChoice, dict]]
73
67
 
74
68
 
@@ -37,11 +37,7 @@ STORY_GENERATION_PROMPT_PATH = os.path.join(
37
37
  )
38
38
 
39
39
 
40
- def get_all_runs(wxo_client: WXOClient):
41
- limit = 20 # Maximum allowed limit per request
42
- offset = 0
43
- all_runs = []
44
-
40
+ def get_recent_runs(wxo_client: WXOClient, limit: int = 20):
45
41
  if is_saas_url(wxo_client.service_url):
46
42
  # TO-DO: this is not validated after the v1 prefix change
47
43
  # need additional validation
@@ -49,22 +45,22 @@ def get_all_runs(wxo_client: WXOClient):
49
45
  else:
50
46
  path = "v1/orchestrate/runs"
51
47
 
52
- initial_response = wxo_client.get(
53
- path, {"limit": limit, "offset": 0}
54
- ).json()
55
- total_runs = initial_response["total"]
56
- all_runs.extend(initial_response["data"])
57
-
58
- while len(all_runs) < total_runs:
59
- offset += limit
60
- response = wxo_client.get(
61
- path, {"limit": limit, "offset": offset}
62
- ).json()
63
- all_runs.extend(response["data"])
64
-
65
- # Sort runs by completed_at in descending order (most recent first)
66
- # Put runs with no completion time at the end
67
- all_runs.sort(
48
+
49
+ meta_resp = wxo_client.get(path, params={"limit": 1, "offset": 0}).json()
50
+ total = meta_resp.get("total", 0)
51
+
52
+ if total == 0:
53
+ return []
54
+
55
+ # fetch the most recent runs
56
+ offset_for_latest = max(total - limit, 0)
57
+ resp = wxo_client.get(path, params={"limit": limit, "offset": offset_for_latest}).json()
58
+
59
+ runs = []
60
+ if isinstance(resp, dict):
61
+ runs = resp.get("data", [])
62
+
63
+ runs.sort(
68
64
  key=lambda x: (
69
65
  datetime.strptime(x["completed_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
70
66
  if x.get("completed_at")
@@ -73,7 +69,7 @@ def get_all_runs(wxo_client: WXOClient):
73
69
  reverse=True,
74
70
  )
75
71
 
76
- return all_runs
72
+ return runs
77
73
 
78
74
 
79
75
  def generate_story(annotated_data: dict):
@@ -141,10 +137,10 @@ def _record(config: ChatRecordingConfig, bad_threads: set):
141
137
  while retry_count < config.max_retries:
142
138
  thread_id = None
143
139
  try:
144
- all_runs = get_all_runs(wxo_client)
140
+ recent_runs = get_recent_runs(wxo_client)
145
141
  seen_threads = set()
146
142
  # Process only new runs that started after our recording began
147
- for run in all_runs:
143
+ for run in recent_runs:
148
144
  thread_id = run.get("thread_id")
149
145
  if (thread_id in bad_threads) or (thread_id in seen_threads):
150
146
  continue