PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.1.4__py3-none-any.whl → 1.1.5__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.1.4__py3-none-any.whl → 1.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (7) hide show

{ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ibm-watsonx-orchestrate-evaluation-framework
-Version: 1.1.4
+Version: 1.1.5
 Summary: The WxO evaluation framework
 Author-email: Haode Qi <Haode.Qi@ibm.com>
 License: MIT

{ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info}/RECORD RENAMED Viewed

@@ -15,7 +15,7 @@ wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7Me
 wxo_agentic_evaluation/main.py,sha256=5WDJN-cpK0Dt0niVKg7b_f9CNTDC54g1psN20MYyGEw,18100
 wxo_agentic_evaluation/main_v2.py,sha256=96pujdcfZJyDo1naGlLAk5mFrrSY0hIxrlH4qTdSCSs,14896
 wxo_agentic_evaluation/quick_eval.py,sha256=7JWBB4Q5psZi2O26wZkQgPudu3uNZZBFrZSYou2ivgw,12876
-wxo_agentic_evaluation/record_chat.py,sha256=uDnc0r5rZdy-KZv36ntBMZMzTiv2pcbHayakg_seZGg,8660
+wxo_agentic_evaluation/record_chat.py,sha256=YQD35ZCPMz5N_u5C1wBEmvp1vptI_voUqz3RYZp8hEY,8488
 wxo_agentic_evaluation/resource_map.py,sha256=fmkLcQEx4_tpc46rkSoEOsvmd5WMkxaJpIfgqaR31Ms,1646
 wxo_agentic_evaluation/service_instance.py,sha256=Mgr4UjnwYts91J_iLyygubsZw3aLenPnIfKcqz8OrRU,8515
 wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
@@ -26,9 +26,9 @@ wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24H
 wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
 wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
 wxo_agentic_evaluation/external_agent/__init__.py,sha256=P1T0JYPIZeVyEYRqpEMKqGORQ1h_fVRvm9_lra9U0Q4,1570
-wxo_agentic_evaluation/external_agent/external_validate.py,sha256=gBnizwTIYRHjkVvomgY0hlS44N_n_7ld3YAQ5PFZdfU,4200
+wxo_agentic_evaluation/external_agent/external_validate.py,sha256=eBN13OACh2Xk5-ph__bhaRK4rYUubyl3Mr_t4iYdICY,4184
 wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
-wxo_agentic_evaluation/external_agent/types.py,sha256=Fu_hPBk-vhJ5kAAi6nwVRTrnr0oaxcoV7aXHsJwxYlg,1653
+wxo_agentic_evaluation/external_agent/types.py,sha256=56DRfrd_hCKnk3lk3lSJI4_Ga6ZNSezOK3EutowpCe4,1464
 wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=-JtRcCSYIafMRAL1W7mz0oLRySD1Thje8ankbFmCoMQ,1755
 wxo_agentic_evaluation/metrics/metrics.py,sha256=X2Bapjc7aGwU8--evEYOr2x-CNGsMMBTmMP1dXnURUo,6336
@@ -99,7 +99,7 @@ wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT
 wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
 wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
 wxo_agentic_evaluation/utils/utils.py,sha256=yDPF0hsd_ypMUanf4AZOQbbBh5KhjTc_VOgIcQ-6htI,12682
-ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/METADATA,sha256=3HNLootsqOLwTThaqwx33iaBDlbSh1UgoUEBRRra1LE,1728
-ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
-ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/RECORD,,
+ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info/METADATA,sha256=E47VTRmWgRVX63K0gBHQuX4EXaRv4MsEKyNtddRGMB4,1728
+ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
+ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info/RECORD,,

wxo_agentic_evaluation/external_agent/external_validate.py CHANGED Viewed

@@ -41,15 +41,15 @@ class ExternalAgentValidation:
         data = b""
         for chunk in resp:
             for line in chunk.splitlines(True):
-                if line.startswith(b"data:"):
-                    line = line.replace(b"data:", b"")
-                if line.strip() == b"[DONE]":
-                    return
+                if line.startswith(b"event:"):
+                    continue
                 data += line
                 if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                     # NOTE: edge case, "data" can be sent in two different chunks
                     if data.startswith(b"data:"):
                         data = data.replace(b"data:", b"")
+                    if data.strip() == b"[DONE]":
+                        return
                     yield data
                     data = b""
         if data:
@@ -74,7 +74,7 @@ class ExternalAgentValidation:
         payload = {"stream": True}
         payload["messages"] = messages
         resp = requests.post(
-            url=self.service_url, headers=self.header, json=payload
+            url=self.service_url, headers=self.header, json=payload,
         )
         success, logged_events = self._validate_streaming_response(resp)

wxo_agentic_evaluation/external_agent/types.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Any, List, Literal, Mapping, Union
+from typing import Any, List, Literal, Mapping, Union, Optional
 from pydantic import BaseModel
@@ -46,7 +46,7 @@ class ThreadRunStepDeltaChoice(BaseModel):
 class BaseEventData(BaseModel):
     id: str
     object: str
-    thread_id: str
+    thread_id: Optional[str] = None
     model: str | None = None
     created: int | None = None
@@ -62,13 +62,7 @@ class ThreadRunStepDeltaData(BaseEventData):
 class UniversalData(BaseEventData):
-    object: Union[
-        Literal["thread.message.delta"],
-        Literal["thread.run.step.delta"],
-        Literal["thread.run.step.created"],
-        Literal["thread.run.step.completed"],
-    ]
-    choices: List[ThreadMessageDeltaChoice]
+    object: Optional[str]
     choices: List[Union[ThreadMessageDeltaChoice, dict]]

wxo_agentic_evaluation/record_chat.py CHANGED Viewed

@@ -37,11 +37,7 @@ STORY_GENERATION_PROMPT_PATH = os.path.join(
 )
-def get_all_runs(wxo_client: WXOClient):
-    limit = 20  # Maximum allowed limit per request
-    offset = 0
-    all_runs = []
+def get_recent_runs(wxo_client: WXOClient, limit: int = 20):
     if is_saas_url(wxo_client.service_url):
         # TO-DO: this is not validated after the v1 prefix change
         # need additional validation
@@ -49,22 +45,22 @@ def get_all_runs(wxo_client: WXOClient):
     else:
         path = "v1/orchestrate/runs"
-    initial_response = wxo_client.get(
-        path, {"limit": limit, "offset": 0}
-    ).json()
-    total_runs = initial_response["total"]
-    all_runs.extend(initial_response["data"])
-    while len(all_runs) < total_runs:
-        offset += limit
-        response = wxo_client.get(
-            path, {"limit": limit, "offset": offset}
-        ).json()
-        all_runs.extend(response["data"])
-    # Sort runs by completed_at in descending order (most recent first)
-    # Put runs with no completion time at the end
-    all_runs.sort(
+    meta_resp = wxo_client.get(path, params={"limit": 1, "offset": 0}).json()
+    total = meta_resp.get("total", 0)
+    if total == 0:
+        return []
+    # fetch the most recent runs
+    offset_for_latest = max(total - limit, 0)
+    resp = wxo_client.get(path, params={"limit": limit, "offset": offset_for_latest}).json()
+    runs = []
+    if isinstance(resp, dict):
+        runs = resp.get("data", [])
+    runs.sort(
         key=lambda x: (
             datetime.strptime(x["completed_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
             if x.get("completed_at")
@@ -73,7 +69,7 @@ def get_all_runs(wxo_client: WXOClient):
         reverse=True,
     )
-    return all_runs
+    return runs
 def generate_story(annotated_data: dict):
@@ -141,10 +137,10 @@ def _record(config: ChatRecordingConfig, bad_threads: set):
     while retry_count < config.max_retries:
         thread_id = None
         try:
-            all_runs = get_all_runs(wxo_client)
+            recent_runs = get_recent_runs(wxo_client)
             seen_threads = set()
             # Process only new runs that started after our recording began
-            for run in all_runs:
+            for run in recent_runs:
                 thread_id = run.get("thread_id")
                 if (thread_id in bad_threads) or (thread_id in seen_threads):
                     continue

{ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info}/WHEEL RENAMED Viewed

File without changes

{ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info}/top_level.txt RENAMED Viewed

File without changes

ibm-watsonx-orchestrate-evaluation-framework 1.1.4__py3-none-any.whl → 1.1.5__py3-none-any.whl

Potentially problematic release.

ibm-watsonx-orchestrate-evaluation-framework 1.1.4__py3-none-any.whl → 1.1.5__py3-none-any.whl