ibm-watsonx-orchestrate-evaluation-framework 1.1.4__py3-none-any.whl → 1.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info}/METADATA +1 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info}/RECORD +7 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +5 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/record_chat.py +20 -24
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info}/top_level.txt +0 -0
|
@@ -15,7 +15,7 @@ wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7Me
|
|
|
15
15
|
wxo_agentic_evaluation/main.py,sha256=5WDJN-cpK0Dt0niVKg7b_f9CNTDC54g1psN20MYyGEw,18100
|
|
16
16
|
wxo_agentic_evaluation/main_v2.py,sha256=96pujdcfZJyDo1naGlLAk5mFrrSY0hIxrlH4qTdSCSs,14896
|
|
17
17
|
wxo_agentic_evaluation/quick_eval.py,sha256=7JWBB4Q5psZi2O26wZkQgPudu3uNZZBFrZSYou2ivgw,12876
|
|
18
|
-
wxo_agentic_evaluation/record_chat.py,sha256=
|
|
18
|
+
wxo_agentic_evaluation/record_chat.py,sha256=YQD35ZCPMz5N_u5C1wBEmvp1vptI_voUqz3RYZp8hEY,8488
|
|
19
19
|
wxo_agentic_evaluation/resource_map.py,sha256=fmkLcQEx4_tpc46rkSoEOsvmd5WMkxaJpIfgqaR31Ms,1646
|
|
20
20
|
wxo_agentic_evaluation/service_instance.py,sha256=Mgr4UjnwYts91J_iLyygubsZw3aLenPnIfKcqz8OrRU,8515
|
|
21
21
|
wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
|
|
@@ -26,9 +26,9 @@ wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24H
|
|
|
26
26
|
wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
|
|
27
27
|
wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
|
|
28
28
|
wxo_agentic_evaluation/external_agent/__init__.py,sha256=P1T0JYPIZeVyEYRqpEMKqGORQ1h_fVRvm9_lra9U0Q4,1570
|
|
29
|
-
wxo_agentic_evaluation/external_agent/external_validate.py,sha256=
|
|
29
|
+
wxo_agentic_evaluation/external_agent/external_validate.py,sha256=eBN13OACh2Xk5-ph__bhaRK4rYUubyl3Mr_t4iYdICY,4184
|
|
30
30
|
wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
|
|
31
|
-
wxo_agentic_evaluation/external_agent/types.py,sha256=
|
|
31
|
+
wxo_agentic_evaluation/external_agent/types.py,sha256=56DRfrd_hCKnk3lk3lSJI4_Ga6ZNSezOK3EutowpCe4,1464
|
|
32
32
|
wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
33
|
wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=-JtRcCSYIafMRAL1W7mz0oLRySD1Thje8ankbFmCoMQ,1755
|
|
34
34
|
wxo_agentic_evaluation/metrics/metrics.py,sha256=X2Bapjc7aGwU8--evEYOr2x-CNGsMMBTmMP1dXnURUo,6336
|
|
@@ -99,7 +99,7 @@ wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT
|
|
|
99
99
|
wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
|
|
100
100
|
wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
|
|
101
101
|
wxo_agentic_evaluation/utils/utils.py,sha256=yDPF0hsd_ypMUanf4AZOQbbBh5KhjTc_VOgIcQ-6htI,12682
|
|
102
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
103
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
104
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
105
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
102
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info/METADATA,sha256=E47VTRmWgRVX63K0gBHQuX4EXaRv4MsEKyNtddRGMB4,1728
|
|
103
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
104
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
|
|
105
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info/RECORD,,
|
|
@@ -41,15 +41,15 @@ class ExternalAgentValidation:
|
|
|
41
41
|
data = b""
|
|
42
42
|
for chunk in resp:
|
|
43
43
|
for line in chunk.splitlines(True):
|
|
44
|
-
if line.startswith(b"
|
|
45
|
-
|
|
46
|
-
if line.strip() == b"[DONE]":
|
|
47
|
-
return
|
|
44
|
+
if line.startswith(b"event:"):
|
|
45
|
+
continue
|
|
48
46
|
data += line
|
|
49
47
|
if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
|
|
50
48
|
# NOTE: edge case, "data" can be sent in two different chunks
|
|
51
49
|
if data.startswith(b"data:"):
|
|
52
50
|
data = data.replace(b"data:", b"")
|
|
51
|
+
if data.strip() == b"[DONE]":
|
|
52
|
+
return
|
|
53
53
|
yield data
|
|
54
54
|
data = b""
|
|
55
55
|
if data:
|
|
@@ -74,7 +74,7 @@ class ExternalAgentValidation:
|
|
|
74
74
|
payload = {"stream": True}
|
|
75
75
|
payload["messages"] = messages
|
|
76
76
|
resp = requests.post(
|
|
77
|
-
url=self.service_url, headers=self.header, json=payload
|
|
77
|
+
url=self.service_url, headers=self.header, json=payload,
|
|
78
78
|
)
|
|
79
79
|
success, logged_events = self._validate_streaming_response(resp)
|
|
80
80
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Any, List, Literal, Mapping, Union
|
|
1
|
+
from typing import Any, List, Literal, Mapping, Union, Optional
|
|
2
2
|
|
|
3
3
|
from pydantic import BaseModel
|
|
4
4
|
|
|
@@ -46,7 +46,7 @@ class ThreadRunStepDeltaChoice(BaseModel):
|
|
|
46
46
|
class BaseEventData(BaseModel):
|
|
47
47
|
id: str
|
|
48
48
|
object: str
|
|
49
|
-
thread_id: str
|
|
49
|
+
thread_id: Optional[str] = None
|
|
50
50
|
model: str | None = None
|
|
51
51
|
created: int | None = None
|
|
52
52
|
|
|
@@ -62,13 +62,7 @@ class ThreadRunStepDeltaData(BaseEventData):
|
|
|
62
62
|
|
|
63
63
|
|
|
64
64
|
class UniversalData(BaseEventData):
|
|
65
|
-
object: Union[
|
|
66
|
-
Literal["thread.message.delta"],
|
|
67
|
-
Literal["thread.run.step.delta"],
|
|
68
|
-
Literal["thread.run.step.created"],
|
|
69
|
-
Literal["thread.run.step.completed"],
|
|
70
|
-
]
|
|
71
|
-
choices: List[ThreadMessageDeltaChoice]
|
|
65
|
+
object: Optional[str]
|
|
72
66
|
choices: List[Union[ThreadMessageDeltaChoice, dict]]
|
|
73
67
|
|
|
74
68
|
|
|
@@ -37,11 +37,7 @@ STORY_GENERATION_PROMPT_PATH = os.path.join(
|
|
|
37
37
|
)
|
|
38
38
|
|
|
39
39
|
|
|
40
|
-
def get_all_runs(wxo_client: WXOClient):
|
|
41
|
-
limit = 20 # Maximum allowed limit per request
|
|
42
|
-
offset = 0
|
|
43
|
-
all_runs = []
|
|
44
|
-
|
|
40
|
+
def get_recent_runs(wxo_client: WXOClient, limit: int = 20):
|
|
45
41
|
if is_saas_url(wxo_client.service_url):
|
|
46
42
|
# TO-DO: this is not validated after the v1 prefix change
|
|
47
43
|
# need additional validation
|
|
@@ -49,22 +45,22 @@ def get_all_runs(wxo_client: WXOClient):
|
|
|
49
45
|
else:
|
|
50
46
|
path = "v1/orchestrate/runs"
|
|
51
47
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
48
|
+
|
|
49
|
+
meta_resp = wxo_client.get(path, params={"limit": 1, "offset": 0}).json()
|
|
50
|
+
total = meta_resp.get("total", 0)
|
|
51
|
+
|
|
52
|
+
if total == 0:
|
|
53
|
+
return []
|
|
54
|
+
|
|
55
|
+
# fetch the most recent runs
|
|
56
|
+
offset_for_latest = max(total - limit, 0)
|
|
57
|
+
resp = wxo_client.get(path, params={"limit": limit, "offset": offset_for_latest}).json()
|
|
58
|
+
|
|
59
|
+
runs = []
|
|
60
|
+
if isinstance(resp, dict):
|
|
61
|
+
runs = resp.get("data", [])
|
|
62
|
+
|
|
63
|
+
runs.sort(
|
|
68
64
|
key=lambda x: (
|
|
69
65
|
datetime.strptime(x["completed_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
|
70
66
|
if x.get("completed_at")
|
|
@@ -73,7 +69,7 @@ def get_all_runs(wxo_client: WXOClient):
|
|
|
73
69
|
reverse=True,
|
|
74
70
|
)
|
|
75
71
|
|
|
76
|
-
return
|
|
72
|
+
return runs
|
|
77
73
|
|
|
78
74
|
|
|
79
75
|
def generate_story(annotated_data: dict):
|
|
@@ -141,10 +137,10 @@ def _record(config: ChatRecordingConfig, bad_threads: set):
|
|
|
141
137
|
while retry_count < config.max_retries:
|
|
142
138
|
thread_id = None
|
|
143
139
|
try:
|
|
144
|
-
|
|
140
|
+
recent_runs = get_recent_runs(wxo_client)
|
|
145
141
|
seen_threads = set()
|
|
146
142
|
# Process only new runs that started after our recording began
|
|
147
|
-
for run in
|
|
143
|
+
for run in recent_runs:
|
|
148
144
|
thread_id = run.get("thread_id")
|
|
149
145
|
if (thread_id in bad_threads) or (thread_id in seen_threads):
|
|
150
146
|
continue
|
|
File without changes
|