ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
- wxo_agentic_evaluation/analyze_run.py +1025 -220
- wxo_agentic_evaluation/annotate.py +2 -2
- wxo_agentic_evaluation/arg_configs.py +60 -2
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +19 -2
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +25 -7
- wxo_agentic_evaluation/description_quality_checker.py +29 -6
- wxo_agentic_evaluation/evaluation.py +16 -8
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +414 -69
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +5 -4
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +112 -343
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
- wxo_agentic_evaluation/metrics/metrics.py +276 -8
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
- wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +103 -4
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +33 -17
- wxo_agentic_evaluation/record_chat.py +38 -32
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
- wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +26 -17
- wxo_agentic_evaluation/service_provider/__init__.py +145 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/type.py +185 -16
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +313 -9
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
from wxo_agentic_evaluation.evaluation_controller.evaluation_controller import EvaluationController
|
|
2
|
+
from langfuse import get_client
|
|
3
|
+
|
|
4
|
+
from wxo_agentic_evaluation.runtime_adapter.runtime_adapter import RuntimeAdapter
|
|
5
|
+
from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import WXORuntimeAdapter
|
|
6
|
+
from wxo_agentic_evaluation.type import Message, RuntimeResponse
|
|
7
|
+
from wxo_agentic_evaluation.llm_user import LLMUser
|
|
8
|
+
from wxo_agentic_evaluation.llm_user_v2 import LLMUserV2
|
|
9
|
+
from wxo_agentic_evaluation.arg_configs import ControllerConfig
|
|
10
|
+
from wxo_agentic_evaluation.hr_agent_langgraph import agent
|
|
11
|
+
|
|
12
|
+
from dotenv import load_dotenv
|
|
13
|
+
load_dotenv()
|
|
14
|
+
import os
|
|
15
|
+
import base64
|
|
16
|
+
|
|
17
|
+
os.environ["USE_PORTKEY_PROVIDER"] = "true"
|
|
18
|
+
|
|
19
|
+
lf_public = os.getenv("LANGFUSE_PUBLIC_KEY")
|
|
20
|
+
lf_secret = os.getenv("LANGFUSE_SECRET_KEY")
|
|
21
|
+
auth_bytes = f"{lf_public}:{lf_secret}".encode("utf-8")
|
|
22
|
+
auth_b64 = base64.b64encode(auth_bytes).decode("ascii")
|
|
23
|
+
HEADERS = {"Authorization": f"Basic {auth_b64}"}
|
|
24
|
+
|
|
25
|
+
lf_base_url = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com").rstrip("/")
|
|
26
|
+
OTEL_ENDPOINT = f"{lf_base_url}/api/public/otel/v1/traces"
|
|
27
|
+
|
|
28
|
+
from phoenix.otel import register
|
|
29
|
+
register(endpoint=OTEL_ENDPOINT, headers=HEADERS, auto_instrument=True)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
context = {"session_id": "1", "chat_history": []}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class MyAgentWrapper(RuntimeAdapter):
    """Runtime adapter that forwards one user turn to the imported `agent`."""

    def run(
        self,
        user_message: Message,
        context: dict,
        thread_id=None,
    ) -> RuntimeResponse:
        """Invoke the agent with `user_message` and wrap its last message.

        `context` and `thread_id` are accepted but not read by this
        implementation.

        Returns:
            A RuntimeResponse holding a single assistant Message whose content
            is the content of the last message in the agent's result.
        """
        payload = {"messages": [user_message.model_dump()]}
        agent_output = agent.invoke(payload)
        # Only the final message produced by the agent is surfaced.
        final_message = agent_output["messages"][-1]
        reply = Message(role="assistant", content=final_message.content)
        return RuntimeResponse(messages=[reply])
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
agent_wrapper = MyAgentWrapper()
|
|
54
|
+
from openinference.instrumentation import using_session
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class SimulationRunner:
    """Wraps an EvaluationController so it can be used as an experiment task."""

    def __init__(self, user_agent: LLMUser,
                 agent: RuntimeAdapter,
                 config: ControllerConfig):
        """Create the underlying EvaluationController.

        Args:
            user_agent: simulated user passed to the controller as `llm_user`.
            agent: runtime adapter passed to the controller as `runtime`.
            config: controller configuration.
        """
        self.evaluation_controller = EvaluationController(
            runtime=agent,
            llm_user=user_agent,
            config=config,
        )
        # Number of tasks run so far; also used as the suffix of the
        # per-run tracing session id.
        self.counter = 0

    def run_wrapper(self, session_id='session-id-test-00'):
        """Return a task callable bound to this runner.

        Args:
            session_id: base session id; each run traces under
                "<session_id>-<counter>".
        """
        def run_task(*, item, **kwargs):
            """
            Task function for Langfuse experiment.
            Item input should be:
            {"story": "...", "starting_sentence": "...", "agent": "..."}
            """
            run_session_id = f"{session_id}-{self.counter}"
            with using_session(run_session_id):
                # Named `item_input` to avoid shadowing the builtin `input`.
                item_input = item.input
                user_story = item_input.get("story")
                starting_sentence = item_input.get("starting_sentence")
                agent_name = item_input.get("agent")
                _, _, _, thread_id = self.evaluation_controller.run(
                    self.counter,
                    agent_name=agent_name,
                    story=user_story,
                    starting_user_input=starting_sentence,
                )
                self.counter += 1
                if isinstance(self.evaluation_controller.runtime, WXORuntimeAdapter):
                    return thread_id
                # NOTE(review): this returns the base `session_id`, not the
                # "<session_id>-<counter>" value the run was traced under.
                # Kept as-is for compatibility; confirm what callers expect.
                return session_id

        return run_task
|
|
89
|
+
|
|
90
|
+
if __name__ == "__main__":
    import json

    DATASET_NAME = "dataset-test-00"

    # Load the sample test case from disk.
    with open("benchmarks/hr_sample/data_simple.json") as f:
        data = json.load(f)

    langfuse = get_client()
    langfuse.create_dataset(name=DATASET_NAME)

    # Upload to Langfuse: the story and opening line form the item input,
    # the goals form the expected output.
    item_input = {
        "story": data["story"],
        "starting_sentence": data["starting_sentence"],
    }
    item_expected = {
        "goals": data["goals"],
        "goal_details": data["goal_details"],
    }
    langfuse.create_dataset_item(
        dataset_name=DATASET_NAME,
        input=item_input,
        expected_output=item_expected,
    )

    from wxo_agentic_evaluation.service_provider import get_provider

    model_id = "gpt-4o-mini"
    provider = get_provider(
        provider="openai",
        model_id=model_id,
        api_key=os.getenv("OPENAI_API_KEY"),
        use_portkey_provider=True,
    )
    llm_user = LLMUserV2(
        llm_client=provider,
        user_prompt_path="src/wxo_agentic_evaluation/prompt/universal_user_template.jinja2",
    )
    config = ControllerConfig()
    simluation_runner = SimulationRunner(
        agent=agent_wrapper, user_agent=llm_user, config=config
    )
    dataset = langfuse.get_dataset(DATASET_NAME)

    # Run every dataset item through the simulation task.
    result = dataset.run_experiment(
        name="experiment-test-00",
        description="Synthetic conversations from persona/scenario pairs",
        task=simluation_runner.run_wrapper(),
    )

    get_client().flush()

    # Direct single-turn call to the wrapped agent under its own session.
    session_id = "dummy-1"
    with using_session(session_id):
        result = agent_wrapper.run(Message(role="user", content="hi"), context={})
        print(result)
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
|
|
2
2
|
|
|
3
3
|
|
|
4
|
-
|
|
5
4
|
def parse_json_string(input_string):
|
|
6
5
|
json_char_count = 0
|
|
7
6
|
json_objects = []
|
|
@@ -31,9 +30,10 @@ def parse_json_string(input_string):
|
|
|
31
30
|
is_thinking_step = len(input_string) - json_char_count > 10
|
|
32
31
|
return json_objects
|
|
33
32
|
|
|
33
|
+
|
|
34
34
|
wai_client = WatsonXProvider(model_id="meta-llama/llama-3-405b-instruct")
|
|
35
35
|
|
|
36
|
-
prompt =
|
|
36
|
+
prompt = """
|
|
37
37
|
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
|
|
38
38
|
You are trying to make tool calls. Given a raw input and tool output. Try to extract the information to make the tool call
|
|
39
39
|
|
|
@@ -83,12 +83,12 @@ test_sample2 = """
|
|
|
83
83
|
<|start_header_id|>ipython<|end_header_id|>"""
|
|
84
84
|
|
|
85
85
|
|
|
86
|
-
|
|
87
86
|
outputs = wai_client.query(prompt + test_sample1)
|
|
88
87
|
|
|
89
88
|
import json
|
|
89
|
+
|
|
90
90
|
print(outputs["generated_text"])
|
|
91
91
|
|
|
92
92
|
json_obj = parse_json_string(outputs["generated_text"])[0]
|
|
93
93
|
|
|
94
|
-
print(json_obj)
|
|
94
|
+
print(json_obj)
|
wxo_agentic_evaluation/type.py
CHANGED
|
@@ -1,8 +1,21 @@
|
|
|
1
|
-
from enum import StrEnum
|
|
2
|
-
from
|
|
1
|
+
from enum import Enum, StrEnum
|
|
2
|
+
from hashlib import md5
|
|
3
|
+
from typing import Any, Dict, List, Literal, Mapping, Optional, Union
|
|
3
4
|
|
|
4
|
-
from pydantic import
|
|
5
|
-
|
|
5
|
+
from pydantic import (
|
|
6
|
+
BaseModel,
|
|
7
|
+
ConfigDict,
|
|
8
|
+
Field,
|
|
9
|
+
computed_field,
|
|
10
|
+
model_validator,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class CallTracker(BaseModel):
    """Container of lists grouped as tool calls, tool responses and generic entries.

    Every field defaults to a fresh empty container built by a factory, so
    the defaults are never shared literal objects.
    """

    tool_call: List = Field(default_factory=list)
    tool_response: List = Field(default_factory=list)
    generic: List = Field(default_factory=list)
    metadata: Dict[str, Any] = Field(default_factory=dict)
|
|
6
19
|
|
|
7
20
|
|
|
8
21
|
class EventTypes(StrEnum):
|
|
@@ -27,6 +40,11 @@ class AttackCategory(StrEnum):
|
|
|
27
40
|
off_policy = "off_policy"
|
|
28
41
|
|
|
29
42
|
|
|
43
|
+
class Roles(Enum):
    """Role names for the two sides of a conversation."""

    ASSISTANT = "assistant"
    USER = "user"
|
|
46
|
+
|
|
47
|
+
|
|
30
48
|
class ConversationalSearchCitations(BaseModel):
|
|
31
49
|
url: str
|
|
32
50
|
body: str
|
|
@@ -90,10 +108,35 @@ class ConversationalSearch(BaseModel):
|
|
|
90
108
|
response_length_option: str
|
|
91
109
|
|
|
92
110
|
|
|
111
|
+
class OTelParserFunction(BaseModel):
    """Function part of an OpenAI-style tool call, as used by the OTel parser.

    Instances are frozen (immutable after creation).
    """

    name: str
    arguments: str  # JSON string of arguments

    model_config = ConfigDict(frozen=True)

    def __str__(self):
        """Render as "<name>:<arguments>"."""
        return ":".join((self.name, self.arguments))
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class OTelParserToolCall(BaseModel):
    """OpenAI-style tool call record, as used by the OTel parser.

    Instances are frozen (immutable after creation).
    """

    id: str
    function: OTelParserFunction
    type: Literal["function"] = "function"

    model_config = ConfigDict(frozen=True)

    def __str__(self):
        """Render as "<id>:<type>:<function>"."""
        return ":".join((self.id, self.type, str(self.function)))
|
|
134
|
+
|
|
135
|
+
|
|
93
136
|
class Message(BaseModel):
|
|
94
137
|
role: str
|
|
95
138
|
content: Union[str, Dict[str, Any]]
|
|
96
|
-
type: ContentType
|
|
139
|
+
type: ContentType = None
|
|
97
140
|
# event that produced the message
|
|
98
141
|
event: Optional[str] = None
|
|
99
142
|
# used to correlate the Message with the retrieval context (ConversationalSearch)
|
|
@@ -107,18 +150,70 @@ class ExtendedMessage(BaseModel):
|
|
|
107
150
|
reason: dict | list | None = None
|
|
108
151
|
|
|
109
152
|
|
|
153
|
+
class OTelParserMessage(Message):
    """Message variant carrying OpenAI-compatible tool call fields.

    Extends Message with `tool_calls` and `tool_call_id` so that messages
    parsed from OpenTelemetry traces (LangGraph, Pydantic AI, etc.) keep
    their structured tool call information.
    """

    tool_calls: Optional[List[OTelParserToolCall]] = None
    tool_call_id: Optional[str] = None

    def hash(self) -> str:
        """Return an MD5 hex digest identifying this message for deduplication.

        The digest covers role, content, tool calls and tool call id, joined
        with ":"; missing or empty parts contribute an empty string.
        """
        content_part = str(self.content) if self.content else ""
        if self.tool_calls:
            tool_calls_part = ":".join(map(str, self.tool_calls))
        else:
            tool_calls_part = ""
        call_id_part = self.tool_call_id or ""

        fingerprint = ":".join(
            [self.role, content_part, tool_calls_part, call_id_part]
        )
        return md5(fingerprint.encode("utf-8")).hexdigest()
|
|
176
|
+
|
|
177
|
+
|
|
110
178
|
class KnowledgeBaseGoalDetail(BaseModel):
|
|
111
179
|
enabled: bool = False
|
|
112
180
|
metrics: list = []
|
|
113
181
|
|
|
114
182
|
|
|
183
|
+
class MatchingStrategy(StrEnum):
|
|
184
|
+
"""Argument matching strategy:\n
|
|
185
|
+
Strict: exact match\n
|
|
186
|
+
Optional: optional argument, exact match if the field exists\n
|
|
187
|
+
Fuzzy: semantic/similarity match\n"""
|
|
188
|
+
|
|
189
|
+
strict = "strict"
|
|
190
|
+
optional = "optional"
|
|
191
|
+
fuzzy = "fuzzy"
|
|
192
|
+
|
|
193
|
+
|
|
115
194
|
class GoalDetail(BaseModel):
    """One goal of a test case, with optional per-argument matching rules."""

    name: str
    tool_name: Optional[str] = None
    type: ContentType
    args: Optional[Dict] = None
    # matching strategy defaults to `strict` matching if not specified in the test case
    arg_matching: Optional[dict[str, MatchingStrategy]] = Field(
        default_factory=dict
    )
    response: Optional[str] = None
    keywords: Optional[List] = None

    @model_validator(mode="after")
    def validate_arg_matching(self):
        """Ensure every key of `arg_matching` names an entry of `args`.

        Both fields are declared Optional, so None is treated as empty. This
        avoids a raw TypeError from iterating or testing membership on None.

        Raises:
            ValueError: if `arg_matching` names a field missing from `args`.
        """
        known_args = self.args or {}
        for field in self.arg_matching or {}:
            if field not in known_args:
                raise ValueError(
                    f"{field} not in goal arguments for goal {self.name}"
                )
        return self
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
class GoalDetailOrchestrate(GoalDetail):
    """GoalDetail extended with knowledge-base settings."""

    # Defaults to a KnowledgeBaseGoalDetail built with its own defaults.
    knowledge_base: KnowledgeBaseGoalDetail = KnowledgeBaseGoalDetail()
|
|
123
218
|
|
|
124
219
|
|
|
@@ -131,23 +226,97 @@ class AttackData(BaseModel):
|
|
|
131
226
|
|
|
132
227
|
class AttackData(BaseModel):
|
|
133
228
|
agent: str
|
|
134
|
-
|
|
229
|
+
agents_list_or_path: Union[List[str], str]
|
|
135
230
|
attack_data: AttackData
|
|
136
231
|
story: str
|
|
137
232
|
starting_sentence: str
|
|
138
|
-
goals:
|
|
139
|
-
goal_details:
|
|
233
|
+
goals: dict | None = None
|
|
234
|
+
goal_details: list[GoalDetail] | None = None
|
|
140
235
|
|
|
141
236
|
|
|
142
|
-
class
|
|
143
|
-
|
|
144
|
-
goals: Dict
|
|
237
|
+
class DatasetModel(BaseModel):
    """A test case: the user story, its goals and optional run settings.

    `story`, `goals` and `goal_details` have no default; the remaining
    fields default to None.
    """

    starting_sentence: str | None = None
    story: str
    goals: Mapping[str, Any]
    goal_details: List[GoalDetail]
    max_user_turns: int | None = None
    agent: str | None = None
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
class LangfuseDatasetModel(DatasetModel):
    """DatasetModel that also exposes its content as input/output mappings."""

    @computed_field
    @property
    def langfuse_input(self) -> Mapping[str, Any]:
        """Mapping with the starting sentence, story and agent of this test case."""
        return dict(
            starting_sentence=self.starting_sentence,
            story=self.story,
            agent=self.agent,
        )

    @computed_field
    @property
    def langfuse_output(self) -> Mapping[str, Any]:
        """Mapping with the goals and goal details of this test case."""
        return dict(goals=self.goals, goal_details=self.goal_details)
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _convert_to_langfuse_format(langfuse_row) -> LangfuseDatasetModel:
    """Build a LangfuseDatasetModel from a dataset row.

    Args:
        langfuse_row: object exposing `input` and `expected_output`
            mappings. `input` supplies "starting_sentence", "story" and
            optionally "agent"; `expected_output` supplies "goals" and
            "goal_details".

    Returns:
        The populated LangfuseDatasetModel.
    """
    row_input = langfuse_row.input
    expected = langfuse_row.expected_output

    # Validate each goal exactly once and reuse the validated objects.
    goal_details = [
        GoalDetail.model_validate(goal)
        for goal in expected.get("goal_details")
    ]

    return LangfuseDatasetModel(
        starting_sentence=row_input.get("starting_sentence"),
        story=row_input.get("story"),
        # `agent` is part of LangfuseDatasetModel.langfuse_input, so carry it
        # back when the row has it; a missing key yields the None default.
        agent=row_input.get("agent"),
        goals=expected.get("goals"),
        goal_details=goal_details,
    )
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
class OrchestrateDataset(DatasetModel):
    """DatasetModel whose goals are GoalDetailOrchestrate and whose agent is required."""

    goal_details: List[GoalDetailOrchestrate]
    agent: str
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
class LangfuseCollectionModel(BaseModel):
    """A named collection of LangfuseDatasetModel entries."""

    collection_name: str
    datasets: List[LangfuseDatasetModel]
    collection_description: Optional[str] = ""
    metadata: Optional[Mapping[str, str]] = None
|
|
148
294
|
|
|
149
295
|
|
|
150
296
|
class ToolDefinition(BaseModel):
    """Name, description and parameter names of a tool."""

    tool_description: Optional[str]
    tool_name: str
    tool_params: List[str]
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
class ProviderInstancesCacheKey(BaseModel):
    """Key made of a provider name plus hashes of its args and kwargs."""

    provider: str
    hashed_args: str
    hashed_kwargs: str

    def __str__(self) -> str:
        """Render as "<provider>|<hashed_args>|<hashed_kwargs>"."""
        return "|".join((self.provider, self.hashed_args, self.hashed_kwargs))
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
class RuntimeResponse(BaseModel):
    """Messages returned by a runtime, with optional thread id and context."""

    messages: List[Message]
    thread_id: str | None = None
    # Built by a factory so the default is never a shared literal dict.
    context: dict = Field(default_factory=dict)
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
class ExperimentResult(BaseModel):
    """Identifiers, metrics and session ids recorded for one experiment run."""

    experiment_name: str
    run_id: str
    experiment_id: str
    metrics: list
    session_ids: List[str]
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
from wxo_agentic_evaluation.llm_user_v2 import LLMUser
|
|
2
|
+
from wxo_agentic_evaluation.service_provider.portkey_provider import (
|
|
3
|
+
PortkeyProvider,
|
|
4
|
+
)
|
|
5
|
+
from openai import OpenAI
|
|
6
|
+
import os
|
|
7
|
+
import uuid
|
|
8
|
+
|
|
9
|
+
from wxo_agentic_evaluation.type import Message, ContentType
|
|
10
|
+
|
|
11
|
+
user_story = "Your user id is mia_li_3668. You want to fly from New York to Seattle on May 20 (one way). You do not want to fly before 11am est. You want to fly in economy. You prefer direct flights but one stopover also fine. If there are multiple options, you prefer the one with the lowest price. You have 3 baggages. You do not want insurance. You want to use your two certificates to pay. If only one certificate can be used, you prefer using the larger one, and pay the rest with your 7447 card. You are reactive to the agent and will not say anything that is not asked. Your birthday is in your user profile so you do not prefer to provide it."
|
|
12
|
+
|
|
13
|
+
portkey_client = PortkeyProvider(
|
|
14
|
+
provider="@openai",
|
|
15
|
+
model_id="gpt-4o-mini",
|
|
16
|
+
api_key=os.environ.get("PORTKEY_API_KEY"),
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
user_response_style = [
|
|
20
|
+
"reactive to the agent and will not say anything that is not asked",
|
|
21
|
+
"replies only in very short sentences and few words",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
user_agent = LLMUser(
|
|
25
|
+
llm_client=portkey_client,
|
|
26
|
+
user_prompt_path="../prompt/universal_user_template.jinja2",
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
agent = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_agent_response(messages: list[dict]) -> str:
    """Send `messages` to the module-level `agent` client and return the reply text.

    Uses the "gpt-4o-mini" model and returns the content of the first choice.
    """
    completion = agent.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
starting_user_input = Message(
|
|
41
|
+
role="user", content="I want to fly.", type=ContentType.text
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
agent_system_prompt = Message(
|
|
46
|
+
role="system",
|
|
47
|
+
content="You are a helpful assistant. Keep your responses short and concise.",
|
|
48
|
+
type=ContentType.text,
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
session_id = str(uuid.uuid4())
max_turns = 30
conversation_history = []
for _ in range(max_turns):
    # On the first turn the history is seeded with the system prompt and a
    # canned assistant greeting, and the user opens with the fixed sentence.
    is_first_turn = len(conversation_history) == 0
    if is_first_turn:
        conversation_history.append(agent_system_prompt)
        greeting = Message(
            role="assistant",
            content="Hi! How can I help you today?",
            type=ContentType.text,
        )
        conversation_history.append(greeting)

    user_response = user_agent.generate_user_input(
        user_story=user_story,
        conversation_history=conversation_history,
        user_response_style=user_response_style,
        starting_user_input=starting_user_input if is_first_turn else None,
    )

    conversation_history.append(user_response)
    print(f"User: {user_response.content}")

    # The simulated user signals completion with "END".
    if "END" in user_response.content:
        break

    # Get agent response
    serialized_history = [msg.model_dump() for msg in conversation_history]
    agent_response_content = get_agent_response(serialized_history)
    print(f"Agent: {agent_response_content}")

    conversation_history.append(
        Message(
            role="assistant",
            content=agent_response_content,
            type=ContentType.text,
        )
    )


print(conversation_history)
|
|
@@ -1,6 +1,47 @@
|
|
|
1
1
|
import json
|
|
2
|
+
import os
|
|
3
|
+
import tempfile
|
|
4
|
+
from pathlib import Path
|
|
2
5
|
|
|
6
|
+
from wxo_agentic_evaluation.utils.open_ai_tool_extractor import (
|
|
7
|
+
ToolExtractionOpenAIFormat,
|
|
8
|
+
)
|
|
9
|
+
from wxo_agentic_evaluation.utils.parsers import ReferencelessEvalParser
|
|
10
|
+
from wxo_agentic_evaluation.utils.utils import (
|
|
11
|
+
N_A,
|
|
12
|
+
TestCaseResources,
|
|
13
|
+
add_line_seperator,
|
|
14
|
+
list_run_files,
|
|
15
|
+
load_run_metrics,
|
|
16
|
+
)
|
|
3
17
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
18
|
+
|
|
19
|
+
def json_dump(output_path, obj):
    """
    Atomically dump JSON to `output_path`.

    - Writes to a temporary file in the target's directory first
    - Flushes and fsyncs it, then atomically replaces the target file
    - Prevents corrupted/half-written JSON if process is interrupted

    Args:
        output_path: destination path (str or Path); missing parent
            directories are created.
        obj: JSON-serialisable object to write (indent=4, non-ASCII kept).

    Raises:
        Whatever serialisation or the filesystem raises. The temporary file
        is removed first and an existing target is left untouched.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # The temp file lives next to the target so os.replace() stays within
    # one directory.
    fd, tmp_path = tempfile.mkstemp(
        dir=output_path.parent,
        prefix=output_path.stem,
        suffix=".tmp",
        text=True,
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(obj, f, indent=4, ensure_ascii=False)
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp_path, output_path)
    except BaseException:
        # BaseException rather than Exception: KeyboardInterrupt/SystemExit
        # during the write must not leave an orphaned temp file behind.
        try:
            os.remove(tmp_path)
        except OSError:
            # Best effort: the temp file may already be gone.
            pass
        raise
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Evaluation discovery mechanism.
|
|
3
|
+
|
|
4
|
+
This module provides functionality for discovering classes that inherit from Evaluation.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import importlib.util
|
|
8
|
+
import inspect
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def find_evaluation_subclasses(directory: str, base_class_name="Evaluation"):
    """
    Walk `directory`, import each Python file found, and collect the
    non-abstract classes that have an ancestor named `base_class_name`.

    Files whose name starts with "__" are skipped. A file that raises on
    import is reported with print() and skipped. Returns the list of
    matching class objects.
    """
    discovered = []

    for dirpath, _, filenames in os.walk(directory):
        for filename in filenames:
            # Only regular modules: skip non-.py files and dunder files.
            if not filename.endswith(".py") or filename.startswith("__"):
                continue

            filepath = os.path.join(dirpath, filename)
            module_name, _ext = os.path.splitext(os.path.basename(filepath))

            spec = importlib.util.spec_from_file_location(module_name, filepath)
            if not (spec and spec.loader):
                continue

            module = importlib.util.module_from_spec(spec)
            try:
                spec.loader.exec_module(module)
            except Exception as e:
                print(f"Skipping {filepath} due to import error: {e}")
                continue

            # Inspect for subclasses; the class itself (mro[0]) is excluded
            # so the base class does not match on its own name.
            for _name, candidate in inspect.getmembers(module, inspect.isclass):
                if inspect.isabstract(candidate):
                    continue
                ancestor_names = {
                    base.__name__ for base in candidate.__mro__[1:]
                }
                if base_class_name in ancestor_names:
                    discovered.append(candidate)

    return discovered
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from functools import lru_cache
|
|
3
|
+
|
|
4
|
+
from wxo_agentic_evaluation.arg_configs import AuthConfig
|
|
5
|
+
from wxo_agentic_evaluation.service_provider import USE_GATEWAY_MODEL_PROVIDER
|
|
6
|
+
from wxo_agentic_evaluation.wxo_client import get_wxo_client
|
|
7
|
+
|
|
8
|
+
WXO_AUTH_CONFIG_DEFAULTS = AuthConfig(
|
|
9
|
+
url=os.getenv("WXO_URL", "http://localhost:4321"),
|
|
10
|
+
tenant_name=os.getenv("WXO_TENANT", "wxo-dev"),
|
|
11
|
+
token=os.getenv("WXO_TOKEN", None),
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@lru_cache(maxsize=1)
def _get_cached_wxo_client():
    """Return the WXO client built from WXO_AUTH_CONFIG_DEFAULTS.

    The result is memoised by lru_cache, so the client is created once.
    """
    # TODO: remove this once the client is implemented as a Singleton.
    defaults = WXO_AUTH_CONFIG_DEFAULTS
    return get_wxo_client(defaults.url, defaults.tenant_name, defaults.token)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_provider_kwargs(**base_kwargs: dict) -> dict:
    """Return provider kwargs, adding gateway credentials when needed.

    `base_kwargs` is returned unchanged when USE_GATEWAY_MODEL_PROVIDER is
    falsy, or when it already has both "instance_url" and "token".
    Otherwise a new dict is returned with "instance_url" and "token" taken
    from the cached WXO client.
    """
    has_explicit_credentials = (
        "instance_url" in base_kwargs and "token" in base_kwargs
    )
    if not USE_GATEWAY_MODEL_PROVIDER or has_explicit_credentials:
        return base_kwargs

    client = _get_cached_wxo_client()

    enriched = dict(base_kwargs)
    enriched["instance_url"] = client.service_url
    enriched["token"] = client.api_key
    return enriched
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
from wxo_agentic_evaluation.type import ContentType, Message
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ParsedMessages(BaseModel):
    """
    A parsed history of messages.
    """

    messages: list[Message] = Field(description="The list of messages")

    @property
    def user_input(self) -> Optional[str]:
        """Content of the first text message with role "user", or None."""
        return next(
            (
                str(msg.content)
                for msg in self.messages
                if msg.role == "user" and msg.type == ContentType.text
            ),
            None,
        )

    @property
    def agent_response(self) -> Optional[str]:
        """Content of the last text message with role "assistant", or None."""
        return next(
            (
                str(msg.content)
                for msg in reversed(self.messages)
                if msg.role == "assistant" and msg.type == ContentType.text
            ),
            None,
        )
|