ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/type.py
CHANGED
@@ -1,6 +1,21 @@
-from
-from
-from
+from enum import Enum, StrEnum
+from hashlib import md5
+from typing import Any, Dict, List, Literal, Mapping, Optional, Union
+
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    computed_field,
+    model_validator,
+)
+
+
+class CallTracker(BaseModel):
+    tool_call: List = []
+    tool_response: List = []
+    generic: List = []
+    metadata: Dict[str, Any] = Field(default={})


 class EventTypes(StrEnum):
@@ -20,6 +35,16 @@ class ContentType(StrEnum):
     conversational_search = "conversational_search"


+class AttackCategory(StrEnum):
+    on_policy = "on_policy"
+    off_policy = "off_policy"
+
+
+class Roles(Enum):
+    ASSISTANT = "assistant"
+    USER = "user"
+
+
 class ConversationalSearchCitations(BaseModel):
     url: str
     body: str
@@ -51,9 +76,13 @@ class ConversationalConfidenceThresholdScore(BaseModel):
     def table(self):
         return {
             "response_confidence": str(self.response_confidence),
-            "response_confidence_threshold": str(
+            "response_confidence_threshold": str(
+                self.response_confidence_threshold
+            ),
             "retrieval_confidence": str(self.retrieval_confidence),
-            "retrieval_confidence_threshold": str(
+            "retrieval_confidence_threshold": str(
+                self.retrieval_confidence_threshold
+            ),
         }


@@ -79,10 +108,35 @@ class ConversationalSearch(BaseModel):
     response_length_option: str


+class OTelParserFunction(BaseModel):
+    """OpenAI chat completion function structure for OTel parser tool calls"""
+
+    name: str
+    arguments: str  # JSON string of arguments
+
+    model_config = ConfigDict(frozen=True)
+
+    def __str__(self):
+        return f"{self.name}:{self.arguments}"
+
+
+class OTelParserToolCall(BaseModel):
+    """OpenAI chat completion tool call structure for OTel parser"""
+
+    id: str
+    function: OTelParserFunction
+    type: Literal["function"] = "function"
+
+    model_config = ConfigDict(frozen=True)
+
+    def __str__(self):
+        return f"{self.id}:{self.type}:{self.function}"
+
+
 class Message(BaseModel):
     role: str
     content: Union[str, Dict[str, Any]]
-    type: ContentType
+    type: ContentType = None
     # event that produced the message
     event: Optional[str] = None
     # used to correlate the Message with the retrieval context (ConversationalSearch)
@@ -93,7 +147,32 @@ class Message(BaseModel):

 class ExtendedMessage(BaseModel):
     message: Message
-    reason: dict | None = None
+    reason: dict | list | None = None
+
+
+class OTelParserMessage(Message):
+    """Message class for OTel parser with OpenAI-compatible tool call fields.
+
+    Inherits from Message and adds structured tool call fields for compatibility
+    with OpenTelemetry trace parsing (LangGraph, Pydantic AI, etc.)
+    """
+
+    tool_calls: Optional[List[OTelParserToolCall]] = None
+    tool_call_id: Optional[str] = None
+
+    def hash(self) -> str:
+        """Generate hash for message deduplication"""
+        parts = [
+            self.role,
+            str(self.content) if self.content else "",
+            (
+                ":".join(str(tc) for tc in self.tool_calls)
+                if self.tool_calls
+                else ""
+            ),
+            self.tool_call_id or "",
+        ]
+        return md5(":".join(parts).encode("utf-8")).hexdigest()


 class KnowledgeBaseGoalDetail(BaseModel):
@@ -101,19 +180,143 @@ class KnowledgeBaseGoalDetail(BaseModel):
     metrics: list = []


+class MatchingStrategy(StrEnum):
+    """Argument matching strategy:\n
+    Strict: exact match\n
+    Optional: optional argument, exact match if the field exists\n
+    Fuzzy: semantic/similarity match\n"""
+
+    strict = "strict"
+    optional = "optional"
+    fuzzy = "fuzzy"
+
+
 class GoalDetail(BaseModel):
     name: str
-    tool_name: str = None
+    tool_name: Optional[str] = None
     type: ContentType
-    args: Dict = None
-
-
+    args: Optional[Dict] = None
+    # matching strategy defaults to `strict` matching if not specified in the test case
+    arg_matching: Optional[dict[str, MatchingStrategy]] = Field(
+        default_factory=dict
+    )
+    response: Optional[str] = None
+    keywords: Optional[List] = None
+
+    @model_validator(mode="after")
+    def validate_arg_matching(self):
+        for field in self.arg_matching:
+            if field not in self.args:
+                raise ValueError(
+                    f"{field} not in goal arguments for goal {self.name}"
+                )
+        return self
+
+
+class GoalDetailOrchestrate(GoalDetail):
     knowledge_base: KnowledgeBaseGoalDetail = KnowledgeBaseGoalDetail()


-class
+class AttackData(BaseModel):
+    attack_category: AttackCategory
+    attack_type: str
+    attack_name: str
+    attack_instructions: str
+
+
+class AttackData(BaseModel):
     agent: str
-
+    agents_list_or_path: Union[List[str], str]
+    attack_data: AttackData
     story: str
+    starting_sentence: str
+    goals: dict | None = None
+    goal_details: list[GoalDetail] | None = None
+
+
+class DatasetModel(BaseModel):
+    starting_sentence: str | None = None
+    story: str
+    goals: Mapping[str, Any]
     goal_details: List[GoalDetail]
-
+    max_user_turns: int | None = None
+    agent: str | None = None
+
+
+class LangfuseDatasetModel(DatasetModel):
+    @computed_field
+    @property
+    def langfuse_input(self) -> Mapping[str, Any]:
+        input = {
+            "starting_sentence": self.starting_sentence,
+            "story": self.story,
+            "agent": self.agent
+        }
+
+        return input
+
+    @computed_field
+    @property
+    def langfuse_output(self) -> Mapping[str, Any]:
+        output = {"goals": self.goals, "goal_details": self.goal_details}
+
+        return output
+
+
+def _convert_to_langfuse_format(langfuse_row) -> LangfuseDatasetModel:
+    input = langfuse_row.input
+    output = langfuse_row.expected_output
+
+    for goal in output.get("goal_details"):
+        GoalDetail.model_validate(goal)
+
+    return LangfuseDatasetModel(
+        starting_sentence=input.get("starting_sentence"),
+        story=input.get("story"),
+        goals=output.get("goals"),
+        goal_details=[
+            GoalDetail.model_validate(goal)
+            for goal in output.get("goal_details")
+        ],
+    )
+
+
+class OrchestrateDataset(DatasetModel):
+    goal_details: List[GoalDetailOrchestrate]
+    agent: str
+
+
+class LangfuseCollectionModel(BaseModel):
+    collection_name: str
+    datasets: List[LangfuseDatasetModel]
+    collection_description: Optional[str] = ""
+    metadata: Optional[Mapping[str, str]] = None
+
+
+class ToolDefinition(BaseModel):
+    tool_description: Optional[str]
+    tool_name: str
+    tool_params: List[str]
+
+
+class ProviderInstancesCacheKey(BaseModel):
+    provider: str
+    hashed_args: str
+    hashed_kwargs: str
+
+    def __str__(self) -> str:
+        return f"{self.provider}|{self.hashed_args}|{self.hashed_kwargs}"
+
+
+class RuntimeResponse(BaseModel):
+    messages: List[Message]
+    thread_id: str | None = None
+    context: dict = Field(default={})
+
+
+class ExperimentResult(BaseModel):
+    experiment_name: str
+    run_id: str
+    experiment_id: str
+    metrics: list
+    session_ids: List[str]
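
As a quick orientation to the new matching options above, the following is a hedged usage sketch, not taken from the package: it assumes GoalDetail and MatchingStrategy are importable from wxo_agentic_evaluation.type as shown in the hunk, and the goal name, tool name, and argument values are made up. ContentType.text is simply a member visible elsewhere in this diff; real tool-goal test cases may use a different content type.

# Illustrative sketch only (not part of the package diff).
from wxo_agentic_evaluation.type import ContentType, GoalDetail, MatchingStrategy

goal = GoalDetail(
    name="search_flights_goal",        # hypothetical goal name
    tool_name="search_flights",        # hypothetical tool
    type=ContentType.text,             # ContentType.text is shown in this diff; adjust as needed
    args={"origin": "JFK", "destination": "SEA"},
    # anything not listed here falls back to strict matching
    arg_matching={"destination": MatchingStrategy.fuzzy},
)

# The model_validator added above rejects strategies for unknown arguments:
# passing arg_matching={"price": MatchingStrategy.strict} with the args above
# would raise ValueError("price not in goal arguments for goal search_flights_goal").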

wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py
ADDED

@@ -0,0 +1,100 @@
+from wxo_agentic_evaluation.llm_user_v2 import LLMUser
+from wxo_agentic_evaluation.service_provider.portkey_provider import (
+    PortkeyProvider,
+)
+from openai import OpenAI
+import os
+import uuid
+
+from wxo_agentic_evaluation.type import Message, ContentType
+
+user_story = "Your user id is mia_li_3668. You want to fly from New York to Seattle on May 20 (one way). You do not want to fly before 11am est. You want to fly in economy. You prefer direct flights but one stopover also fine. If there are multiple options, you prefer the one with the lowest price. You have 3 baggages. You do not want insurance. You want to use your two certificates to pay. If only one certificate can be used, you prefer using the larger one, and pay the rest with your 7447 card. You are reactive to the agent and will not say anything that is not asked. Your birthday is in your user profile so you do not prefer to provide it."
+
+portkey_client = PortkeyProvider(
+    provider="@openai",
+    model_id="gpt-4o-mini",
+    api_key=os.environ.get("PORTKEY_API_KEY"),
+)
+
+user_response_style = [
+    "reactive to the agent and will not say anything that is not asked",
+    "replies only in very short sentences and few words",
+]
+
+user_agent = LLMUser(
+    llm_client=portkey_client,
+    user_prompt_path="../prompt/universal_user_template.jinja2",
+)
+
+agent = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
+
+def get_agent_response(messages: list[dict]) -> str:
+
+    response = agent.chat.completions.create(
+        model="gpt-4o-mini", messages=messages
+    )
+    return response.choices[0].message.content
+
+
+starting_user_input = Message(
+    role="user", content="I want to fly.", type=ContentType.text
+)
+
+
+agent_system_prompt = Message(
+    role="system",
+    content="You are a helpful assistant. Keep your responses short and concise.",
+    type=ContentType.text,
+)
+
+session_id = str(uuid.uuid4())
+max_turns = 30
+conversation_history = []
+for i in range(max_turns):
+
+    if len(conversation_history) == 0:
+        conversation_history.append(agent_system_prompt)
+        conversation_history.append(
+            Message(
+                role="assistant",
+                content="Hi! How can I help you today?",
+                type=ContentType.text,
+            )
+        )
+
+        user_response = user_agent.generate_user_input(
+            user_story=user_story,
+            conversation_history=conversation_history,
+            user_response_style=user_response_style,
+            starting_user_input=starting_user_input,
+        )
+    else:
+        user_response = user_agent.generate_user_input(
+            user_story=user_story,
+            conversation_history=conversation_history,
+            user_response_style=user_response_style,
+            starting_user_input=None,
+        )
+
+    conversation_history.append(user_response)
+    print(f"User: {user_response.content}")
+
+    if "END" in user_response.content:
+        break
+
+    # Get agent response
+    agent_response_content = get_agent_response(
+        [msg.model_dump() for msg in conversation_history]
+    )
+    # agent_response_content = get_langflow_agent_response(conversation_history, session_id)
+    # agent_response_content = asyncio.run(get_langgraph_agent_response(conversation_history, session_id))
+    print(f"Agent: {agent_response_content}")
+
+    agent_response = Message(
+        role="assistant", content=agent_response_content, type=ContentType.text
+    )
+    conversation_history.append(agent_response)
+
+
+print(conversation_history)

wxo_agentic_evaluation/utils/__init__.py
CHANGED

@@ -1,6 +1,47 @@
 import json
+import os
+import tempfile
+from pathlib import Path

+from wxo_agentic_evaluation.utils.open_ai_tool_extractor import (
+    ToolExtractionOpenAIFormat,
+)
+from wxo_agentic_evaluation.utils.parsers import ReferencelessEvalParser
+from wxo_agentic_evaluation.utils.utils import (
+    N_A,
+    TestCaseResources,
+    add_line_seperator,
+    list_run_files,
+    load_run_metrics,
+)

-
-
-
+
+def json_dump(output_path, obj):
+    """
+    Atomically dump JSON to `output_path`.
+
+    - Writes to a temporary file first
+    - Then atomically replaces the target file
+    - Prevents corrupted/half-written JSON if process is interrupted
+    """
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    fd, tmp_path = tempfile.mkstemp(
+        dir=output_path.parent,
+        prefix=output_path.stem,
+        suffix=".tmp",
+        text=True,
+    )
+    try:
+        with os.fdopen(fd, "w", encoding="utf-8") as f:
+            json.dump(obj, f, indent=4, ensure_ascii=False)
+            f.flush()
+            os.fsync(f.fileno())
+        os.replace(tmp_path, output_path)
+    except Exception:
+        try:
+            os.remove(tmp_path)
+        except OSError:
+            pass
+        raise
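
For orientation, a small hypothetical usage sketch of the new atomic json_dump helper follows; the path and data are made up, and the import assumes the hunk above lands in wxo_agentic_evaluation/utils/__init__.py as the file list suggests.

# Hypothetical usage of the atomic JSON writer shown above.
from wxo_agentic_evaluation.utils import json_dump

run_metrics = {"journey_success": 0.82, "total_tool_calls": 14}  # made-up example data
# Parent directories are created, the payload is written to a temp file,
# fsync'd, then os.replace()'d into place, so readers never see partial JSON.
json_dump("results/run_001/metrics.json", run_metrics)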

wxo_agentic_evaluation/utils/evaluation_discovery.py
ADDED

@@ -0,0 +1,47 @@
+"""
+Evaluation discovery mechanism.
+
+This module provides functionality for discovering classes that inherit from Evaluation.
+"""
+
+import importlib.util
+import inspect
+import os
+
+
+def find_evaluation_subclasses(directory: str, base_class_name="Evaluation"):
+    """
+    Dynamically import Python files under 'directory' and find classes that
+    inherit from a class named 'Evaluation'. Returns a list of non-abstract
+    class objects.
+    """
+    subclasses = []
+
+    for root, _, files in os.walk(directory):
+        for file in files:
+            if file.endswith(".py") and not file.startswith("__"):
+                filepath = os.path.join(root, file)
+                module_name = os.path.splitext(os.path.basename(filepath))[0]
+
+                spec = importlib.util.spec_from_file_location(
+                    module_name, filepath
+                )
+                if spec and spec.loader:
+                    module = importlib.util.module_from_spec(spec)
+                    try:
+                        spec.loader.exec_module(module)
+                    except Exception as e:
+                        print(f"Skipping {filepath} due to import error: {e}")
+                        continue
+
+                    # Inspect for subclasses
+                    for name, obj in inspect.getmembers(
+                        module, inspect.isclass
+                    ):
+                        if any(
+                            base.__name__ == base_class_name
+                            for base in obj.__mro__[1:]
+                        ) and not inspect.isabstract(obj):
+                            subclasses.append(obj)
+
+    return subclasses
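
A hedged sketch of how the discovery helper might be invoked; the directory name is hypothetical and the import path follows the file list above.

# Hypothetical usage of the discovery helper above.
from wxo_agentic_evaluation.utils.evaluation_discovery import find_evaluation_subclasses

# Import every .py file under the folder and collect concrete classes whose
# MRO contains a class literally named "Evaluation".
custom_evaluations = find_evaluation_subclasses("./my_custom_evaluations")
print([cls.__name__ for cls in custom_evaluations])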

wxo_agentic_evaluation/utils/gateway_provider_utils.py
ADDED

@@ -0,0 +1,39 @@
+import os
+from functools import lru_cache
+
+from wxo_agentic_evaluation.arg_configs import AuthConfig
+from wxo_agentic_evaluation.service_provider import USE_GATEWAY_MODEL_PROVIDER
+from wxo_agentic_evaluation.wxo_client import get_wxo_client
+
+WXO_AUTH_CONFIG_DEFAULTS = AuthConfig(
+    url=os.getenv("WXO_URL", "http://localhost:4321"),
+    tenant_name=os.getenv("WXO_TENANT", "wxo-dev"),
+    token=os.getenv("WXO_TOKEN", None),
+)
+
+
+@lru_cache(maxsize=1)
+def _get_cached_wxo_client():
+    # TODO: remove this once the client is implemented as a Singleton.
+    return get_wxo_client(
+        WXO_AUTH_CONFIG_DEFAULTS.url,
+        WXO_AUTH_CONFIG_DEFAULTS.tenant_name,
+        WXO_AUTH_CONFIG_DEFAULTS.token,
+    )
+
+
+def get_provider_kwargs(**base_kwargs: dict) -> dict:
+
+    if not USE_GATEWAY_MODEL_PROVIDER:
+        return base_kwargs
+
+    if "instance_url" in base_kwargs and "token" in base_kwargs:
+        return base_kwargs
+
+    wxo_client = _get_cached_wxo_client()
+
+    return {
+        **base_kwargs,
+        "instance_url": wxo_client.service_url,
+        "token": wxo_client.api_key,
+    }
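
A short, hypothetical call pattern for the gateway helper above; the keyword argument and its value are illustrative only.

# Hypothetical usage of get_provider_kwargs shown above.
from wxo_agentic_evaluation.utils.gateway_provider_utils import get_provider_kwargs

# With the gateway disabled the kwargs pass through untouched; with it enabled,
# instance_url and token are filled in from the cached WXO client unless the
# caller already supplied both.
provider_kwargs = get_provider_kwargs(model_id="example-model-id")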

wxo_agentic_evaluation/utils/messages_parser.py
ADDED

@@ -0,0 +1,30 @@
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+from wxo_agentic_evaluation.type import ContentType, Message
+
+
+class ParsedMessages(BaseModel):
+    """
+    A parsed history of messages.
+    """
+
+    messages: list[Message] = Field(description="The list of messages")
+
+    @property
+    def user_input(self) -> Optional[str]:
+        """Find the original user message."""
+        for message in self.messages:
+            if message.role == "user" and message.type == ContentType.text:
+                return str(message.content)
+        return None
+
+    @property
+    def agent_response(self) -> Optional[str]:
+        """Find the most recent assistant message."""
+        messages_in_reverse = reversed(self.messages)
+        for message in messages_in_reverse:
+            if message.role == "assistant" and message.type == ContentType.text:
+                return str(message.content)
+        return None
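
Finally, a hedged sketch of the ParsedMessages convenience properties; the messages are invented and the import path follows the file list above.

# Hypothetical usage of ParsedMessages shown above.
from wxo_agentic_evaluation.type import ContentType, Message
from wxo_agentic_evaluation.utils.messages_parser import ParsedMessages

parsed = ParsedMessages(
    messages=[
        Message(role="user", content="What is my leave balance?", type=ContentType.text),
        Message(role="assistant", content="You have 12 vacation days left.", type=ContentType.text),
    ]
)
print(parsed.user_input)      # -> "What is my leave balance?"
print(parsed.agent_response)  # -> "You have 12 vacation days left."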