ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,125 @@
+ from wxo_agentic_evaluation.evaluation_controller.evaluation_controller import EvaluationController
+ from langfuse import get_client
+
+ from wxo_agentic_evaluation.runtime_adapter.runtime_adapter import RuntimeAdapter
+ from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import WXORuntimeAdapter
+ from wxo_agentic_evaluation.type import Message, RuntimeResponse
+ from wxo_agentic_evaluation.llm_user import LLMUser
+ from wxo_agentic_evaluation.llm_user_v2 import LLMUserV2
+ from wxo_agentic_evaluation.arg_configs import ControllerConfig
+ from wxo_agentic_evaluation.hr_agent_langgraph import agent
+
+ from dotenv import load_dotenv
+ load_dotenv()
+ import os
+ import base64
+
+ os.environ["USE_PORTKEY_PROVIDER"] = "true"
+
+ lf_public = os.getenv("LANGFUSE_PUBLIC_KEY")
+ lf_secret = os.getenv("LANGFUSE_SECRET_KEY")
+ auth_bytes = f"{lf_public}:{lf_secret}".encode("utf-8")
+ auth_b64 = base64.b64encode(auth_bytes).decode("ascii")
+ HEADERS = {"Authorization": f"Basic {auth_b64}"}
+
+ lf_base_url = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com").rstrip("/")
+ OTEL_ENDPOINT = f"{lf_base_url}/api/public/otel/v1/traces"
+
+ from phoenix.otel import register
+ register(endpoint=OTEL_ENDPOINT, headers=HEADERS, auto_instrument=True)
+
+
+ context = {"session_id": "1", "chat_history": []}
+
+
+ class MyAgentWrapper(RuntimeAdapter):
+     def run(
+         self,
+         user_message: Message,
+         context: dict,
+         thread_id=None,
+     ) -> RuntimeResponse:
+
+         message_json = user_message.model_dump()
+         messages = {"messages": [ message_json ]}
+         result = agent.invoke(messages)
+         # print(result)
+         message = Message(role="assistant", content=result["messages"][-1].content)
+         # messages = [Message(role="assistant", content=msg.content, type="tool_call") for msg in result["messages"]]
+         return RuntimeResponse(messages=[message])
+
+
+
+ agent_wrapper = MyAgentWrapper()
+ from openinference.instrumentation import using_session
+
+
+ class SimulationRunner:
+     def __init__(self, user_agent: LLMUser,
+                  agent: RuntimeAdapter,
+                  config: ControllerConfig):
+         self.evaluation_controller = EvaluationController(
+             runtime=agent,
+             llm_user=user_agent,
+             config=config,
+         )
+         self.counter = 0
+
+
+     def run_wrapper(self, session_id = 'session-id-test-00'):
+         def run_task(*, item, **kwargs):
+             """
+             Task function for Langfuse experiment.
+             Item input should be: {"persona": "...", "scenario": "..."}
+             """
+             # print(item)
+             with using_session(session_id + "-" + self.counter.__str__()):
+                 input = item.input
+                 user_story = input.get("story")
+                 starting_sentence = input.get("starting_sentence")
+                 agent_name = input.get("agent")
+                 _, _, _, thread_id = self.evaluation_controller.run(self.counter, agent_name=agent_name, story=user_story, starting_user_input=starting_sentence)
+                 self.counter += 1
+                 if isinstance(self.evaluation_controller.runtime, WXORuntimeAdapter):
+                     return thread_id
+                 return session_id
+
+
+         return run_task
+
+ if __name__ == "__main__":
+     import json
+     with open("benchmarks/hr_sample/data_simple.json") as f:
+         data = json.load(f)
+     langfuse = get_client()
+     langfuse.create_dataset(name="dataset-test-00")
+     # Upload to Langfuse
+
+     langfuse.create_dataset_item(
+         dataset_name="dataset-test-00",
+         # any python object or value
+         input={"story": data["story"], "starting_sentence": data["starting_sentence"]},
+         # any python object or value, optional
+         expected_output={"goals": data["goals"], "goal_details": data["goal_details"]},
+     )
+     from wxo_agentic_evaluation.service_provider import get_provider
+
+     model_id = "gpt-4o-mini"
+     provider = get_provider(provider="openai", model_id=model_id, api_key=os.getenv("OPENAI_API_KEY"),
+                             use_portkey_provider=True)
+     llm_user = LLMUserV2(llm_client=provider, user_prompt_path="src/wxo_agentic_evaluation/prompt/universal_user_template.jinja2")
+     config = ControllerConfig()
+     simluation_runner = SimulationRunner(agent = agent_wrapper, user_agent=llm_user, config=config)
+     dataset = langfuse.get_dataset("dataset-test-00")
+
+     result = dataset.run_experiment(
+         name="experiment-test-00",
+         description="Synthetic conversations from persona/scenario pairs",
+         task=simluation_runner.run_wrapper()
+     )
+
+     get_client().flush()
+     session_id = "dummy-1"
+     with using_session(session_id):
+         result = agent_wrapper.run(Message(role="user", content="hi"), context={})
+         print(result)
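The hunk above wires a Langfuse dataset experiment to an EvaluationController: run_experiment calls the returned task once per dataset item, and the only integration point for a custom backend is a RuntimeAdapter whose run method returns a RuntimeResponse. A minimal sketch of that contract, for orientation only; EchoAdapter is hypothetical and stands in for any agent backend, while the Message/RuntimeResponse shapes are taken from this diff:

# illustrative only -- not part of the diff
from wxo_agentic_evaluation.runtime_adapter.runtime_adapter import RuntimeAdapter
from wxo_agentic_evaluation.type import Message, RuntimeResponse

class EchoAdapter(RuntimeAdapter):
    def run(self, user_message: Message, context: dict, thread_id=None) -> RuntimeResponse:
        # any agent call can go here; this sketch simply echoes the user input back
        reply = Message(role="assistant", content=f"echo: {user_message.content}")
        return RuntimeResponse(messages=[reply])

# SimulationRunner(agent=EchoAdapter(), user_agent=llm_user, config=ControllerConfig())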
@@ -1,7 +1,6 @@
  from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider


-
  def parse_json_string(input_string):
      json_char_count = 0
      json_objects = []
@@ -31,9 +30,10 @@ def parse_json_string(input_string):
      is_thinking_step = len(input_string) - json_char_count > 10
      return json_objects

+
  wai_client = WatsonXProvider(model_id="meta-llama/llama-3-405b-instruct")

- prompt = """
+ prompt = """
  <|begin_of_text|><|start_header_id|>system<|end_header_id|>
  You are trying to make tool calls. Given a raw input and tool output. Try to extract the information to make the tool call

@@ -83,12 +83,12 @@ test_sample2 = """
  <|start_header_id|>ipython<|end_header_id|>"""


-
  outputs = wai_client.query(prompt + test_sample1)

  import json
+
  print(outputs["generated_text"])

  json_obj = parse_json_string(outputs["generated_text"])[0]

- print(json_obj)
+ print(json_obj)
@@ -1,8 +1,21 @@
- from enum import StrEnum
- from typing import Any, Dict, List, Optional, Union
+ from enum import Enum, StrEnum
+ from hashlib import md5
+ from typing import Any, Dict, List, Literal, Mapping, Optional, Union

- from pydantic import BaseModel, ConfigDict, Field
- from rich.text import Text
+ from pydantic import (
+     BaseModel,
+     ConfigDict,
+     Field,
+     computed_field,
+     model_validator,
+ )
+
+
+ class CallTracker(BaseModel):
+     tool_call: List = []
+     tool_response: List = []
+     generic: List = []
+     metadata: Dict[str, Any] = Field(default={})


  class EventTypes(StrEnum):
@@ -27,6 +40,11 @@ class AttackCategory(StrEnum):
      off_policy = "off_policy"


+ class Roles(Enum):
+     ASSISTANT = "assistant"
+     USER = "user"
+
+
  class ConversationalSearchCitations(BaseModel):
      url: str
      body: str
@@ -90,10 +108,35 @@ class ConversationalSearch(BaseModel):
      response_length_option: str


+ class OTelParserFunction(BaseModel):
+     """OpenAI chat completion function structure for OTel parser tool calls"""
+
+     name: str
+     arguments: str # JSON string of arguments
+
+     model_config = ConfigDict(frozen=True)
+
+     def __str__(self):
+         return f"{self.name}:{self.arguments}"
+
+
+ class OTelParserToolCall(BaseModel):
+     """OpenAI chat completion tool call structure for OTel parser"""
+
+     id: str
+     function: OTelParserFunction
+     type: Literal["function"] = "function"
+
+     model_config = ConfigDict(frozen=True)
+
+     def __str__(self):
+         return f"{self.id}:{self.type}:{self.function}"
+
+
  class Message(BaseModel):
      role: str
      content: Union[str, Dict[str, Any]]
-     type: ContentType
+     type: ContentType = None
      # event that produced the message
      event: Optional[str] = None
      # used to correlate the Message with the retrieval context (ConversationalSearch)
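These two frozen models mirror the OpenAI tool-call wire format, and their __str__ output is what the hash-based deduplication added later in this file concatenates. A tiny illustrative sketch; the id, tool name, and arguments are invented, not taken from the diff:

# illustrative only -- not part of the diff
tc = OTelParserToolCall(
    id="call_1",
    function=OTelParserFunction(name="get_weather", arguments='{"city": "Paris"}'),
)
print(str(tc))  # -> 'call_1:function:get_weather:{"city": "Paris"}'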
@@ -107,18 +150,70 @@ class ExtendedMessage(BaseModel):
      reason: dict | list | None = None


+ class OTelParserMessage(Message):
+     """Message class for OTel parser with OpenAI-compatible tool call fields.
+
+     Inherits from Message and adds structured tool call fields for compatibility
+     with OpenTelemetry trace parsing (LangGraph, Pydantic AI, etc.)
+     """
+
+     tool_calls: Optional[List[OTelParserToolCall]] = None
+     tool_call_id: Optional[str] = None
+
+     def hash(self) -> str:
+         """Generate hash for message deduplication"""
+         parts = [
+             self.role,
+             str(self.content) if self.content else "",
+             (
+                 ":".join(str(tc) for tc in self.tool_calls)
+                 if self.tool_calls
+                 else ""
+             ),
+             self.tool_call_id or "",
+         ]
+         return md5(":".join(parts).encode("utf-8")).hexdigest()
+
+
  class KnowledgeBaseGoalDetail(BaseModel):
      enabled: bool = False
      metrics: list = []


+ class MatchingStrategy(StrEnum):
+     """Argument matching strategy:\n
+     Strict: exact match\n
+     Optional: optional argument, exact match if the field exists\n
+     Fuzzy: semantic/similarity match\n"""
+
+     strict = "strict"
+     optional = "optional"
+     fuzzy = "fuzzy"
+
+
  class GoalDetail(BaseModel):
      name: str
-     tool_name: str = None
+     tool_name: Optional[str] = None
      type: ContentType
-     args: Dict = None
-     response: str = None
-     keywords: List = None
+     args: Optional[Dict] = None
+     # matching strategy defaults to `strict` matching if not specified in the test case
+     arg_matching: Optional[dict[str, MatchingStrategy]] = Field(
+         default_factory=dict
+     )
+     response: Optional[str] = None
+     keywords: Optional[List] = None
+
+     @model_validator(mode="after")
+     def validate_arg_matching(self):
+         for field in self.arg_matching:
+             if field not in self.args:
+                 raise ValueError(
+                     f"{field} not in goal arguments for goal {self.name}"
+                 )
+         return self
+
+
+ class GoalDetailOrchestrate(GoalDetail):
      knowledge_base: KnowledgeBaseGoalDetail = KnowledgeBaseGoalDetail()


@@ -131,23 +226,97 @@ class AttackData(BaseModel):

  class AttackData(BaseModel):
      agent: str
-     agents_path: str
+     agents_list_or_path: Union[List[str], str]
      attack_data: AttackData
      story: str
      starting_sentence: str
-     goals: Dict = None
-     goal_details: List[GoalDetail] = None
+     goals: dict | None = None
+     goal_details: list[GoalDetail] | None = None


- class EvaluationData(BaseModel):
-     agent: str
-     goals: Dict
+ class DatasetModel(BaseModel):
+     starting_sentence: str | None = None
      story: str
+     goals: Mapping[str, Any]
      goal_details: List[GoalDetail]
-     starting_sentence: str = None
+     max_user_turns: int | None = None
+     agent: str | None = None
+
+
+ class LangfuseDatasetModel(DatasetModel):
+     @computed_field
+     @property
+     def langfuse_input(self) -> Mapping[str, Any]:
+         input = {
+             "starting_sentence": self.starting_sentence,
+             "story": self.story,
+             "agent": self.agent
+         }
+
+         return input
+
+     @computed_field
+     @property
+     def langfuse_output(self) -> Mapping[str, Any]:
+         output = {"goals": self.goals, "goal_details": self.goal_details}
+
+         return output
+
+
+ def _convert_to_langfuse_format(langfuse_row) -> LangfuseDatasetModel:
+     input = langfuse_row.input
+     output = langfuse_row.expected_output
+
+     for goal in output.get("goal_details"):
+         GoalDetail.model_validate(goal)
+
+     return LangfuseDatasetModel(
+         starting_sentence=input.get("starting_sentence"),
+         story=input.get("story"),
+         goals=output.get("goals"),
+         goal_details=[
+             GoalDetail.model_validate(goal)
+             for goal in output.get("goal_details")
+         ],
+     )
+
+
+ class OrchestrateDataset(DatasetModel):
+     goal_details: List[GoalDetailOrchestrate]
+     agent: str
+
+
+ class LangfuseCollectionModel(BaseModel):
+     collection_name: str
+     datasets: List[LangfuseDatasetModel]
+     collection_description: Optional[str] = ""
+     metadata: Optional[Mapping[str, str]] = None


  class ToolDefinition(BaseModel):
      tool_description: Optional[str]
      tool_name: str
      tool_params: List[str]
+
+
+ class ProviderInstancesCacheKey(BaseModel):
+     provider: str
+     hashed_args: str
+     hashed_kwargs: str
+
+     def __str__(self) -> str:
+         return f"{self.provider}|{self.hashed_args}|{self.hashed_kwargs}"
+
+
+ class RuntimeResponse(BaseModel):
+     messages: List[Message]
+     thread_id: str | None = None
+     context: dict = Field(default={})
+
+
+ class ExperimentResult(BaseModel):
+     experiment_name: str
+     run_id: str
+     experiment_id: str
+     metrics: list
+     session_ids: List[str]
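The new MatchingStrategy and per-argument arg_matching fields on GoalDetail (wxo_agentic_evaluation/type.py, per the file list) let a test case relax matching argument by argument, and the model_validator rejects any arg_matching key that is not also a goal argument. A hedged sketch; the tool name and arguments are invented, and ContentType.tool_call is assumed from the type="tool_call" usage elsewhere in this diff:

# illustrative only -- not part of the diff
from wxo_agentic_evaluation.type import ContentType, GoalDetail, MatchingStrategy

goal = GoalDetail(
    name="lookup_weather",
    tool_name="get_weather",
    type=ContentType.tool_call,  # assumed member name
    args={"city": "Paris", "units": "metric"},
    # "city" must match exactly; "units" may be matched semantically
    arg_matching={"city": MatchingStrategy.strict, "units": MatchingStrategy.fuzzy},
)

# an arg_matching key that is not a goal argument raises a ValueError:
# GoalDetail(name="bad", type=ContentType.tool_call, args={}, arg_matching={"city": "strict"})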
@@ -0,0 +1,100 @@
+ from wxo_agentic_evaluation.llm_user_v2 import LLMUser
+ from wxo_agentic_evaluation.service_provider.portkey_provider import (
+     PortkeyProvider,
+ )
+ from openai import OpenAI
+ import os
+ import uuid
+
+ from wxo_agentic_evaluation.type import Message, ContentType
+
+ user_story = "Your user id is mia_li_3668. You want to fly from New York to Seattle on May 20 (one way). You do not want to fly before 11am est. You want to fly in economy. You prefer direct flights but one stopover also fine. If there are multiple options, you prefer the one with the lowest price. You have 3 baggages. You do not want insurance. You want to use your two certificates to pay. If only one certificate can be used, you prefer using the larger one, and pay the rest with your 7447 card. You are reactive to the agent and will not say anything that is not asked. Your birthday is in your user profile so you do not prefer to provide it."
+
+ portkey_client = PortkeyProvider(
+     provider="@openai",
+     model_id="gpt-4o-mini",
+     api_key=os.environ.get("PORTKEY_API_KEY"),
+ )
+
+ user_response_style = [
+     "reactive to the agent and will not say anything that is not asked",
+     "replies only in very short sentences and few words",
+ ]
+
+ user_agent = LLMUser(
+     llm_client=portkey_client,
+     user_prompt_path="../prompt/universal_user_template.jinja2",
+ )
+
+ agent = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
+
+ def get_agent_response(messages: list[dict]) -> str:
+
+     response = agent.chat.completions.create(
+         model="gpt-4o-mini", messages=messages
+     )
+     return response.choices[0].message.content
+
+
+ starting_user_input = Message(
+     role="user", content="I want to fly.", type=ContentType.text
+ )
+
+
+ agent_system_prompt = Message(
+     role="system",
+     content="You are a helpful assistant. Keep your responses short and concise.",
+     type=ContentType.text,
+ )
+
+ session_id = str(uuid.uuid4())
+ max_turns = 30
+ conversation_history = []
+ for i in range(max_turns):
+
+     if len(conversation_history) == 0:
+         conversation_history.append(agent_system_prompt)
+         conversation_history.append(
+             Message(
+                 role="assistant",
+                 content="Hi! How can I help you today?",
+                 type=ContentType.text,
+             )
+         )
+
+         user_response = user_agent.generate_user_input(
+             user_story=user_story,
+             conversation_history=conversation_history,
+             user_response_style=user_response_style,
+             starting_user_input=starting_user_input,
+         )
+     else:
+         user_response = user_agent.generate_user_input(
+             user_story=user_story,
+             conversation_history=conversation_history,
+             user_response_style=user_response_style,
+             starting_user_input=None,
+         )
+
+     conversation_history.append(user_response)
+     print(f"User: {user_response.content}")
+
+     if "END" in user_response.content:
+         break
+
+     # Get agent response
+     agent_response_content = get_agent_response(
+         [msg.model_dump() for msg in conversation_history]
+     )
+     # agent_response_content = get_langflow_agent_response(conversation_history, session_id)
+     # agent_response_content = asyncio.run(get_langgraph_agent_response(conversation_history, session_id))
+     print(f"Agent: {agent_response_content}")
+
+     agent_response = Message(
+         role="assistant", content=agent_response_content, type=ContentType.text
+     )
+     conversation_history.append(agent_response)
+
+
+ print(conversation_history)
@@ -1,6 +1,47 @@
  import json
+ import os
+ import tempfile
+ from pathlib import Path

+ from wxo_agentic_evaluation.utils.open_ai_tool_extractor import (
+     ToolExtractionOpenAIFormat,
+ )
+ from wxo_agentic_evaluation.utils.parsers import ReferencelessEvalParser
+ from wxo_agentic_evaluation.utils.utils import (
+     N_A,
+     TestCaseResources,
+     add_line_seperator,
+     list_run_files,
+     load_run_metrics,
+ )

- def json_dump(output_path, object):
-     with open(output_path, "w", encoding="utf-8") as f:
-         json.dump(object, f, indent=4)
+
+ def json_dump(output_path, obj):
+     """
+     Atomically dump JSON to `output_path`.
+
+     - Writes to a temporary file first
+     - Then atomically replaces the target file
+     - Prevents corrupted/half-written JSON if process is interrupted
+     """
+     output_path = Path(output_path)
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+
+     fd, tmp_path = tempfile.mkstemp(
+         dir=output_path.parent,
+         prefix=output_path.stem,
+         suffix=".tmp",
+         text=True,
+     )
+     try:
+         with os.fdopen(fd, "w", encoding="utf-8") as f:
+             json.dump(obj, f, indent=4, ensure_ascii=False)
+             f.flush()
+             os.fsync(f.fileno())
+         os.replace(tmp_path, output_path)
+     except Exception:
+         try:
+             os.remove(tmp_path)
+         except OSError:
+             pass
+         raise
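The rewritten json_dump keeps the old two-argument call shape but now writes through a temp file and os.replace, so an interrupted run can no longer leave a truncated results file. A usage sketch, assuming this hunk is wxo_agentic_evaluation/utils/__init__.py (consistent with the +44/-3 entry in the file list); the path and payload are invented:

# illustrative only -- not part of the diff
from wxo_agentic_evaluation.utils import json_dump

metrics = {"journey_success": 0.82, "tool_calls_matched": 17}
# parent directories are created if missing; the replace is atomic on the same filesystem
json_dump("results/run_001/summary_metrics.json", metrics)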
@@ -0,0 +1,47 @@
+ """
+ Evaluation discovery mechanism.
+
+ This module provides functionality for discovering classes that inherit from Evaluation.
+ """
+
+ import importlib.util
+ import inspect
+ import os
+
+
+ def find_evaluation_subclasses(directory: str, base_class_name="Evaluation"):
+     """
+     Dynamically import Python files under 'directory' and find classes that
+     inherit from a class named 'Evaluation'. Returns a list of non-abstract
+     class objects.
+     """
+     subclasses = []
+
+     for root, _, files in os.walk(directory):
+         for file in files:
+             if file.endswith(".py") and not file.startswith("__"):
+                 filepath = os.path.join(root, file)
+                 module_name = os.path.splitext(os.path.basename(filepath))[0]
+
+                 spec = importlib.util.spec_from_file_location(
+                     module_name, filepath
+                 )
+                 if spec and spec.loader:
+                     module = importlib.util.module_from_spec(spec)
+                     try:
+                         spec.loader.exec_module(module)
+                     except Exception as e:
+                         print(f"Skipping {filepath} due to import error: {e}")
+                         continue
+
+                     # Inspect for subclasses
+                     for name, obj in inspect.getmembers(
+                         module, inspect.isclass
+                     ):
+                         if any(
+                             base.__name__ == base_class_name
+                             for base in obj.__mro__[1:]
+                         ) and not inspect.isabstract(obj):
+                             subclasses.append(obj)
+
+     return subclasses
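This discovery helper walks a directory, imports each .py file in isolation, and keeps every concrete class whose MRO contains a class literally named "Evaluation", so user-supplied evaluations can be picked up without explicit registration. A usage sketch, assuming this hunk is wxo_agentic_evaluation/utils/evaluation_discovery.py (the +47 entry in the file list); the plugin directory is hypothetical:

# illustrative only -- not part of the diff
from wxo_agentic_evaluation.utils.evaluation_discovery import find_evaluation_subclasses

custom_evaluations = find_evaluation_subclasses("./my_custom_evaluations")
for cls in custom_evaluations:
    print(f"discovered evaluation: {cls.__module__}.{cls.__name__}")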
@@ -0,0 +1,39 @@
+ import os
+ from functools import lru_cache
+
+ from wxo_agentic_evaluation.arg_configs import AuthConfig
+ from wxo_agentic_evaluation.service_provider import USE_GATEWAY_MODEL_PROVIDER
+ from wxo_agentic_evaluation.wxo_client import get_wxo_client
+
+ WXO_AUTH_CONFIG_DEFAULTS = AuthConfig(
+     url=os.getenv("WXO_URL", "http://localhost:4321"),
+     tenant_name=os.getenv("WXO_TENANT", "wxo-dev"),
+     token=os.getenv("WXO_TOKEN", None),
+ )
+
+
+ @lru_cache(maxsize=1)
+ def _get_cached_wxo_client():
+     # TODO: remove this once the client is implemented as a Singleton.
+     return get_wxo_client(
+         WXO_AUTH_CONFIG_DEFAULTS.url,
+         WXO_AUTH_CONFIG_DEFAULTS.tenant_name,
+         WXO_AUTH_CONFIG_DEFAULTS.token,
+     )
+
+
+ def get_provider_kwargs(**base_kwargs: dict) -> dict:
+
+     if not USE_GATEWAY_MODEL_PROVIDER:
+         return base_kwargs
+
+     if "instance_url" in base_kwargs and "token" in base_kwargs:
+         return base_kwargs
+
+     wxo_client = _get_cached_wxo_client()
+
+     return {
+         **base_kwargs,
+         "instance_url": wxo_client.service_url,
+         "token": wxo_client.api_key,
+     }
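get_provider_kwargs is a pass-through when USE_GATEWAY_MODEL_PROVIDER is off; otherwise it merges the cached wxo client's instance_url and token into the kwargs unless the caller already supplied both. A sketch of how provider construction might use it, assuming this hunk is wxo_agentic_evaluation/utils/gateway_provider_utils.py (the +39 entry in the file list); the model id and extra kwargs are invented:

# illustrative only -- not part of the diff
from wxo_agentic_evaluation.utils.gateway_provider_utils import get_provider_kwargs

kwargs = get_provider_kwargs(model_id="gpt-4o-mini", temperature=0.0)
# with the gateway flag on, kwargs now also carries instance_url/token from the cached client
print(sorted(kwargs))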
@@ -0,0 +1,30 @@
+ from typing import Optional
+
+ from pydantic import BaseModel, Field
+
+ from wxo_agentic_evaluation.type import ContentType, Message
+
+
+ class ParsedMessages(BaseModel):
+     """
+     A parsed history of messages.
+     """
+
+     messages: list[Message] = Field(description="The list of messages")
+
+     @property
+     def user_input(self) -> Optional[str]:
+         """Find the original user message."""
+         for message in self.messages:
+             if message.role == "user" and message.type == ContentType.text:
+                 return str(message.content)
+         return None
+
+     @property
+     def agent_response(self) -> Optional[str]:
+         """Find the most recent assistant message."""
+         messages_in_reverse = reversed(self.messages)
+         for message in messages_in_reverse:
+             if message.role == "assistant" and message.type == ContentType.text:
+                 return str(message.content)
+         return None
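ParsedMessages gives two convenience views over a message history: user_input returns the first user text message and agent_response the last assistant text message. A small sketch, assuming this hunk is wxo_agentic_evaluation/utils/messages_parser.py (the +30 entry in the file list); the messages themselves are invented:

# illustrative only -- not part of the diff
from wxo_agentic_evaluation.type import ContentType, Message
from wxo_agentic_evaluation.utils.messages_parser import ParsedMessages

parsed = ParsedMessages(messages=[
    Message(role="user", content="What is my vacation balance?", type=ContentType.text),
    Message(role="assistant", content="You have 12 days left.", type=ContentType.text),
])
print(parsed.user_input)      # "What is my vacation balance?"
print(parsed.agent_response)  # "You have 12 days left."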