ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,21 @@
1
- from typing import Dict, List, Union, Any, Optional
2
- from pydantic import BaseModel, computed_field, ConfigDict
3
- from enum import StrEnum
1
+ from enum import Enum, StrEnum
2
+ from hashlib import md5
3
+ from typing import Any, Dict, List, Literal, Mapping, Optional, Union
4
+
5
+ from pydantic import (
6
+ BaseModel,
7
+ ConfigDict,
8
+ Field,
9
+ computed_field,
10
+ model_validator,
11
+ )
12
+
13
+
14
class CallTracker(BaseModel):
    """Bundle of three untyped lists plus a free-form metadata mapping.

    Every field is optional and starts out empty.
    """

    # Untyped item lists; all default to empty.
    tool_call: List = []
    tool_response: List = []
    generic: List = []
    # Arbitrary string-keyed extras.
    metadata: Dict[str, Any] = Field(default={})
4
19
 
5
20
 
6
21
  class EventTypes(StrEnum):
@@ -20,6 +35,16 @@ class ContentType(StrEnum):
20
35
  conversational_search = "conversational_search"
21
36
 
22
37
 
38
+ class AttackCategory(StrEnum):
39
+ on_policy = "on_policy"
40
+ off_policy = "off_policy"
41
+
42
+
43
class Roles(Enum):
    """Conversation roles.

    This is a plain ``Enum`` (not ``StrEnum``), so members do not compare
    equal to their string values; use ``.value`` to get the string.
    """

    ASSISTANT = "assistant"
    USER = "user"
46
+
47
+
23
48
  class ConversationalSearchCitations(BaseModel):
24
49
  url: str
25
50
  body: str
@@ -51,9 +76,13 @@ class ConversationalConfidenceThresholdScore(BaseModel):
51
76
  def table(self):
52
77
  return {
53
78
  "response_confidence": str(self.response_confidence),
54
- "response_confidence_threshold": str(self.response_confidence_threshold),
79
+ "response_confidence_threshold": str(
80
+ self.response_confidence_threshold
81
+ ),
55
82
  "retrieval_confidence": str(self.retrieval_confidence),
56
- "retrieval_confidence_threshold": str(self.retrieval_confidence_threshold),
83
+ "retrieval_confidence_threshold": str(
84
+ self.retrieval_confidence_threshold
85
+ ),
57
86
  }
58
87
 
59
88
 
@@ -79,10 +108,35 @@ class ConversationalSearch(BaseModel):
79
108
  response_length_option: str
80
109
 
81
110
 
111
class OTelParserFunction(BaseModel):
    """OpenAI chat completion function structure for OTel parser tool calls"""

    # Frozen: instances are immutable after construction.
    model_config = ConfigDict(frozen=True)

    name: str
    arguments: str  # JSON string of arguments

    def __str__(self):
        """Render as ``<name>:<arguments>``."""
        return ":".join((self.name, self.arguments))
121
+
122
+
123
class OTelParserToolCall(BaseModel):
    """OpenAI chat completion tool call structure for OTel parser"""

    # Frozen: instances are immutable after construction.
    model_config = ConfigDict(frozen=True)

    id: str
    function: OTelParserFunction
    # The only accepted value is "function".
    type: Literal["function"] = "function"

    def __str__(self):
        """Render as ``<id>:<type>:<function>`` using the function's own ``str``."""
        return ":".join((self.id, self.type, str(self.function)))
134
+
135
+
82
136
  class Message(BaseModel):
83
137
  role: str
84
138
  content: Union[str, Dict[str, Any]]
85
- type: ContentType
139
+ type: ContentType = None
86
140
  # event that produced the message
87
141
  event: Optional[str] = None
88
142
  # used to correlate the Message with the retrieval context (ConversationalSearch)
@@ -93,7 +147,32 @@ class Message(BaseModel):
93
147
 
94
148
  class ExtendedMessage(BaseModel):
95
149
  message: Message
96
- reason: dict | None = None
150
+ reason: dict | list | None = None
151
+
152
+
153
class OTelParserMessage(Message):
    """Message class for OTel parser with OpenAI-compatible tool call fields.

    Inherits from Message and adds structured tool call fields for compatibility
    with OpenTelemetry trace parsing (LangGraph, Pydantic AI, etc.)
    """

    tool_calls: Optional[List[OTelParserToolCall]] = None
    tool_call_id: Optional[str] = None

    def hash(self) -> str:
        """Generate hash for message deduplication.

        Returns:
            Hex MD5 digest of ``role``, ``content``, the tool calls and
            ``tool_call_id`` joined with ``:``. Falsy content, missing tool
            calls and a missing ``tool_call_id`` each contribute an empty
            string.
        """
        tool_calls_part = (
            ":".join(str(tc) for tc in self.tool_calls)
            if self.tool_calls
            else ""
        )
        parts = [
            self.role,
            str(self.content) if self.content else "",
            tool_calls_part,
            self.tool_call_id or "",
        ]
        # The digest is only a dedup key, never a security control. Saying so
        # keeps this working on FIPS-restricted builds, where plain md5() is
        # refused; the resulting digest is unchanged.
        return md5(
            ":".join(parts).encode("utf-8"), usedforsecurity=False
        ).hexdigest()
97
176
 
98
177
 
99
178
  class KnowledgeBaseGoalDetail(BaseModel):
@@ -101,19 +180,143 @@ class KnowledgeBaseGoalDetail(BaseModel):
101
180
  metrics: list = []
102
181
 
103
182
 
183
+ class MatchingStrategy(StrEnum):
184
+ """Argument matching strategy:\n
185
+ Strict: exact match\n
186
+ Optional: optional argument, exact match if the field exists\n
187
+ Fuzzy: semantic/similarity match\n"""
188
+
189
+ strict = "strict"
190
+ optional = "optional"
191
+ fuzzy = "fuzzy"
192
+
193
+
104
194
  class GoalDetail(BaseModel):
105
195
  name: str
106
- tool_name: str = None
196
+ tool_name: Optional[str] = None
107
197
  type: ContentType
108
- args: Dict = None
109
- response: str = None
110
- keywords: List = None
198
+ args: Optional[Dict] = None
199
+ # matching strategy defaults to `strict` matching if not specified in the test case
200
+ arg_matching: Optional[dict[str, MatchingStrategy]] = Field(
201
+ default_factory=dict
202
+ )
203
+ response: Optional[str] = None
204
+ keywords: Optional[List] = None
205
+
206
+ @model_validator(mode="after")
207
+ def validate_arg_matching(self):
208
+ for field in self.arg_matching:
209
+ if field not in self.args:
210
+ raise ValueError(
211
+ f"{field} not in goal arguments for goal {self.name}"
212
+ )
213
+ return self
214
+
215
+
216
class GoalDetailOrchestrate(GoalDetail):
    """``GoalDetail`` extended with a knowledge-base section."""

    # Defaults to a KnowledgeBaseGoalDetail built with its own defaults.
    knowledge_base: KnowledgeBaseGoalDetail = KnowledgeBaseGoalDetail()
112
218
 
113
219
 
114
- class EvaluationData(BaseModel):
220
# NOTE(review): both models were originally declared as `AttackData`, so the
# second definition shadowed the first and the inner model could no longer be
# referenced by name. The inner model now has its own name; the module-level
# name `AttackData` still resolves to the outer model, exactly as before.
# Confirm the intended public names against the callers.
class AttackDefinition(BaseModel):
    """Description of a single attack: category, type, name and instructions."""

    attack_category: AttackCategory
    attack_type: str
    attack_name: str
    attack_instructions: str


class AttackData(BaseModel):
    """An attack scenario: the target agent, the attack, and the conversation seed."""

    agent: str
    # Either a list of strings or a single string.
    agents_list_or_path: Union[List[str], str]
    attack_data: AttackDefinition
    story: str
    starting_sentence: str
    goals: dict | None = None
    goal_details: list[GoalDetail] | None = None
235
+
236
+
237
class DatasetModel(BaseModel):
    """A dataset entry: a story, its goals, and optional run settings.

    ``story``, ``goals`` and ``goal_details`` are required; the remaining
    fields default to ``None``.
    """

    starting_sentence: str | None = None
    story: str
    goals: Mapping[str, Any]
    goal_details: List[GoalDetail]
    max_user_turns: int | None = None
    agent: str | None = None
244
+
245
+
246
class LangfuseDatasetModel(DatasetModel):
    """``DatasetModel`` that also exposes an input part and an output part.

    Both parts are computed fields derived from the inherited attributes.
    """

    @computed_field
    @property
    def langfuse_input(self) -> Mapping[str, Any]:
        """Return the starting sentence, story and agent as one mapping."""
        return {
            "starting_sentence": self.starting_sentence,
            "story": self.story,
            "agent": self.agent,
        }

    @computed_field
    @property
    def langfuse_output(self) -> Mapping[str, Any]:
        """Return the goals and goal details as one mapping."""
        return {"goals": self.goals, "goal_details": self.goal_details}
264
+
265
+
266
def _convert_to_langfuse_format(langfuse_row) -> LangfuseDatasetModel:
    """Build a ``LangfuseDatasetModel`` from a row object.

    Args:
        langfuse_row: object exposing ``input`` and ``expected_output``, both
            read with ``.get`` (presumably dicts; verify against the caller).

    Returns:
        A model populated with ``starting_sentence`` and ``story`` from the
        row input, and ``goals`` and ``goal_details`` from the expected output.

    Raises:
        pydantic.ValidationError: if a goal detail or the resulting model is
            invalid.
    """
    # Locals renamed so they no longer shadow the builtin `input`.
    row_input = langfuse_row.input
    expected_output = langfuse_row.expected_output

    # Each goal is validated once, here. The previous extra validation loop
    # raised the same error on the same first invalid goal, so it only
    # doubled the work.
    goal_details = [
        GoalDetail.model_validate(goal)
        for goal in expected_output.get("goal_details")
    ]

    # NOTE(review): `agent` is part of `langfuse_input` but is not read back
    # here; confirm whether that is intentional.
    return LangfuseDatasetModel(
        starting_sentence=row_input.get("starting_sentence"),
        story=row_input.get("story"),
        goals=expected_output.get("goals"),
        goal_details=goal_details,
    )
282
+
283
+
284
class OrchestrateDataset(DatasetModel):
    """``DatasetModel`` variant that narrows two inherited fields."""

    # Goal details must be the Orchestrate flavour.
    goal_details: List[GoalDetailOrchestrate]
    # Required here; optional on the base model.
    agent: str
287
+
288
+
289
class LangfuseCollectionModel(BaseModel):
    """A named collection of ``LangfuseDatasetModel`` entries."""

    collection_name: str
    datasets: List[LangfuseDatasetModel]
    # Defaults to an empty string rather than None.
    collection_description: Optional[str] = ""
    metadata: Optional[Mapping[str, str]] = None
294
+
295
+
296
class ToolDefinition(BaseModel):
    """Description, name and parameter names of a tool."""

    # No default is given: the value may be None but must be passed explicitly.
    tool_description: Optional[str]
    tool_name: str
    tool_params: List[str]
300
+
301
+
302
class ProviderInstancesCacheKey(BaseModel):
    """Three-part key: a provider name plus hashed positional and keyword arguments."""

    provider: str
    hashed_args: str
    hashed_kwargs: str

    def __str__(self) -> str:
        """Render as ``provider|hashed_args|hashed_kwargs``."""
        return "|".join((self.provider, self.hashed_args, self.hashed_kwargs))
309
+
310
+
311
class RuntimeResponse(BaseModel):
    """Messages plus an optional thread id and a context dict."""

    messages: List[Message]
    thread_id: str | None = None
    # Free-form dict; empty by default.
    context: dict = Field(default={})
315
+
316
+
317
class ExperimentResult(BaseModel):
    """Identifiers, metrics and session ids recorded for one experiment run.

    All fields are required.
    """

    experiment_name: str
    run_id: str
    experiment_id: str
    # Untyped list; the element type is not constrained here.
    metrics: list
    session_ids: List[str]
@@ -0,0 +1,100 @@
1
+ from wxo_agentic_evaluation.llm_user_v2 import LLMUser
2
+ from wxo_agentic_evaluation.service_provider.portkey_provider import (
3
+ PortkeyProvider,
4
+ )
5
+ from openai import OpenAI
6
+ import os
7
+ import uuid
8
+
9
+ from wxo_agentic_evaluation.type import Message, ContentType
10
+
11
# Persona and task handed to the simulated user.
user_story = "Your user id is mia_li_3668. You want to fly from New York to Seattle on May 20 (one way). You do not want to fly before 11am est. You want to fly in economy. You prefer direct flights but one stopover also fine. If there are multiple options, you prefer the one with the lowest price. You have 3 baggages. You do not want insurance. You want to use your two certificates to pay. If only one certificate can be used, you prefer using the larger one, and pay the rest with your 7447 card. You are reactive to the agent and will not say anything that is not asked. Your birthday is in your user profile so you do not prefer to provide it."

# LLM client backing the simulated user; the key comes from the environment.
portkey_client = PortkeyProvider(
    provider="@openai",
    model_id="gpt-4o-mini",
    api_key=os.getenv("PORTKEY_API_KEY"),
)

# Style hints passed to the simulated user on every turn.
user_response_style = [
    "reactive to the agent and will not say anything that is not asked",
    "replies only in very short sentences and few words",
]

user_agent = LLMUser(
    llm_client=portkey_client,
    user_prompt_path="../prompt/universal_user_template.jinja2",
)

# The agent under test: a plain OpenAI chat client.
agent = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
30
+
31
+
32
def get_agent_response(messages: list[dict]) -> str:
    """Send ``messages`` to the module-level ``agent`` and return the reply text.

    Args:
        messages: chat messages as dicts, passed straight through as the
            ``messages`` argument of the chat-completions call.

    Returns:
        The content of the first choice in the completion.
    """
    completion = agent.chat.completions.create(
        model="gpt-4o-mini", messages=messages
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
38
+
39
+
40
# First user utterance; only handed to the simulated user on the opening turn.
starting_user_input = Message(
    role="user", content="I want to fly.", type=ContentType.text
)


agent_system_prompt = Message(
    role="system",
    content="You are a helpful assistant. Keep your responses short and concise.",
    type=ContentType.text,
)

session_id = str(uuid.uuid4())
max_turns = 30
conversation_history = []
for i in range(max_turns):

    # The history is empty only on the first iteration: seed it with the
    # system prompt and a greeting from the assistant.
    is_first_turn = len(conversation_history) == 0
    if is_first_turn:
        conversation_history.append(agent_system_prompt)
        conversation_history.append(
            Message(
                role="assistant",
                content="Hi! How can I help you today?",
                type=ContentType.text,
            )
        )

    # Same call on every turn; only the opening turn passes the fixed
    # starting input, later turns pass None.
    user_response = user_agent.generate_user_input(
        user_story=user_story,
        conversation_history=conversation_history,
        user_response_style=user_response_style,
        starting_user_input=starting_user_input if is_first_turn else None,
    )

    conversation_history.append(user_response)
    print(f"User: {user_response.content}")

    # The loop stops once the user's reply contains "END".
    if "END" in user_response.content:
        break

    # Get agent response
    serialized_history = [msg.model_dump() for msg in conversation_history]
    agent_response_content = get_agent_response(serialized_history)
    # agent_response_content = get_langflow_agent_response(conversation_history, session_id)
    # agent_response_content = asyncio.run(get_langgraph_agent_response(conversation_history, session_id))
    print(f"Agent: {agent_response_content}")

    agent_response = Message(
        role="assistant", content=agent_response_content, type=ContentType.text
    )
    conversation_history.append(agent_response)


print(conversation_history)
@@ -1,6 +1,47 @@
1
1
  import json
2
+ import os
3
+ import tempfile
4
+ from pathlib import Path
2
5
 
6
+ from wxo_agentic_evaluation.utils.open_ai_tool_extractor import (
7
+ ToolExtractionOpenAIFormat,
8
+ )
9
+ from wxo_agentic_evaluation.utils.parsers import ReferencelessEvalParser
10
+ from wxo_agentic_evaluation.utils.utils import (
11
+ N_A,
12
+ TestCaseResources,
13
+ add_line_seperator,
14
+ list_run_files,
15
+ load_run_metrics,
16
+ )
3
17
 
4
- def json_dump(output_path, object):
5
- with open(output_path, "w", encoding="utf-8") as f:
6
- json.dump(object, f, indent=4)
18
+
19
def json_dump(output_path, obj):
    """
    Atomically dump JSON to `output_path`.

    - Writes to a temporary file first
    - Then atomically replaces the target file
    - Prevents corrupted/half-written JSON if process is interrupted

    Args:
        output_path: destination path (str or path-like). Missing parent
            directories are created.
        obj: JSON-serialisable object, written with indent=4 and without
            ASCII escaping.

    Raises:
        Whatever the serialisation or file operations raise; the temporary
        file is removed before the error propagates.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # The temp file lives in the target directory so os.replace() stays on
    # one filesystem.
    fd, tmp_path = tempfile.mkstemp(
        dir=output_path.parent,
        prefix=output_path.stem,
        suffix=".tmp",
        text=True,
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(obj, f, indent=4, ensure_ascii=False)
            f.flush()
            # Push the data to disk before the rename makes it visible.
            os.fsync(f.fileno())
        os.replace(tmp_path, output_path)
    except BaseException:
        # BaseException, not Exception: KeyboardInterrupt and SystemExit are
        # exactly the "process is interrupted" cases, and they used to leave
        # the temp file behind.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
@@ -0,0 +1,47 @@
1
+ """
2
+ Evaluation discovery mechanism.
3
+
4
+ This module provides functionality for discovering classes that inherit from Evaluation.
5
+ """
6
+
7
+ import importlib.util
8
+ import inspect
9
+ import os
10
+
11
+
12
def find_evaluation_subclasses(directory: str, base_class_name="Evaluation"):
    """
    Dynamically import Python files under 'directory' and find classes that
    inherit from a class named 'Evaluation'. Returns a list of non-abstract
    class objects.

    Args:
        directory: root directory, walked recursively.
        base_class_name: name an ancestor class must have for a class to be
            reported. Matching is by name only.

    Returns:
        Matching classes, ordered by directory, then file name, then class
        name. Files starting with "__" are ignored, and files that fail to
        import are reported on stdout and skipped.
    """
    subclasses = []

    for root, dirs, files in os.walk(directory):
        # os.walk yields entries in filesystem order. Sorting `dirs` in place
        # fixes the descent order, and sorting `files` fixes the per-directory
        # order, so the result is the same on every machine.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".py") or file.startswith("__"):
                continue

            filepath = os.path.join(root, file)
            module_name = os.path.splitext(file)[0]

            spec = importlib.util.spec_from_file_location(
                module_name, filepath
            )
            if not (spec and spec.loader):
                continue

            module = importlib.util.module_from_spec(spec)
            try:
                spec.loader.exec_module(module)
            except Exception as e:
                # Best effort: one broken file must not abort discovery.
                print(f"Skipping {filepath} due to import error: {e}")
                continue

            # Inspect for subclasses. __mro__[1:] excludes the class itself,
            # so the base class is never reported as its own subclass.
            for _, obj in inspect.getmembers(module, inspect.isclass):
                if inspect.isabstract(obj):
                    continue
                if any(
                    base.__name__ == base_class_name
                    for base in obj.__mro__[1:]
                ):
                    subclasses.append(obj)

    return subclasses
@@ -0,0 +1,39 @@
1
+ import os
2
+ from functools import lru_cache
3
+
4
+ from wxo_agentic_evaluation.arg_configs import AuthConfig
5
+ from wxo_agentic_evaluation.service_provider import USE_GATEWAY_MODEL_PROVIDER
6
+ from wxo_agentic_evaluation.wxo_client import get_wxo_client
7
+
8
# Connection defaults, read from the environment once at import time.
# WXO_URL and WXO_TENANT fall back to local development values; WXO_TOKEN has
# no fallback and is None when unset.
WXO_AUTH_CONFIG_DEFAULTS = AuthConfig(
    url=os.environ.get("WXO_URL", "http://localhost:4321"),
    tenant_name=os.environ.get("WXO_TENANT", "wxo-dev"),
    token=os.environ.get("WXO_TOKEN"),
)
13
+
14
+
15
@lru_cache(maxsize=1)
def _get_cached_wxo_client():
    """Return the client built from ``WXO_AUTH_CONFIG_DEFAULTS``.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    client is created on the first call and reused afterwards.
    """
    # TODO: remove this once the client is implemented as a Singleton.
    defaults = WXO_AUTH_CONFIG_DEFAULTS
    return get_wxo_client(defaults.url, defaults.tenant_name, defaults.token)
23
+
24
+
25
def get_provider_kwargs(**base_kwargs: dict) -> dict:
    """Return provider kwargs, adding ``instance_url`` and ``token`` when needed.

    The given kwargs are returned untouched when the gateway provider is
    disabled, or when both ``instance_url`` and ``token`` are already present.
    Otherwise a new dict is returned in which both keys are taken from the
    cached client, overriding either one if it was passed alone.
    """
    already_configured = (
        "instance_url" in base_kwargs and "token" in base_kwargs
    )
    if not USE_GATEWAY_MODEL_PROVIDER or already_configured:
        return base_kwargs

    client = _get_cached_wxo_client()

    provider_kwargs = dict(base_kwargs)
    provider_kwargs["instance_url"] = client.service_url
    provider_kwargs["token"] = client.api_key
    return provider_kwargs
@@ -0,0 +1,30 @@
1
+ from typing import Optional
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from wxo_agentic_evaluation.type import ContentType, Message
6
+
7
+
8
class ParsedMessages(BaseModel):
    """
    A parsed history of messages.
    """

    messages: list[Message] = Field(description="The list of messages")

    @property
    def user_input(self) -> Optional[str]:
        """Content of the first text message with role "user", or None."""
        return next(
            (
                str(candidate.content)
                for candidate in self.messages
                if candidate.role == "user"
                and candidate.type == ContentType.text
            ),
            None,
        )

    @property
    def agent_response(self) -> Optional[str]:
        """Content of the last text message with role "assistant", or None."""
        return next(
            (
                str(candidate.content)
                for candidate in reversed(self.messages)
                if candidate.role == "assistant"
                and candidate.type == ContentType.text
            ),
            None,
        )