ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,14 @@
1
+ from wxo_agentic_evaluation.type import CallTracker, Message, RuntimeResponse
2
+ from abc import abstractmethod
3
+
4
+
5
class RuntimeAdapter:
    """Common interface for objects that forward a user message to an agent runtime.

    Concrete adapters are expected to override ``run``. Note that this class
    does not use ``abc.ABCMeta``, so the ``abstractmethod`` marker below is
    informational only: instantiating this class (or a subclass that does not
    override ``run``) is not prevented.
    """

    @abstractmethod
    def run(
        self, user_message: Message, context: dict, thread_id=None
    ) -> RuntimeResponse:
        """Send ``user_message`` to the runtime and return its response.

        Args:
            user_message: The message to deliver.
            context: Adapter-specific inputs; the expected keys are defined by
                each concrete adapter.
            thread_id: Optional identifier passed through to the adapter
                (presumably an existing conversation thread; confirm against
                the concrete adapter).

        Returns:
            The default implementation does nothing and returns ``None``;
            overrides are annotated to return a ``RuntimeResponse``.
        """
        return None
@@ -1,33 +1,31 @@
1
- import requests
2
- import os
3
- import yaml
4
1
  import json
5
- import rich
2
+ import os
6
3
  import time
7
- from pydantic import BaseModel
8
- from typing import List, Generator, Dict, Tuple, Mapping, Any
4
+ from typing import Any, Dict, Generator, List, Mapping
5
+
6
+ import requests
7
+ import rich
8
+ import yaml
9
9
 
10
+ from wxo_agentic_evaluation.runtime_adapter.runtime_adapter import (
11
+ RuntimeAdapter,
12
+ )
13
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import (
14
+ WatsonXProvider,
15
+ )
10
16
  from wxo_agentic_evaluation.type import (
11
17
  ContentType,
12
- Message,
18
+ ConversationalConfidenceThresholdScore,
13
19
  ConversationalSearch,
14
20
  ConversationalSearchCitations,
15
21
  ConversationalSearchResultMetadata,
16
- ConversationalConfidenceThresholdScore,
17
22
  ConversationalSearchResults,
18
23
  ConversationSearchMetadata,
24
+ Message,
25
+ RuntimeResponse,
19
26
  )
20
- from wxo_agentic_evaluation.llm_user import LLMUser
21
- from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
22
- from wxo_agentic_evaluation.arg_configs import TestConfig
23
- from wxo_agentic_evaluation.service_instance import tenant_setup
24
27
  from wxo_agentic_evaluation.utils.utils import is_saas_url
25
-
26
-
27
- def is_end(user_input: Message):
28
- if "END" in user_input.content.strip():
29
- return True
30
- return False
28
+ from wxo_agentic_evaluation.wxo_client import WXOClient
31
29
 
32
30
 
33
31
  def is_transfer_response(step_detail: Dict):
@@ -39,40 +37,12 @@ def is_transfer_response(step_detail: Dict):
39
37
  return False
40
38
 
41
39
 
42
- class CallTracker(BaseModel):
43
- tool_call: List = []
44
- tool_response: List = []
45
- generic: List = []
46
-
47
-
48
- class WXOClient:
49
- def __init__(self, service_url, api_key):
50
- self.service_url = service_url
51
- self.api_key = api_key
52
-
53
- def _get_headers(self) -> dict:
54
- headers = {}
55
- if self.api_key:
56
- headers["Authorization"] = f"Bearer {self.api_key}"
57
- return headers
58
-
59
- def post(self, payload: dict, path: str, stream=False):
60
- url = f"{self.service_url}/{path}"
61
- return requests.post(
62
- url=url, headers=self._get_headers(), json=payload, stream=stream
63
- )
64
-
65
- def get(self, path: str, params: dict = None):
66
- url = f"{self.service_url}/{path}"
67
- return requests.get(url, params=params, headers=self._get_headers())
68
-
69
-
70
- class WXOInferenceBackend:
40
+ class WXORuntimeAdapter(RuntimeAdapter):
71
41
  def __init__(self, wxo_client):
72
42
  self.wxo_client = wxo_client
73
43
  self.enable_saas_mode = is_saas_url(wxo_client.service_url)
74
44
 
75
- def run(self, user_input: Message, agent_name, thread_id=None):
45
+ def _runs_endpoint(self, user_input: Message, agent_name, thread_id=None):
76
46
  agent_id = self.get_agent_id(agent_name)
77
47
  payload = {"message": user_input.model_dump(), "agent_id": agent_id}
78
48
  if thread_id:
@@ -108,7 +78,9 @@ class WXOInferenceBackend:
108
78
  else:
109
79
  path = "v1/orchestrate/runs?stream=true"
110
80
 
111
- response: requests.Response = self.wxo_client.post(payload, path, stream=True)
81
+ response: requests.Response = self.wxo_client.post(
82
+ payload, path, stream=True
83
+ )
112
84
  import json
113
85
 
114
86
  for chunk in self._parse_events(response):
@@ -161,7 +133,9 @@ class WXOInferenceBackend:
161
133
  citations = parse_citations()
162
134
  retrieval_context = parsed_search_results()
163
135
  citations_title = conversational_search.get("citations_title", "")
164
- response_length_option = conversational_search.get("response_length_option", "")
136
+ response_length_option = conversational_search.get(
137
+ "response_length_option", ""
138
+ )
165
139
  text = conversational_search.get("text", "")
166
140
 
167
141
  confidence_scores = ConversationalConfidenceThresholdScore(
@@ -184,20 +158,21 @@ class WXOInferenceBackend:
184
158
 
185
159
  return conversational_search
186
160
 
187
- def stream_messages(
161
+ def run(
188
162
  self,
189
163
  user_input: Message,
190
- agent_name: str,
191
- call_tracker: CallTracker,
164
+ context: dict,
192
165
  thread_id=None,
193
- ) -> Tuple[List[Message], str, List[ConversationalSearch]]:
166
+ ) -> RuntimeResponse:
167
+
168
+ agent_name = context["agent_name"]
169
+ call_tracker = context["call_tracker"]
194
170
  recover = False
195
171
  messages = list()
196
172
  conversational_search_data = []
197
173
 
198
174
  start_time = time.time()
199
175
  for chunk in self._stream_events(user_input, agent_name, thread_id):
200
-
201
176
  event = chunk.get("event", "")
202
177
  if _thread_id := chunk.get("data", {}).get("thread_id"):
203
178
  thread_id = _thread_id
@@ -234,7 +209,9 @@ class WXOInferenceBackend:
234
209
  )
235
210
  )
236
211
  end_time = time.time()
237
- call_tracker.tool_call.append(end_time - start_time)
212
+ call_tracker.tool_call.append(
213
+ end_time - start_time
214
+ )
238
215
  start_time = end_time
239
216
  elif step_detail["type"] == "tool_call":
240
217
  # in step details, we could have [tool_response, tool_call]
@@ -252,7 +229,9 @@ class WXOInferenceBackend:
252
229
  )
253
230
  )
254
231
  end_time = time.time()
255
- call_tracker.tool_call.append(end_time - start_time)
232
+ call_tracker.tool_call.append(
233
+ end_time - start_time
234
+ )
256
235
  start_time = end_time
257
236
  elif step_detail["type"] == "tool_response":
258
237
  content = json.dumps(step_detail)
@@ -266,7 +245,9 @@ class WXOInferenceBackend:
266
245
  )
267
246
  )
268
247
  end_time = time.time()
269
- call_tracker.tool_response.append(end_time - start_time)
248
+ call_tracker.tool_response.append(
249
+ end_time - start_time
250
+ )
270
251
  start_time = end_time
271
252
  elif content_field := delta.get("content"):
272
253
  for val in content_field:
@@ -285,7 +266,9 @@ class WXOInferenceBackend:
285
266
  chunk=event,
286
267
  )
287
268
  end_time = time.time()
288
- call_tracker.generic.append(end_time - start_time)
269
+ call_tracker.generic.append(
270
+ end_time - start_time
271
+ )
289
272
  start_time = end_time
290
273
 
291
274
  # NOTE: The event here that is parsed is part of the "message.created" event
@@ -309,10 +292,14 @@ class WXOInferenceBackend:
309
292
  """
310
293
 
311
294
  last_message = json.loads(messages[-1].content)
312
- tool_call_id = last_message.get("tool_call_id", None)
295
+ tool_call_id = last_message.get(
296
+ "tool_call_id", None
297
+ )
313
298
  assert tool_call_id is not None
314
- conversational_search_metadata = ConversationSearchMetadata(
315
- tool_call_id=tool_call_id
299
+ conversational_search_metadata = (
300
+ ConversationSearchMetadata(
301
+ tool_call_id=tool_call_id
302
+ )
316
303
  )
317
304
  conversational_search = (
318
305
  self.parse_conversational_search_response(
@@ -320,7 +307,9 @@ class WXOInferenceBackend:
320
307
  metadata=conversational_search_metadata,
321
308
  )
322
309
  )
323
- conversational_search_data.append(conversational_search)
310
+ conversational_search_data.append(
311
+ conversational_search
312
+ )
324
313
  messages.append(
325
314
  Message(
326
315
  role=role,
@@ -361,7 +350,11 @@ class WXOInferenceBackend:
361
350
  f"Recovered {len(messages)} messages from thread_id {thread_id}",
362
351
  )
363
352
 
364
- return messages, thread_id, conversational_search_data
353
+ return RuntimeResponse(
354
+ messages=messages,
355
+ thread_id=thread_id,
356
+ context={"conversational_search_data": conversational_search_data},
357
+ )
365
358
 
366
359
  def _parse_events(
367
360
  self, stream: Generator[bytes, None, None]
@@ -406,6 +399,13 @@ class WXOInferenceBackend:
406
399
  tool_json = {"type": "tool_call"}
407
400
  tool_json.update(tool)
408
401
  content = json.dumps(tool_json)
402
+ # TO-DO: review do we even need the get messages for retry loop anymore?
403
+ if msg_content := entry.get("content"):
404
+ if (
405
+ msg_content[0].get("response_type")
406
+ == "conversational_search"
407
+ ):
408
+ continue
409
409
  messages.append(
410
410
  Message(
411
411
  role=role,
@@ -419,7 +419,9 @@ class WXOInferenceBackend:
419
419
  content = json.dumps(step_detail)
420
420
  messages.append(
421
421
  Message(
422
- role=role, content=content, type=content_type
422
+ role=role,
423
+ content=content,
424
+ type=content_type,
423
425
  )
424
426
  )
425
427
  else:
@@ -427,7 +429,9 @@ class WXOInferenceBackend:
427
429
  content_type = ContentType.tool_response
428
430
  messages.append(
429
431
  Message(
430
- role=role, content=content, type=content_type
432
+ role=role,
433
+ content=content,
434
+ type=content_type,
431
435
  )
432
436
  )
433
437
  if content_field := entry.get("content"):
@@ -436,12 +440,19 @@ class WXOInferenceBackend:
436
440
  if val["response_type"] == ContentType.text:
437
441
  messages.append(
438
442
  Message(
439
- role=role, content=val["text"], type=ContentType.text
443
+ role=role,
444
+ content=val["text"],
445
+ type=ContentType.text,
440
446
  )
441
447
  )
442
- if val["response_type"] == ContentType.conversational_search:
443
- conversational_search_metadata = ConversationSearchMetadata(
444
- tool_call_id=tool_call_id
448
+ if (
449
+ val["response_type"]
450
+ == ContentType.conversational_search
451
+ ):
452
+ conversational_search_metadata = (
453
+ ConversationSearchMetadata(
454
+ tool_call_id=tool_call_id
455
+ )
445
456
  )
446
457
  messages.append(
447
458
  Message(
@@ -503,94 +514,21 @@ class WXOInferenceBackend:
503
514
  return None
504
515
 
505
516
 
506
- class EvaluationController:
507
- def __init__(
508
- self,
509
- wxo_inference_backend: WXOInferenceBackend,
510
- llm_user: LLMUser,
511
- config: TestConfig,
512
- ):
513
- self.wxo_inference_backend = wxo_inference_backend
514
- self.llm_user = llm_user
515
- self.config = config
516
-
517
- def run(
518
- self, task_n, story, agent_name: str, starting_user_input: str = None
519
- ) -> Tuple[List[Message], List[CallTracker], List[ConversationalSearch]]:
520
- step = 0
521
- thread_id = None
522
- conversation_history: List[Message] = []
523
- conversational_search_history_data = []
524
- call_tracker = CallTracker()
525
- # make this configurable
526
- while step < 20:
527
-
528
- if step == 0 and starting_user_input:
529
- user_input = Message(
530
- role="user", content=starting_user_input, type=ContentType.text
531
- )
532
- else:
533
- if self.config.enable_manual_user_input == True:
534
- content = input(
535
- "[medium_orchid1]Enter your input[/medium_orchid1] ✍️: "
536
- )
537
- user_input = Message(
538
- role="user", content=content, type=ContentType.text
539
- )
540
- else: # llm
541
- user_input = self.llm_user.generate_user_input(
542
- story, conversation_history
543
- )
544
- if self.config.enable_verbose_logging:
545
- rich.print(
546
- f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
547
- user_input.content,
548
- )
549
- if is_end(user_input):
550
- break
551
- conversation_history.append(user_input)
552
- messages, thread_id, conversational_search_data = (
553
- self.wxo_inference_backend.stream_messages(
554
- user_input,
555
- agent_name=agent_name,
556
- thread_id=thread_id,
557
- call_tracker=call_tracker,
558
- )
559
- )
560
- if not messages:
561
- raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
562
- if self.config.enable_verbose_logging:
563
- for message in messages:
564
- rich.print(
565
- f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
566
- message.content,
567
- )
568
- conversation_history.extend(messages)
569
- conversational_search_history_data.extend(conversational_search_data)
570
- step += 1
571
- return conversation_history, call_tracker, conversational_search_history_data
572
-
573
- def get_wxo_client(
574
- service_url: str, tenant_name: str, token: str = None
575
- ) -> WXOClient:
576
- if not token:
577
- token = tenant_setup(service_url, tenant_name)
578
- wxo_client = WXOClient(service_url=service_url, api_key=token)
579
- return wxo_client
580
-
581
-
582
517
  if __name__ == "__main__":
583
518
  wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
584
- auth_config_path = f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
519
+ auth_config_path = (
520
+ f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
521
+ )
585
522
  with open(auth_config_path, "r") as f:
586
523
  auth_config = yaml.safe_load(f)
524
+
587
525
  tenant_name = "local"
588
526
  token = auth_config["auth"][tenant_name]["wxo_mcsp_token"]
589
527
 
590
528
  wxo_client = WXOClient(service_url="http://localhost:4321", api_key=token)
591
- inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
592
- resp = wxo_client.get("orchestrate/agents")
529
+ inference_backend = WXORuntimeAdapter(wxo_client=wxo_client)
530
+ resp = wxo_client.get("v1/orchestrate/agents")
593
531
  resp = resp.json()
594
- print(resp[0])
532
+
595
533
  for agent in resp:
596
534
  print(agent["name"], agent["display_name"])
@@ -0,0 +1,247 @@
1
+ import glob
2
+ import os
3
+ import re
4
+ from collections import defaultdict
5
+ from concurrent.futures import ThreadPoolExecutor
6
+ from enum import unique
7
+ from pathlib import Path
8
+ from typing import Any, Callable, Dict, List, Set, Tuple
9
+
10
+ from rich import print as rich_print
11
+ from rich.progress import Progress
12
+
13
+ from wxo_agentic_evaluation.arg_configs import TestConfig
14
+ from wxo_agentic_evaluation.clients import Clients
15
+ from wxo_agentic_evaluation.service_provider import LOGGING_ENABLED
16
+
17
+
18
def discover_tests(
    test_paths: List[str], recursive_search: bool = False
) -> List[str]:
    """
    Discover test cases from the given test paths.

    Each entry of ``test_paths`` is either a directory or a glob pattern.
    Directories are searched for ``*.json`` files, recursively when
    ``recursive_search`` is enabled and only at the top level otherwise.
    Any other entry is expanded as a glob pattern and used directly.

    Files that do not end in ``.json`` or that end in ``agent.json`` are
    dropped. When several files share the same stem, only the first one
    found is kept and a message is printed for every later one.

    Args:
        test_paths: List of directories or glob patterns to search
        recursive_search: Whether to search recursively in subdirectories

    Returns:
        List of test case file paths, one per unique test case name (stem)
    """
    test_cases: List[str] = []
    for test_path in test_paths:
        if os.path.isdir(test_path):
            # Escape the directory so glob metacharacters in its name
            # ('[', '*', '?') are matched literally instead of being
            # interpreted as a pattern.
            search_root = glob.escape(test_path)
            if recursive_search:
                # Use ** pattern for recursive search
                pattern = os.path.join(search_root, "**", "*.json")
                found_files = sorted(glob.glob(pattern, recursive=True))
                rich_print(
                    f"Found {len(found_files)} files in '{test_path}' (recursive search)"
                )
            else:
                pattern = os.path.join(search_root, "*.json")
                found_files = sorted(glob.glob(pattern))
                rich_print(
                    f"Found {len(found_files)} files in '{test_path}' (non-recursive)"
                )
            test_cases.extend(found_files)
            continue

        # Not a directory: treat the entry as a file path / glob pattern.
        found_files = sorted(glob.glob(test_path))
        if not found_files:
            rich_print(
                f"[bold yellow]Warning: Path '{test_path}' does not exist. Skipping.[/bold yellow]"
            )
            continue
        test_cases.extend(found_files)

    # Filter out non-JSON files and agent.json files
    filtered_cases = [
        tc
        for tc in test_cases
        if tc.endswith(".json") and not tc.endswith("agent.json")
    ]

    # Map test case name (file stem) to the first file path seen for it.
    unique_files_map: dict = {}
    for f in filtered_cases:
        name = Path(f).stem
        if name in unique_files_map:
            rich_print(
                f"[bold red]Duplicate test case name detected:[/bold red] "
                f"'{name}' (skipping file '{f}')"
            )
            continue
        unique_files_map[name] = f

    unique_files = list(unique_files_map.values())
    rich_print(
        f"[bold green]Discovered {len(unique_files)} test cases in total[/bold green]"
    )
    return unique_files
91
+
92
+
93
+ def _removesuffix(s: str, suf: str) -> str:
94
+ """Remove suffix from string (for Python < 3.9 compatibility)"""
95
+ return s[: -len(suf)] if s.endswith(suf) else s
96
+
97
+
98
def get_available_runs(output_dir: str) -> Dict[str, Set[int]]:
    """
    Get available runs from the output directory.

    Scans ``<output_dir>/messages/*.messages.json``. A file named
    ``<stem>.messages.json`` counts as run 1 of ``<stem>``; a file named
    ``<stem>.runN.messages.json`` counts as run ``N``.

    Args:
        output_dir: Output directory path

    Returns:
        Dictionary mapping test case stems to sets of run numbers
    """
    # match either "<stem>" (single run) OR "<stem>.runN" (multi-run)
    name_pattern = re.compile(r"^(?P<stem>.+?)(?:\.run(?P<run>\d+))?$")

    # Escape the directory part so glob metacharacters in output_dir
    # (e.g. "results[v2]") are matched literally.
    messages_glob = os.path.join(
        glob.escape(output_dir), "messages", "*.messages.json"
    )

    available_runs = defaultdict(set)
    for f in glob.glob(messages_glob):
        # strip the fixed tail
        name = _removesuffix(os.path.basename(f), ".messages.json")
        m = name_pattern.match(name)
        if not m:
            continue
        run_num = int(m.group("run") or 1)  # no suffix => run 1
        available_runs[m.group("stem")].add(run_num)

    return available_runs
121
+
122
+
123
def enumerate_jobs(
    test_cases: List[str],
    n_runs: int,
    skip_available_results: bool,
    output_dir: str,
) -> List[Tuple[int, str, int]]:
    """
    Build the list of (test case, run) jobs that still have to be executed.

    Every test case is scheduled ``n_runs`` times. When
    ``skip_available_results`` is set, a (test case, run) pair is left out if
    the output directory already holds results for that run number.

    Args:
        test_cases: List of test case file paths
        n_runs: Number of runs per test case
        skip_available_results: Whether to skip runs whose results exist
        output_dir: Output directory path

    Returns:
        List of tuples (task_n, test_case, run_idx), where task_n counts the
        scheduled jobs from 0 and run_idx is the zero-based run index
    """
    existing = get_available_runs(output_dir) if skip_available_results else {}

    scheduled: List[Tuple[int, str, int]] = []
    for case_path in test_cases:
        stem = Path(case_path).stem
        finished = existing.get(stem, set())

        for run_idx in range(n_runs):
            # Result files are numbered from 1, run indices from 0.
            run_number = run_idx + 1

            if skip_available_results and run_number in finished:
                print(
                    f"Skipping {stem} run {run_number} as results already exist."
                )
                continue

            # The task number is simply the position in the scheduled list.
            scheduled.append((len(scheduled), case_path, run_idx))

    return scheduled
167
+
168
+
169
def run_jobs(
    jobs: List[Tuple[int, str, int]],
    config: TestConfig,
    clients: Clients,
    process_func: Callable,
    num_workers: int,
) -> List[Any]:
    """
    Run jobs using ThreadPoolExecutor.

    Each job is submitted as
    ``process_func(task_n, test_case, config, clients.inference_backend,
    clients.resource_map, clients.llm_user, clients.llmaaj_provider, run_idx)``
    and is expected to return an iterable of results. A job that raises is
    reported (message plus traceback) and contributes no results; the
    remaining jobs are still collected.

    Args:
        jobs: List of (task_n, test_case, run_idx) jobs to run
        config: Test configuration; ``enable_manual_user_input`` is switched
            off in place when execution is parallel
        clients: Clients bundle; its ``inference_backend``, ``resource_map``,
            ``llm_user`` and ``llmaaj_provider`` attributes are forwarded to
            ``process_func``
        process_func: Function to process each job
        num_workers: Number of worker threads

    Returns:
        List of results from all jobs, in submission order
    """
    import traceback

    # The pool is sized by `num_workers`, so that value (not only the one
    # stored in the config) decides whether jobs can run concurrently.
    runs_in_parallel = num_workers > 1 or config.num_workers > 1
    if runs_in_parallel and config.enable_manual_user_input:
        rich_print(
            "[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]"
        )
        # disable manual user input for parallel execution
        config.enable_manual_user_input = False

    results: List[Any] = []

    def _collect(test_case: str, future) -> None:
        """Append one job's results, reporting (not raising) its failure."""
        try:
            results.extend(future.result())
        except Exception as e:
            rich_print(f"test case {test_case} fails with {e}")
            traceback.print_exc()

    # The context manager guarantees the pool is shut down on every exit path.
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [
            (
                test_case,
                executor.submit(
                    process_func,
                    task_n,
                    test_case,
                    config,
                    clients.inference_backend,
                    clients.resource_map,
                    clients.llm_user,
                    clients.llmaaj_provider,
                    run_idx,
                ),
            )
            for task_n, test_case, run_idx in jobs
        ]

        if not futures:
            return results

        if LOGGING_ENABLED:
            # No progress bar when logging - just process tasks
            for test_case, future in futures:
                _collect(test_case, future)
        else:
            with Progress() as progress:
                task1 = progress.add_task(
                    f"[purple]Evaluating {len(futures)} tasks...",
                    total=len(futures),
                )
                for test_case, future in futures:
                    try:
                        _collect(test_case, future)
                    finally:
                        progress.update(task1, advance=1)

    return results