ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA +35 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/RECORD +65 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +9 -3
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation.py +42 -0
  14. wxo_agentic_evaluation/evaluation_package.py +117 -70
  15. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  16. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  17. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  18. wxo_agentic_evaluation/external_agent/types.py +12 -5
  19. wxo_agentic_evaluation/inference_backend.py +183 -79
  20. wxo_agentic_evaluation/llm_matching.py +4 -3
  21. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  22. wxo_agentic_evaluation/llm_user.py +7 -3
  23. wxo_agentic_evaluation/main.py +175 -67
  24. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  25. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  26. wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
  27. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +176 -0
  28. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
  29. wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
  30. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  31. wxo_agentic_evaluation/quick_eval.py +49 -23
  32. wxo_agentic_evaluation/record_chat.py +70 -33
  33. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  34. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  35. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  38. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  39. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  40. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  41. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  42. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  43. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  44. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  45. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  46. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  47. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  48. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  49. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  50. wxo_agentic_evaluation/resource_map.py +2 -1
  51. wxo_agentic_evaluation/service_instance.py +103 -21
  52. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  53. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +216 -34
  54. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  55. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  56. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  57. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  58. wxo_agentic_evaluation/tool_planner.py +128 -44
  59. wxo_agentic_evaluation/type.py +12 -9
  60. wxo_agentic_evaluation/utils/__init__.py +1 -0
  61. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  62. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  63. wxo_agentic_evaluation/utils/utils.py +83 -52
  64. ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
  65. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/WHEEL +0 -0
  66. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
1
+ from typing import Any, List, Literal, Mapping, Union
2
+
1
3
  from pydantic import BaseModel
2
- from typing import List, Union, Literal, Mapping, Any
3
4
 
4
5
 
5
6
  class ThinkingStepDetails(BaseModel):
@@ -25,7 +26,9 @@ class ToolResponseStepDetails(BaseModel):
25
26
  tool_call_id: str
26
27
 
27
28
 
28
- StepDetails = Union[ThinkingStepDetails, ToolCallsStepDetails, ToolResponseStepDetails]
29
+ StepDetails = Union[
30
+ ThinkingStepDetails, ToolCallsStepDetails, ToolResponseStepDetails
31
+ ]
29
32
 
30
33
 
31
34
  class DeltaMessageChoice(BaseModel):
@@ -59,8 +62,12 @@ class ThreadRunStepDeltaData(BaseEventData):
59
62
 
60
63
 
61
64
  class UniversalData(BaseEventData):
62
- object: Union[Literal["thread.message.delta"], Literal["thread.run.step.delta"],
63
- Literal["thread.run.step.created"], Literal["thread.run.step.completed"]]
65
+ object: Union[
66
+ Literal["thread.message.delta"],
67
+ Literal["thread.run.step.delta"],
68
+ Literal["thread.run.step.created"],
69
+ Literal["thread.run.step.completed"],
70
+ ]
64
71
  choices: List[ThreadMessageDeltaChoice]
65
72
  choices: List[Union[ThreadMessageDeltaChoice, dict]]
66
73
 
@@ -68,4 +75,4 @@ class UniversalData(BaseEventData):
68
75
  class SchemaValidationResults(BaseModel):
69
76
  success: bool
70
77
  logged_events: List[str]
71
- messages: List[Mapping[Any, Any]]
78
+ messages: List[Mapping[Any, Any]]
@@ -1,61 +1,68 @@
1
- import requests
2
- import os
3
- import yaml
4
1
  import json
5
- import rich
2
+ import os
6
3
  import time
7
- from pydantic import BaseModel
8
- from typing import List, Generator, Dict, Tuple, Mapping, Any
9
- from enum import Enum
10
4
  from collections import deque
5
+ from enum import Enum
6
+ from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple
7
+
8
+ import requests
9
+ import rich
10
+ import urllib3
11
+ import yaml
12
+ from pydantic import BaseModel
13
+ from urllib3.exceptions import InsecureRequestWarning
11
14
 
15
+ from wxo_agentic_evaluation.arg_configs import TestConfig
16
+ from wxo_agentic_evaluation.llm_user import LLMUser
17
+ from wxo_agentic_evaluation.service_instance import get_env_settings, tenant_setup
18
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import (
19
+ WatsonXProvider,
20
+ )
12
21
  from wxo_agentic_evaluation.type import (
13
22
  ContentType,
14
- Message,
23
+ ConversationalConfidenceThresholdScore,
15
24
  ConversationalSearch,
16
25
  ConversationalSearchCitations,
17
26
  ConversationalSearchResultMetadata,
18
- ConversationalConfidenceThresholdScore,
19
27
  ConversationalSearchResults,
20
28
  ConversationSearchMetadata,
29
+ Message,
21
30
  )
22
- from wxo_agentic_evaluation.llm_user import LLMUser
23
- from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
24
- from wxo_agentic_evaluation.arg_configs import TestConfig
25
- from wxo_agentic_evaluation.service_instance import tenant_setup
26
31
  from wxo_agentic_evaluation.utils.utils import (
32
+ Tokenizer,
27
33
  is_saas_url,
28
34
  safe_divide,
29
- Tokenizer
30
35
  )
31
36
 
32
37
  tokenizer = Tokenizer()
33
38
 
39
+
34
40
  class Roles(Enum):
35
41
  ASSISTANT = "assistant"
36
42
  USER = "user"
37
43
 
38
- def calculate_word_overlap_similarity_score(first_message_text: str, second_message_text: str) -> float:
44
+
45
+ def calculate_word_overlap_similarity_score(
46
+ first_message_text: str, second_message_text: str
47
+ ) -> float:
39
48
  """Calculate the word overlap similarity score between the .content field of two Message objects.
40
49
  Args:
41
50
  first_message_text (str): The .content field of the first message.
42
51
  second_message_text (str): The .content field of the second message.
43
52
  """
44
-
53
+
45
54
  words_in_first_message = tokenizer(first_message_text)
46
55
  words_in_second_message = tokenizer(second_message_text)
47
56
 
48
57
  # Calculate the number of common words
49
58
  common_words = set(words_in_first_message) & set(words_in_second_message)
50
59
  unique_words = set(words_in_first_message + words_in_second_message)
51
-
60
+
52
61
  unique_words_count = len(unique_words)
53
62
  common_words_count = len(common_words)
54
63
 
55
- return safe_divide(
56
- common_words_count,
57
- unique_words_count
58
- )
64
+ return safe_divide(common_words_count, unique_words_count)
65
+
59
66
 
60
67
  def is_transfer_response(step_detail: Dict):
61
68
  # this is not very reliable
@@ -73,10 +80,23 @@ class CallTracker(BaseModel):
73
80
 
74
81
 
75
82
  class WXOClient:
76
- def __init__(self, service_url, api_key):
83
+ def __init__(self, service_url, api_key, env: Optional[Dict[str, Any]] = None):
77
84
  self.service_url = service_url
78
85
  self.api_key = api_key
79
86
 
87
+ ov = os.getenv("WO_SSL_VERIFY")
88
+ if ov and ov.strip().lower() in ("true", "false"):
89
+ self._verify_ssl = ov.strip().lower() == "true"
90
+ else:
91
+ v, bs = (env.get("verify") if env else None), (env.get("bypass_ssl") if env else None)
92
+ self._verify_ssl = False if (
93
+ (bs is True) or (isinstance(bs, str) and bs.strip().lower() == "true") or
94
+ (v is None) or (isinstance(v, str) and v.strip().lower() in {"none", "null"})
95
+ ) else (v if isinstance(v, bool) else True)
96
+
97
+ if not self._verify_ssl:
98
+ urllib3.disable_warnings(InsecureRequestWarning)
99
+
80
100
  def _get_headers(self) -> dict:
81
101
  headers = {}
82
102
  if self.api_key:
@@ -86,12 +106,21 @@ class WXOClient:
86
106
  def post(self, payload: dict, path: str, stream=False):
87
107
  url = f"{self.service_url}/{path}"
88
108
  return requests.post(
89
- url=url, headers=self._get_headers(), json=payload, stream=stream
109
+ url=url,
110
+ headers=self._get_headers(),
111
+ json=payload,
112
+ stream=stream,
113
+ verify=self._verify_ssl,
90
114
  )
91
115
 
92
116
  def get(self, path: str, params: dict = None):
93
117
  url = f"{self.service_url}/{path}"
94
- return requests.get(url, params=params, headers=self._get_headers())
118
+ return requests.get(
119
+ url,
120
+ params=params,
121
+ headers=self._get_headers(),
122
+ verify=self._verify_ssl,
123
+ )
95
124
 
96
125
 
97
126
  class WXOInferenceBackend:
@@ -135,7 +164,9 @@ class WXOInferenceBackend:
135
164
  else:
136
165
  path = "v1/orchestrate/runs?stream=true"
137
166
 
138
- response: requests.Response = self.wxo_client.post(payload, path, stream=True)
167
+ response: requests.Response = self.wxo_client.post(
168
+ payload, path, stream=True
169
+ )
139
170
  import json
140
171
 
141
172
  for chunk in self._parse_events(response):
@@ -188,7 +219,9 @@ class WXOInferenceBackend:
188
219
  citations = parse_citations()
189
220
  retrieval_context = parsed_search_results()
190
221
  citations_title = conversational_search.get("citations_title", "")
191
- response_length_option = conversational_search.get("response_length_option", "")
222
+ response_length_option = conversational_search.get(
223
+ "response_length_option", ""
224
+ )
192
225
  text = conversational_search.get("text", "")
193
226
 
194
227
  confidence_scores = ConversationalConfidenceThresholdScore(
@@ -261,7 +294,9 @@ class WXOInferenceBackend:
261
294
  )
262
295
  )
263
296
  end_time = time.time()
264
- call_tracker.tool_call.append(end_time - start_time)
297
+ call_tracker.tool_call.append(
298
+ end_time - start_time
299
+ )
265
300
  start_time = end_time
266
301
  elif step_detail["type"] == "tool_call":
267
302
  # in step details, we could have [tool_response, tool_call]
@@ -279,7 +314,9 @@ class WXOInferenceBackend:
279
314
  )
280
315
  )
281
316
  end_time = time.time()
282
- call_tracker.tool_call.append(end_time - start_time)
317
+ call_tracker.tool_call.append(
318
+ end_time - start_time
319
+ )
283
320
  start_time = end_time
284
321
  elif step_detail["type"] == "tool_response":
285
322
  content = json.dumps(step_detail)
@@ -293,7 +330,9 @@ class WXOInferenceBackend:
293
330
  )
294
331
  )
295
332
  end_time = time.time()
296
- call_tracker.tool_response.append(end_time - start_time)
333
+ call_tracker.tool_response.append(
334
+ end_time - start_time
335
+ )
297
336
  start_time = end_time
298
337
  elif content_field := delta.get("content"):
299
338
  for val in content_field:
@@ -312,7 +351,9 @@ class WXOInferenceBackend:
312
351
  chunk=event,
313
352
  )
314
353
  end_time = time.time()
315
- call_tracker.generic.append(end_time - start_time)
354
+ call_tracker.generic.append(
355
+ end_time - start_time
356
+ )
316
357
  start_time = end_time
317
358
 
318
359
  # NOTE: The event here that is parsed is part of the "message.created" event
@@ -336,10 +377,14 @@ class WXOInferenceBackend:
336
377
  """
337
378
 
338
379
  last_message = json.loads(messages[-1].content)
339
- tool_call_id = last_message.get("tool_call_id", None)
380
+ tool_call_id = last_message.get(
381
+ "tool_call_id", None
382
+ )
340
383
  assert tool_call_id is not None
341
- conversational_search_metadata = ConversationSearchMetadata(
342
- tool_call_id=tool_call_id
384
+ conversational_search_metadata = (
385
+ ConversationSearchMetadata(
386
+ tool_call_id=tool_call_id
387
+ )
343
388
  )
344
389
  conversational_search = (
345
390
  self.parse_conversational_search_response(
@@ -347,7 +392,9 @@ class WXOInferenceBackend:
347
392
  metadata=conversational_search_metadata,
348
393
  )
349
394
  )
350
- conversational_search_data.append(conversational_search)
395
+ conversational_search_data.append(
396
+ conversational_search
397
+ )
351
398
  messages.append(
352
399
  Message(
353
400
  role=role,
@@ -436,7 +483,10 @@ class WXOInferenceBackend:
436
483
  content = json.dumps(tool_json)
437
484
  # TO-DO: review do we even need the get messages for retry loop anymore?
438
485
  if msg_content := entry.get("content"):
439
- if msg_content[0].get("response_type") == "conversational_search":
486
+ if (
487
+ msg_content[0].get("response_type")
488
+ == "conversational_search"
489
+ ):
440
490
  continue
441
491
  messages.append(
442
492
  Message(
@@ -451,7 +501,9 @@ class WXOInferenceBackend:
451
501
  content = json.dumps(step_detail)
452
502
  messages.append(
453
503
  Message(
454
- role=role, content=content, type=content_type
504
+ role=role,
505
+ content=content,
506
+ type=content_type,
455
507
  )
456
508
  )
457
509
  else:
@@ -459,7 +511,9 @@ class WXOInferenceBackend:
459
511
  content_type = ContentType.tool_response
460
512
  messages.append(
461
513
  Message(
462
- role=role, content=content, type=content_type
514
+ role=role,
515
+ content=content,
516
+ type=content_type,
463
517
  )
464
518
  )
465
519
  if content_field := entry.get("content"):
@@ -468,12 +522,19 @@ class WXOInferenceBackend:
468
522
  if val["response_type"] == ContentType.text:
469
523
  messages.append(
470
524
  Message(
471
- role=role, content=val["text"], type=ContentType.text
525
+ role=role,
526
+ content=val["text"],
527
+ type=ContentType.text,
472
528
  )
473
529
  )
474
- if val["response_type"] == ContentType.conversational_search:
475
- conversational_search_metadata = ConversationSearchMetadata(
476
- tool_call_id=tool_call_id
530
+ if (
531
+ val["response_type"]
532
+ == ContentType.conversational_search
533
+ ):
534
+ conversational_search_metadata = (
535
+ ConversationSearchMetadata(
536
+ tool_call_id=tool_call_id
537
+ )
477
538
  )
478
539
  messages.append(
479
540
  Message(
@@ -538,8 +599,12 @@ class WXOInferenceBackend:
538
599
  class EvaluationController:
539
600
 
540
601
  MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
541
- MESSAGE_SIMILARITY_THRESHOLD = float(os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)) # if any two consecutive messages are >98% similar, the inference loop will be terminated
542
- MAX_REPEATING_MESSAGES = int(os.getenv("MAX_REPEATING_MESSAGES", 3)) # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
602
+ MESSAGE_SIMILARITY_THRESHOLD = float(
603
+ os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)
604
+ ) # if any two consecutive messages are >98% similar, the inference loop will be terminated
605
+ MAX_REPEATING_MESSAGES = int(
606
+ os.getenv("MAX_REPEATING_MESSAGES", 3)
607
+ ) # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
543
608
 
544
609
  def __init__(
545
610
  self,
@@ -554,11 +619,20 @@ class EvaluationController:
554
619
 
555
620
  if self.repeating_output_detection:
556
621
  # Use deque for efficient O(1) operations
557
- self.recent_user_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
558
- self.recent_assistant_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
622
+ self.recent_user_messages = deque(
623
+ maxlen=self.MAX_REPEATING_MESSAGES
624
+ )
625
+ self.recent_assistant_messages = deque(
626
+ maxlen=self.MAX_REPEATING_MESSAGES
627
+ )
559
628
 
560
629
  def run(
561
- self, task_n, story, agent_name: str, starting_user_input: str = None, attack_instructions: str = None
630
+ self,
631
+ task_n,
632
+ story,
633
+ agent_name: str,
634
+ starting_user_input: str = None,
635
+ attack_instructions: str = None,
562
636
  ) -> Tuple[List[Message], List[CallTracker], List[ConversationalSearch]]:
563
637
  step = 0
564
638
  thread_id = None
@@ -570,7 +644,9 @@ class EvaluationController:
570
644
  while step < self.MAX_CONVERSATION_STEPS:
571
645
  if step == 0 and starting_user_input:
572
646
  user_input = Message(
573
- role="user", content=starting_user_input, type=ContentType.text
647
+ role="user",
648
+ content=starting_user_input,
649
+ type=ContentType.text,
574
650
  )
575
651
  else:
576
652
  if self.config.enable_manual_user_input == True:
@@ -582,7 +658,9 @@ class EvaluationController:
582
658
  )
583
659
  else: # llm
584
660
  user_input = self.llm_user.generate_user_input(
585
- story, conversation_history, attack_instructions=attack_instructions
661
+ story,
662
+ conversation_history,
663
+ attack_instructions=attack_instructions,
586
664
  )
587
665
  if self.config.enable_verbose_logging:
588
666
  rich.print(
@@ -592,26 +670,33 @@ class EvaluationController:
592
670
 
593
671
  if self._is_end(user_input):
594
672
  break
595
-
673
+
596
674
  if self.repeating_output_detection:
597
675
  self.recent_user_messages.append(user_input.content)
598
676
 
599
677
  conversation_history.append(user_input)
600
-
601
- messages, thread_id, conversational_search_data = (
602
- self.wxo_inference_backend.stream_messages(
603
- user_input,
604
- agent_name=agent_name,
605
- thread_id=thread_id,
606
- call_tracker=call_tracker,
607
- )
678
+
679
+ (
680
+ messages,
681
+ thread_id,
682
+ conversational_search_data,
683
+ ) = self.wxo_inference_backend.stream_messages(
684
+ user_input,
685
+ agent_name=agent_name,
686
+ thread_id=thread_id,
687
+ call_tracker=call_tracker,
608
688
  )
609
689
  if not messages:
610
- raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
611
-
690
+ raise RuntimeError(
691
+ f"[Task-{task_n}] No messages is produced. Exiting task."
692
+ )
693
+
612
694
  for message in messages:
613
695
  if self.repeating_output_detection:
614
- if message.role == Roles.ASSISTANT and message.type == ContentType.text:
696
+ if (
697
+ message.role == Roles.ASSISTANT
698
+ and message.type == ContentType.text
699
+ ):
615
700
  self.recent_assistant_messages.append(message.content)
616
701
 
617
702
  if self.config.enable_verbose_logging:
@@ -621,11 +706,17 @@ class EvaluationController:
621
706
  )
622
707
 
623
708
  conversation_history.extend(messages)
624
- conversational_search_history_data.extend(conversational_search_data)
625
-
709
+ conversational_search_history_data.extend(
710
+ conversational_search_data
711
+ )
712
+
626
713
  step += 1
627
- return conversation_history, call_tracker, conversational_search_history_data
628
-
714
+ return (
715
+ conversation_history,
716
+ call_tracker,
717
+ conversational_search_history_data,
718
+ )
719
+
629
720
  def _is_looping(self, messages: deque) -> bool:
630
721
  """Checks whether the user or assistant is stuck in a loop.
631
722
  Args:
@@ -634,7 +725,7 @@ class EvaluationController:
634
725
  bool: True if stuck in a loop, False otherwise.
635
726
  """
636
727
  sim_count = 0
637
-
728
+
638
729
  if len(messages) >= self.MAX_REPEATING_MESSAGES:
639
730
  oldest_cached_message = messages[0]
640
731
  for i, old_message in enumerate(messages):
@@ -642,11 +733,16 @@ class EvaluationController:
642
733
  continue
643
734
  if oldest_cached_message == old_message:
644
735
  sim_count += 1
645
- elif calculate_word_overlap_similarity_score(oldest_cached_message, old_message) > self.MESSAGE_SIMILARITY_THRESHOLD:
736
+ elif (
737
+ calculate_word_overlap_similarity_score(
738
+ oldest_cached_message, old_message
739
+ )
740
+ > self.MESSAGE_SIMILARITY_THRESHOLD
741
+ ):
646
742
  sim_count += 1
647
-
743
+
648
744
  return sim_count >= self.MAX_REPEATING_MESSAGES - 1
649
-
745
+
650
746
  def _is_end(self, current_user_input: Message) -> bool:
651
747
  """
652
748
  Check if the user input indicates the end of the conversation.
@@ -664,27 +760,35 @@ class EvaluationController:
664
760
  # Check if the user message contains 'END'
665
761
  if "END" in current_user_message_content:
666
762
  return True
667
-
763
+
668
764
  if self.repeating_output_detection:
669
765
  # Check for repeating user or assistant messages
670
- if (self._is_looping(self.recent_user_messages) or
671
- self._is_looping(self.recent_assistant_messages)):
766
+ if self._is_looping(self.recent_user_messages) or self._is_looping(
767
+ self.recent_assistant_messages
768
+ ):
672
769
  return True
673
-
674
- return False # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
770
+
771
+ return False # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
772
+
675
773
 
676
774
  def get_wxo_client(
677
- service_url: str, tenant_name: str, token: str = None
775
+ service_url: Optional[str], tenant_name: str, token: Optional[str] = None
678
776
  ) -> WXOClient:
679
- if not token:
680
- token = tenant_setup(service_url, tenant_name)
681
- wxo_client = WXOClient(service_url=service_url, api_key=token)
682
- return wxo_client
683
777
 
778
+ token, resolved_url, env = tenant_setup(service_url, tenant_name)
779
+ service_url = service_url or resolved_url
780
+
781
+ if not (service_url and str(service_url).strip()):
782
+ raise ValueError(f"service_url not provided and not found in config for tenant '{tenant_name}'")
783
+
784
+ wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
785
+ return wxo_client
684
786
 
685
787
  if __name__ == "__main__":
686
788
  wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
687
- auth_config_path = f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
789
+ auth_config_path = (
790
+ f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
791
+ )
688
792
  with open(auth_config_path, "r") as f:
689
793
  auth_config = yaml.safe_load(f)
690
794
  tenant_name = "local"
@@ -1,9 +1,10 @@
1
- from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
1
+ from typing import List
2
+
2
3
  from wxo_agentic_evaluation.prompt.template_render import (
3
4
  KeywordMatchingTemplateRenderer,
4
5
  SemanticMatchingTemplateRenderer,
5
6
  )
6
- from typing import List
7
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
7
8
 
8
9
 
9
10
  class LLMMatcher:
@@ -26,7 +27,7 @@ class LLMMatcher:
26
27
  prompt = self.keyword_template.render(
27
28
  keywords_text=keywords_text, response_text=response_text
28
29
  )
29
- output:str = self.llm_client.query(prompt)
30
+ output: str = self.llm_client.query(prompt)
30
31
  result = output.strip().lower()
31
32
  return result.startswith("true")
32
33
 
@@ -1,12 +1,15 @@
1
- from typing import List
2
1
  import json
2
+ from typing import List
3
3
 
4
- from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
4
+ from wxo_agentic_evaluation.metrics.llm_as_judge import (
5
+ AnswerRelevancy,
6
+ Faithfulness,
7
+ )
5
8
  from wxo_agentic_evaluation.prompt.template_render import (
6
- FaithfulnessTemplateRenderer,
7
9
  AnswerRelevancyTemplateRenderer,
10
+ FaithfulnessTemplateRenderer,
8
11
  )
9
- from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness, AnswerRelevancy
12
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
10
13
 
11
14
 
12
15
  class LLMJudge:
@@ -1,7 +1,8 @@
1
1
  from typing import List, TypeVar
2
- from wxo_agentic_evaluation.type import Message, ContentType
3
- from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
2
+
4
3
  from wxo_agentic_evaluation.prompt.template_render import JinjaTemplateRenderer
4
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
5
+ from wxo_agentic_evaluation.type import ContentType, Message
5
6
 
6
7
  T = TypeVar("T", bound=JinjaTemplateRenderer)
7
8
 
@@ -17,7 +18,10 @@ class LLMUser:
17
18
  )
18
19
 
19
20
  def generate_user_input(
20
- self, user_story, conversation_history: List[Message], attack_instructions: str = None
21
+ self,
22
+ user_story,
23
+ conversation_history: List[Message],
24
+ attack_instructions: str = None,
21
25
  ) -> Message | None:
22
26
  # the tool response is already summarized, we don't need that to take over the chat history context window
23
27
  prompt_input = self.prompt_template.render(