ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (61)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +8 -2
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation_package.py +114 -70
  14. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  15. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  16. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  17. wxo_agentic_evaluation/external_agent/types.py +12 -5
  18. wxo_agentic_evaluation/inference_backend.py +158 -73
  19. wxo_agentic_evaluation/llm_matching.py +4 -3
  20. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  21. wxo_agentic_evaluation/llm_user.py +7 -3
  22. wxo_agentic_evaluation/main.py +175 -67
  23. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  24. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  25. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  26. wxo_agentic_evaluation/quick_eval.py +49 -23
  27. wxo_agentic_evaluation/record_chat.py +70 -33
  28. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  29. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  30. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  38. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  39. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  40. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  41. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  42. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  43. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  44. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  45. wxo_agentic_evaluation/resource_map.py +2 -1
  46. wxo_agentic_evaluation/service_instance.py +24 -11
  47. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  48. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
  49. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  50. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  51. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  52. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  53. wxo_agentic_evaluation/tool_planner.py +128 -44
  54. wxo_agentic_evaluation/type.py +12 -9
  55. wxo_agentic_evaluation/utils/__init__.py +1 -0
  56. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  57. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  58. wxo_agentic_evaluation/utils/utils.py +83 -52
  59. ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA +0 -385
  60. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
  61. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
1
+ from typing import Any, List, Literal, Mapping, Union
2
+
1
3
  from pydantic import BaseModel
2
- from typing import List, Union, Literal, Mapping, Any
3
4
 
4
5
 
5
6
  class ThinkingStepDetails(BaseModel):
@@ -25,7 +26,9 @@ class ToolResponseStepDetails(BaseModel):
25
26
  tool_call_id: str
26
27
 
27
28
 
28
- StepDetails = Union[ThinkingStepDetails, ToolCallsStepDetails, ToolResponseStepDetails]
29
+ StepDetails = Union[
30
+ ThinkingStepDetails, ToolCallsStepDetails, ToolResponseStepDetails
31
+ ]
29
32
 
30
33
 
31
34
  class DeltaMessageChoice(BaseModel):
@@ -59,8 +62,12 @@ class ThreadRunStepDeltaData(BaseEventData):
59
62
 
60
63
 
61
64
  class UniversalData(BaseEventData):
62
- object: Union[Literal["thread.message.delta"], Literal["thread.run.step.delta"],
63
- Literal["thread.run.step.created"], Literal["thread.run.step.completed"]]
65
+ object: Union[
66
+ Literal["thread.message.delta"],
67
+ Literal["thread.run.step.delta"],
68
+ Literal["thread.run.step.created"],
69
+ Literal["thread.run.step.completed"],
70
+ ]
64
71
  choices: List[ThreadMessageDeltaChoice]
65
72
  choices: List[Union[ThreadMessageDeltaChoice, dict]]
66
73
 
@@ -68,4 +75,4 @@ class UniversalData(BaseEventData):
68
75
  class SchemaValidationResults(BaseModel):
69
76
  success: bool
70
77
  logged_events: List[str]
71
- messages: List[Mapping[Any, Any]]
78
+ messages: List[Mapping[Any, Any]]
@@ -1,61 +1,68 @@
1
- import requests
2
- import os
3
- import yaml
4
1
  import json
5
- import rich
2
+ import os
6
3
  import time
7
- from pydantic import BaseModel
8
- from typing import List, Generator, Dict, Tuple, Mapping, Any
9
- from enum import Enum
10
4
  from collections import deque
5
+ import urllib3
6
+ from urllib3.exceptions import InsecureRequestWarning
7
+ from enum import Enum
8
+ from typing import Any, Dict, Generator, List, Mapping, Tuple
11
9
 
10
+ import requests
11
+ import rich
12
+ import yaml
13
+ from pydantic import BaseModel
14
+
15
+ from wxo_agentic_evaluation.arg_configs import TestConfig
16
+ from wxo_agentic_evaluation.llm_user import LLMUser
17
+ from wxo_agentic_evaluation.service_instance import tenant_setup
18
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import (
19
+ WatsonXProvider,
20
+ )
12
21
  from wxo_agentic_evaluation.type import (
13
22
  ContentType,
14
- Message,
23
+ ConversationalConfidenceThresholdScore,
15
24
  ConversationalSearch,
16
25
  ConversationalSearchCitations,
17
26
  ConversationalSearchResultMetadata,
18
- ConversationalConfidenceThresholdScore,
19
27
  ConversationalSearchResults,
20
28
  ConversationSearchMetadata,
29
+ Message,
21
30
  )
22
- from wxo_agentic_evaluation.llm_user import LLMUser
23
- from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
24
- from wxo_agentic_evaluation.arg_configs import TestConfig
25
- from wxo_agentic_evaluation.service_instance import tenant_setup
26
31
  from wxo_agentic_evaluation.utils.utils import (
32
+ Tokenizer,
27
33
  is_saas_url,
28
34
  safe_divide,
29
- Tokenizer
30
35
  )
31
36
 
32
37
  tokenizer = Tokenizer()
33
38
 
39
+
34
40
  class Roles(Enum):
35
41
  ASSISTANT = "assistant"
36
42
  USER = "user"
37
43
 
38
- def calculate_word_overlap_similarity_score(first_message_text: str, second_message_text: str) -> float:
44
+
45
+ def calculate_word_overlap_similarity_score(
46
+ first_message_text: str, second_message_text: str
47
+ ) -> float:
39
48
  """Calculate the word overlap similarity score between the .content field of two Message objects.
40
49
  Args:
41
50
  first_message_text (str): The .content field of the first message.
42
51
  second_message_text (str): The .content field of the second message.
43
52
  """
44
-
53
+
45
54
  words_in_first_message = tokenizer(first_message_text)
46
55
  words_in_second_message = tokenizer(second_message_text)
47
56
 
48
57
  # Calculate the number of common words
49
58
  common_words = set(words_in_first_message) & set(words_in_second_message)
50
59
  unique_words = set(words_in_first_message + words_in_second_message)
51
-
60
+
52
61
  unique_words_count = len(unique_words)
53
62
  common_words_count = len(common_words)
54
63
 
55
- return safe_divide(
56
- common_words_count,
57
- unique_words_count
58
- )
64
+ return safe_divide(common_words_count, unique_words_count)
65
+
59
66
 
60
67
  def is_transfer_response(step_detail: Dict):
61
68
  # this is not very reliable
@@ -77,6 +84,13 @@ class WXOClient:
77
84
  self.service_url = service_url
78
85
  self.api_key = api_key
79
86
 
87
+ env_ssl_verify = os.getenv("WO_SSL_VERIFY", "true")
88
+ verify = isinstance(env_ssl_verify, str) and env_ssl_verify.strip().lower() == "true"
89
+ self._verify_ssl = verify
90
+
91
+ if not self._verify_ssl:
92
+ urllib3.disable_warnings(InsecureRequestWarning)
93
+
80
94
  def _get_headers(self) -> dict:
81
95
  headers = {}
82
96
  if self.api_key:
@@ -86,12 +100,12 @@ class WXOClient:
86
100
  def post(self, payload: dict, path: str, stream=False):
87
101
  url = f"{self.service_url}/{path}"
88
102
  return requests.post(
89
- url=url, headers=self._get_headers(), json=payload, stream=stream
103
+ url=url, headers=self._get_headers(), json=payload, stream=stream, verify=self._verify_ssl
90
104
  )
91
105
 
92
106
  def get(self, path: str, params: dict = None):
93
107
  url = f"{self.service_url}/{path}"
94
- return requests.get(url, params=params, headers=self._get_headers())
108
+ return requests.get(url, params=params, headers=self._get_headers(), verify=self._verify_ssl)
95
109
 
96
110
 
97
111
  class WXOInferenceBackend:
@@ -135,7 +149,9 @@ class WXOInferenceBackend:
135
149
  else:
136
150
  path = "v1/orchestrate/runs?stream=true"
137
151
 
138
- response: requests.Response = self.wxo_client.post(payload, path, stream=True)
152
+ response: requests.Response = self.wxo_client.post(
153
+ payload, path, stream=True
154
+ )
139
155
  import json
140
156
 
141
157
  for chunk in self._parse_events(response):
@@ -188,7 +204,9 @@ class WXOInferenceBackend:
188
204
  citations = parse_citations()
189
205
  retrieval_context = parsed_search_results()
190
206
  citations_title = conversational_search.get("citations_title", "")
191
- response_length_option = conversational_search.get("response_length_option", "")
207
+ response_length_option = conversational_search.get(
208
+ "response_length_option", ""
209
+ )
192
210
  text = conversational_search.get("text", "")
193
211
 
194
212
  confidence_scores = ConversationalConfidenceThresholdScore(
@@ -261,7 +279,9 @@ class WXOInferenceBackend:
261
279
  )
262
280
  )
263
281
  end_time = time.time()
264
- call_tracker.tool_call.append(end_time - start_time)
282
+ call_tracker.tool_call.append(
283
+ end_time - start_time
284
+ )
265
285
  start_time = end_time
266
286
  elif step_detail["type"] == "tool_call":
267
287
  # in step details, we could have [tool_response, tool_call]
@@ -279,7 +299,9 @@ class WXOInferenceBackend:
279
299
  )
280
300
  )
281
301
  end_time = time.time()
282
- call_tracker.tool_call.append(end_time - start_time)
302
+ call_tracker.tool_call.append(
303
+ end_time - start_time
304
+ )
283
305
  start_time = end_time
284
306
  elif step_detail["type"] == "tool_response":
285
307
  content = json.dumps(step_detail)
@@ -293,7 +315,9 @@ class WXOInferenceBackend:
293
315
  )
294
316
  )
295
317
  end_time = time.time()
296
- call_tracker.tool_response.append(end_time - start_time)
318
+ call_tracker.tool_response.append(
319
+ end_time - start_time
320
+ )
297
321
  start_time = end_time
298
322
  elif content_field := delta.get("content"):
299
323
  for val in content_field:
@@ -312,7 +336,9 @@ class WXOInferenceBackend:
312
336
  chunk=event,
313
337
  )
314
338
  end_time = time.time()
315
- call_tracker.generic.append(end_time - start_time)
339
+ call_tracker.generic.append(
340
+ end_time - start_time
341
+ )
316
342
  start_time = end_time
317
343
 
318
344
  # NOTE: The event here that is parsed is part of the "message.created" event
@@ -336,10 +362,14 @@ class WXOInferenceBackend:
336
362
  """
337
363
 
338
364
  last_message = json.loads(messages[-1].content)
339
- tool_call_id = last_message.get("tool_call_id", None)
365
+ tool_call_id = last_message.get(
366
+ "tool_call_id", None
367
+ )
340
368
  assert tool_call_id is not None
341
- conversational_search_metadata = ConversationSearchMetadata(
342
- tool_call_id=tool_call_id
369
+ conversational_search_metadata = (
370
+ ConversationSearchMetadata(
371
+ tool_call_id=tool_call_id
372
+ )
343
373
  )
344
374
  conversational_search = (
345
375
  self.parse_conversational_search_response(
@@ -347,7 +377,9 @@ class WXOInferenceBackend:
347
377
  metadata=conversational_search_metadata,
348
378
  )
349
379
  )
350
- conversational_search_data.append(conversational_search)
380
+ conversational_search_data.append(
381
+ conversational_search
382
+ )
351
383
  messages.append(
352
384
  Message(
353
385
  role=role,
@@ -436,7 +468,10 @@ class WXOInferenceBackend:
436
468
  content = json.dumps(tool_json)
437
469
  # TO-DO: review do we even need the get messages for retry loop anymore?
438
470
  if msg_content := entry.get("content"):
439
- if msg_content[0].get("response_type") == "conversational_search":
471
+ if (
472
+ msg_content[0].get("response_type")
473
+ == "conversational_search"
474
+ ):
440
475
  continue
441
476
  messages.append(
442
477
  Message(
@@ -451,7 +486,9 @@ class WXOInferenceBackend:
451
486
  content = json.dumps(step_detail)
452
487
  messages.append(
453
488
  Message(
454
- role=role, content=content, type=content_type
489
+ role=role,
490
+ content=content,
491
+ type=content_type,
455
492
  )
456
493
  )
457
494
  else:
@@ -459,7 +496,9 @@ class WXOInferenceBackend:
459
496
  content_type = ContentType.tool_response
460
497
  messages.append(
461
498
  Message(
462
- role=role, content=content, type=content_type
499
+ role=role,
500
+ content=content,
501
+ type=content_type,
463
502
  )
464
503
  )
465
504
  if content_field := entry.get("content"):
@@ -468,12 +507,19 @@ class WXOInferenceBackend:
468
507
  if val["response_type"] == ContentType.text:
469
508
  messages.append(
470
509
  Message(
471
- role=role, content=val["text"], type=ContentType.text
510
+ role=role,
511
+ content=val["text"],
512
+ type=ContentType.text,
472
513
  )
473
514
  )
474
- if val["response_type"] == ContentType.conversational_search:
475
- conversational_search_metadata = ConversationSearchMetadata(
476
- tool_call_id=tool_call_id
515
+ if (
516
+ val["response_type"]
517
+ == ContentType.conversational_search
518
+ ):
519
+ conversational_search_metadata = (
520
+ ConversationSearchMetadata(
521
+ tool_call_id=tool_call_id
522
+ )
477
523
  )
478
524
  messages.append(
479
525
  Message(
@@ -538,8 +584,12 @@ class WXOInferenceBackend:
538
584
  class EvaluationController:
539
585
 
540
586
  MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
541
- MESSAGE_SIMILARITY_THRESHOLD = float(os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)) # if any two consecutive messages are >98% similar, the inference loop will be terminated
542
- MAX_REPEATING_MESSAGES = int(os.getenv("MAX_REPEATING_MESSAGES", 3)) # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
587
+ MESSAGE_SIMILARITY_THRESHOLD = float(
588
+ os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)
589
+ ) # if any two consecutive messages are >98% similar, the inference loop will be terminated
590
+ MAX_REPEATING_MESSAGES = int(
591
+ os.getenv("MAX_REPEATING_MESSAGES", 3)
592
+ ) # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
543
593
 
544
594
  def __init__(
545
595
  self,
@@ -554,11 +604,20 @@ class EvaluationController:
554
604
 
555
605
  if self.repeating_output_detection:
556
606
  # Use deque for efficient O(1) operations
557
- self.recent_user_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
558
- self.recent_assistant_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
607
+ self.recent_user_messages = deque(
608
+ maxlen=self.MAX_REPEATING_MESSAGES
609
+ )
610
+ self.recent_assistant_messages = deque(
611
+ maxlen=self.MAX_REPEATING_MESSAGES
612
+ )
559
613
 
560
614
  def run(
561
- self, task_n, story, agent_name: str, starting_user_input: str = None, attack_instructions: str = None
615
+ self,
616
+ task_n,
617
+ story,
618
+ agent_name: str,
619
+ starting_user_input: str = None,
620
+ attack_instructions: str = None,
562
621
  ) -> Tuple[List[Message], List[CallTracker], List[ConversationalSearch]]:
563
622
  step = 0
564
623
  thread_id = None
@@ -570,7 +629,9 @@ class EvaluationController:
570
629
  while step < self.MAX_CONVERSATION_STEPS:
571
630
  if step == 0 and starting_user_input:
572
631
  user_input = Message(
573
- role="user", content=starting_user_input, type=ContentType.text
632
+ role="user",
633
+ content=starting_user_input,
634
+ type=ContentType.text,
574
635
  )
575
636
  else:
576
637
  if self.config.enable_manual_user_input == True:
@@ -582,7 +643,9 @@ class EvaluationController:
582
643
  )
583
644
  else: # llm
584
645
  user_input = self.llm_user.generate_user_input(
585
- story, conversation_history, attack_instructions=attack_instructions
646
+ story,
647
+ conversation_history,
648
+ attack_instructions=attack_instructions,
586
649
  )
587
650
  if self.config.enable_verbose_logging:
588
651
  rich.print(
@@ -592,26 +655,33 @@ class EvaluationController:
592
655
 
593
656
  if self._is_end(user_input):
594
657
  break
595
-
658
+
596
659
  if self.repeating_output_detection:
597
660
  self.recent_user_messages.append(user_input.content)
598
661
 
599
662
  conversation_history.append(user_input)
600
-
601
- messages, thread_id, conversational_search_data = (
602
- self.wxo_inference_backend.stream_messages(
603
- user_input,
604
- agent_name=agent_name,
605
- thread_id=thread_id,
606
- call_tracker=call_tracker,
607
- )
663
+
664
+ (
665
+ messages,
666
+ thread_id,
667
+ conversational_search_data,
668
+ ) = self.wxo_inference_backend.stream_messages(
669
+ user_input,
670
+ agent_name=agent_name,
671
+ thread_id=thread_id,
672
+ call_tracker=call_tracker,
608
673
  )
609
674
  if not messages:
610
- raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
611
-
675
+ raise RuntimeError(
676
+ f"[Task-{task_n}] No messages is produced. Exiting task."
677
+ )
678
+
612
679
  for message in messages:
613
680
  if self.repeating_output_detection:
614
- if message.role == Roles.ASSISTANT and message.type == ContentType.text:
681
+ if (
682
+ message.role == Roles.ASSISTANT
683
+ and message.type == ContentType.text
684
+ ):
615
685
  self.recent_assistant_messages.append(message.content)
616
686
 
617
687
  if self.config.enable_verbose_logging:
@@ -621,11 +691,17 @@ class EvaluationController:
621
691
  )
622
692
 
623
693
  conversation_history.extend(messages)
624
- conversational_search_history_data.extend(conversational_search_data)
625
-
694
+ conversational_search_history_data.extend(
695
+ conversational_search_data
696
+ )
697
+
626
698
  step += 1
627
- return conversation_history, call_tracker, conversational_search_history_data
628
-
699
+ return (
700
+ conversation_history,
701
+ call_tracker,
702
+ conversational_search_history_data,
703
+ )
704
+
629
705
  def _is_looping(self, messages: deque) -> bool:
630
706
  """Checks whether the user or assistant is stuck in a loop.
631
707
  Args:
@@ -634,7 +710,7 @@ class EvaluationController:
634
710
  bool: True if stuck in a loop, False otherwise.
635
711
  """
636
712
  sim_count = 0
637
-
713
+
638
714
  if len(messages) >= self.MAX_REPEATING_MESSAGES:
639
715
  oldest_cached_message = messages[0]
640
716
  for i, old_message in enumerate(messages):
@@ -642,11 +718,16 @@ class EvaluationController:
642
718
  continue
643
719
  if oldest_cached_message == old_message:
644
720
  sim_count += 1
645
- elif calculate_word_overlap_similarity_score(oldest_cached_message, old_message) > self.MESSAGE_SIMILARITY_THRESHOLD:
721
+ elif (
722
+ calculate_word_overlap_similarity_score(
723
+ oldest_cached_message, old_message
724
+ )
725
+ > self.MESSAGE_SIMILARITY_THRESHOLD
726
+ ):
646
727
  sim_count += 1
647
-
728
+
648
729
  return sim_count >= self.MAX_REPEATING_MESSAGES - 1
649
-
730
+
650
731
  def _is_end(self, current_user_input: Message) -> bool:
651
732
  """
652
733
  Check if the user input indicates the end of the conversation.
@@ -664,14 +745,16 @@ class EvaluationController:
664
745
  # Check if the user message contains 'END'
665
746
  if "END" in current_user_message_content:
666
747
  return True
667
-
748
+
668
749
  if self.repeating_output_detection:
669
750
  # Check for repeating user or assistant messages
670
- if (self._is_looping(self.recent_user_messages) or
671
- self._is_looping(self.recent_assistant_messages)):
751
+ if self._is_looping(self.recent_user_messages) or self._is_looping(
752
+ self.recent_assistant_messages
753
+ ):
672
754
  return True
673
-
674
- return False # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
755
+
756
+ return False # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
757
+
675
758
 
676
759
  def get_wxo_client(
677
760
  service_url: str, tenant_name: str, token: str = None
@@ -684,7 +767,9 @@ def get_wxo_client(
684
767
 
685
768
  if __name__ == "__main__":
686
769
  wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
687
- auth_config_path = f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
770
+ auth_config_path = (
771
+ f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
772
+ )
688
773
  with open(auth_config_path, "r") as f:
689
774
  auth_config = yaml.safe_load(f)
690
775
  tenant_name = "local"
@@ -1,9 +1,10 @@
1
- from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
1
+ from typing import List
2
+
2
3
  from wxo_agentic_evaluation.prompt.template_render import (
3
4
  KeywordMatchingTemplateRenderer,
4
5
  SemanticMatchingTemplateRenderer,
5
6
  )
6
- from typing import List
7
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
7
8
 
8
9
 
9
10
  class LLMMatcher:
@@ -26,7 +27,7 @@ class LLMMatcher:
26
27
  prompt = self.keyword_template.render(
27
28
  keywords_text=keywords_text, response_text=response_text
28
29
  )
29
- output:str = self.llm_client.query(prompt)
30
+ output: str = self.llm_client.query(prompt)
30
31
  result = output.strip().lower()
31
32
  return result.startswith("true")
32
33
 
@@ -1,12 +1,15 @@
1
- from typing import List
2
1
  import json
2
+ from typing import List
3
3
 
4
- from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
4
+ from wxo_agentic_evaluation.metrics.llm_as_judge import (
5
+ AnswerRelevancy,
6
+ Faithfulness,
7
+ )
5
8
  from wxo_agentic_evaluation.prompt.template_render import (
6
- FaithfulnessTemplateRenderer,
7
9
  AnswerRelevancyTemplateRenderer,
10
+ FaithfulnessTemplateRenderer,
8
11
  )
9
- from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness, AnswerRelevancy
12
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
10
13
 
11
14
 
12
15
  class LLMJudge:
@@ -1,7 +1,8 @@
1
1
  from typing import List, TypeVar
2
- from wxo_agentic_evaluation.type import Message, ContentType
3
- from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
2
+
4
3
  from wxo_agentic_evaluation.prompt.template_render import JinjaTemplateRenderer
4
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
5
+ from wxo_agentic_evaluation.type import ContentType, Message
5
6
 
6
7
  T = TypeVar("T", bound=JinjaTemplateRenderer)
7
8
 
@@ -17,7 +18,10 @@ class LLMUser:
17
18
  )
18
19
 
19
20
  def generate_user_input(
20
- self, user_story, conversation_history: List[Message], attack_instructions: str = None
21
+ self,
22
+ user_story,
23
+ conversation_history: List[Message],
24
+ attack_instructions: str = None,
21
25
  ) -> Message | None:
22
26
  # the tool response is already summarized, we don't need that to take over the chat history context window
23
27
  prompt_input = self.prompt_template.render(