ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
- wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
- wxo_agentic_evaluation/analytics/tools/main.py +18 -7
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +69 -48
- wxo_agentic_evaluation/annotate.py +6 -4
- wxo_agentic_evaluation/arg_configs.py +8 -2
- wxo_agentic_evaluation/batch_annotate.py +78 -25
- wxo_agentic_evaluation/data_annotator.py +18 -13
- wxo_agentic_evaluation/description_quality_checker.py +20 -14
- wxo_agentic_evaluation/evaluation_package.py +114 -70
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
- wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
- wxo_agentic_evaluation/external_agent/types.py +12 -5
- wxo_agentic_evaluation/inference_backend.py +158 -73
- wxo_agentic_evaluation/llm_matching.py +4 -3
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_user.py +7 -3
- wxo_agentic_evaluation/main.py +175 -67
- wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
- wxo_agentic_evaluation/metrics/metrics.py +26 -12
- wxo_agentic_evaluation/prompt/template_render.py +32 -11
- wxo_agentic_evaluation/quick_eval.py +49 -23
- wxo_agentic_evaluation/record_chat.py +70 -33
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
- wxo_agentic_evaluation/resource_map.py +2 -1
- wxo_agentic_evaluation/service_instance.py +24 -11
- wxo_agentic_evaluation/service_provider/__init__.py +33 -13
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
- wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
- wxo_agentic_evaluation/service_provider/provider.py +0 -1
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
- wxo_agentic_evaluation/tool_planner.py +128 -44
- wxo_agentic_evaluation/type.py +12 -9
- wxo_agentic_evaluation/utils/__init__.py +1 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
- wxo_agentic_evaluation/utils/rich_utils.py +23 -9
- wxo_agentic_evaluation/utils/utils.py +83 -52
- ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,6 @@
|
|
|
1
|
+
from typing import Any, List, Literal, Mapping, Union
|
|
2
|
+
|
|
1
3
|
from pydantic import BaseModel
|
|
2
|
-
from typing import List, Union, Literal, Mapping, Any
|
|
3
4
|
|
|
4
5
|
|
|
5
6
|
class ThinkingStepDetails(BaseModel):
|
|
@@ -25,7 +26,9 @@ class ToolResponseStepDetails(BaseModel):
|
|
|
25
26
|
tool_call_id: str
|
|
26
27
|
|
|
27
28
|
|
|
28
|
-
StepDetails = Union[
|
|
29
|
+
StepDetails = Union[
|
|
30
|
+
ThinkingStepDetails, ToolCallsStepDetails, ToolResponseStepDetails
|
|
31
|
+
]
|
|
29
32
|
|
|
30
33
|
|
|
31
34
|
class DeltaMessageChoice(BaseModel):
|
|
@@ -59,8 +62,12 @@ class ThreadRunStepDeltaData(BaseEventData):
|
|
|
59
62
|
|
|
60
63
|
|
|
61
64
|
class UniversalData(BaseEventData):
|
|
62
|
-
object: Union[
|
|
63
|
-
|
|
65
|
+
object: Union[
|
|
66
|
+
Literal["thread.message.delta"],
|
|
67
|
+
Literal["thread.run.step.delta"],
|
|
68
|
+
Literal["thread.run.step.created"],
|
|
69
|
+
Literal["thread.run.step.completed"],
|
|
70
|
+
]
|
|
64
71
|
choices: List[ThreadMessageDeltaChoice]
|
|
65
72
|
choices: List[Union[ThreadMessageDeltaChoice, dict]]
|
|
66
73
|
|
|
@@ -68,4 +75,4 @@ class UniversalData(BaseEventData):
|
|
|
68
75
|
class SchemaValidationResults(BaseModel):
|
|
69
76
|
success: bool
|
|
70
77
|
logged_events: List[str]
|
|
71
|
-
messages: List[Mapping[Any, Any]]
|
|
78
|
+
messages: List[Mapping[Any, Any]]
|
|
@@ -1,61 +1,68 @@
|
|
|
1
|
-
import requests
|
|
2
|
-
import os
|
|
3
|
-
import yaml
|
|
4
1
|
import json
|
|
5
|
-
import
|
|
2
|
+
import os
|
|
6
3
|
import time
|
|
7
|
-
from pydantic import BaseModel
|
|
8
|
-
from typing import List, Generator, Dict, Tuple, Mapping, Any
|
|
9
|
-
from enum import Enum
|
|
10
4
|
from collections import deque
|
|
5
|
+
import urllib3
|
|
6
|
+
from urllib3.exceptions import InsecureRequestWarning
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from typing import Any, Dict, Generator, List, Mapping, Tuple
|
|
11
9
|
|
|
10
|
+
import requests
|
|
11
|
+
import rich
|
|
12
|
+
import yaml
|
|
13
|
+
from pydantic import BaseModel
|
|
14
|
+
|
|
15
|
+
from wxo_agentic_evaluation.arg_configs import TestConfig
|
|
16
|
+
from wxo_agentic_evaluation.llm_user import LLMUser
|
|
17
|
+
from wxo_agentic_evaluation.service_instance import tenant_setup
|
|
18
|
+
from wxo_agentic_evaluation.service_provider.watsonx_provider import (
|
|
19
|
+
WatsonXProvider,
|
|
20
|
+
)
|
|
12
21
|
from wxo_agentic_evaluation.type import (
|
|
13
22
|
ContentType,
|
|
14
|
-
|
|
23
|
+
ConversationalConfidenceThresholdScore,
|
|
15
24
|
ConversationalSearch,
|
|
16
25
|
ConversationalSearchCitations,
|
|
17
26
|
ConversationalSearchResultMetadata,
|
|
18
|
-
ConversationalConfidenceThresholdScore,
|
|
19
27
|
ConversationalSearchResults,
|
|
20
28
|
ConversationSearchMetadata,
|
|
29
|
+
Message,
|
|
21
30
|
)
|
|
22
|
-
from wxo_agentic_evaluation.llm_user import LLMUser
|
|
23
|
-
from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
|
|
24
|
-
from wxo_agentic_evaluation.arg_configs import TestConfig
|
|
25
|
-
from wxo_agentic_evaluation.service_instance import tenant_setup
|
|
26
31
|
from wxo_agentic_evaluation.utils.utils import (
|
|
32
|
+
Tokenizer,
|
|
27
33
|
is_saas_url,
|
|
28
34
|
safe_divide,
|
|
29
|
-
Tokenizer
|
|
30
35
|
)
|
|
31
36
|
|
|
32
37
|
tokenizer = Tokenizer()
|
|
33
38
|
|
|
39
|
+
|
|
34
40
|
class Roles(Enum):
|
|
35
41
|
ASSISTANT = "assistant"
|
|
36
42
|
USER = "user"
|
|
37
43
|
|
|
38
|
-
|
|
44
|
+
|
|
45
|
+
def calculate_word_overlap_similarity_score(
|
|
46
|
+
first_message_text: str, second_message_text: str
|
|
47
|
+
) -> float:
|
|
39
48
|
"""Calculate the word overlap similarity score between the .content field of two Message objects.
|
|
40
49
|
Args:
|
|
41
50
|
first_message_text (str): The .content field of the first message.
|
|
42
51
|
second_message_text (str): The .content field of the second message.
|
|
43
52
|
"""
|
|
44
|
-
|
|
53
|
+
|
|
45
54
|
words_in_first_message = tokenizer(first_message_text)
|
|
46
55
|
words_in_second_message = tokenizer(second_message_text)
|
|
47
56
|
|
|
48
57
|
# Calculate the number of common words
|
|
49
58
|
common_words = set(words_in_first_message) & set(words_in_second_message)
|
|
50
59
|
unique_words = set(words_in_first_message + words_in_second_message)
|
|
51
|
-
|
|
60
|
+
|
|
52
61
|
unique_words_count = len(unique_words)
|
|
53
62
|
common_words_count = len(common_words)
|
|
54
63
|
|
|
55
|
-
return safe_divide(
|
|
56
|
-
|
|
57
|
-
unique_words_count
|
|
58
|
-
)
|
|
64
|
+
return safe_divide(common_words_count, unique_words_count)
|
|
65
|
+
|
|
59
66
|
|
|
60
67
|
def is_transfer_response(step_detail: Dict):
|
|
61
68
|
# this is not very reliable
|
|
@@ -77,6 +84,13 @@ class WXOClient:
|
|
|
77
84
|
self.service_url = service_url
|
|
78
85
|
self.api_key = api_key
|
|
79
86
|
|
|
87
|
+
env_ssl_verify = os.getenv("WO_SSL_VERIFY", "true")
|
|
88
|
+
verify = isinstance(env_ssl_verify, str) and env_ssl_verify.strip().lower() == "true"
|
|
89
|
+
self._verify_ssl = verify
|
|
90
|
+
|
|
91
|
+
if not self._verify_ssl:
|
|
92
|
+
urllib3.disable_warnings(InsecureRequestWarning)
|
|
93
|
+
|
|
80
94
|
def _get_headers(self) -> dict:
|
|
81
95
|
headers = {}
|
|
82
96
|
if self.api_key:
|
|
@@ -86,12 +100,12 @@ class WXOClient:
|
|
|
86
100
|
def post(self, payload: dict, path: str, stream=False):
|
|
87
101
|
url = f"{self.service_url}/{path}"
|
|
88
102
|
return requests.post(
|
|
89
|
-
url=url, headers=self._get_headers(), json=payload, stream=stream
|
|
103
|
+
url=url, headers=self._get_headers(), json=payload, stream=stream, verify=self._verify_ssl
|
|
90
104
|
)
|
|
91
105
|
|
|
92
106
|
def get(self, path: str, params: dict = None):
|
|
93
107
|
url = f"{self.service_url}/{path}"
|
|
94
|
-
return requests.get(url, params=params, headers=self._get_headers())
|
|
108
|
+
return requests.get(url, params=params, headers=self._get_headers(), verify=self._verify_ssl)
|
|
95
109
|
|
|
96
110
|
|
|
97
111
|
class WXOInferenceBackend:
|
|
@@ -135,7 +149,9 @@ class WXOInferenceBackend:
|
|
|
135
149
|
else:
|
|
136
150
|
path = "v1/orchestrate/runs?stream=true"
|
|
137
151
|
|
|
138
|
-
response: requests.Response = self.wxo_client.post(
|
|
152
|
+
response: requests.Response = self.wxo_client.post(
|
|
153
|
+
payload, path, stream=True
|
|
154
|
+
)
|
|
139
155
|
import json
|
|
140
156
|
|
|
141
157
|
for chunk in self._parse_events(response):
|
|
@@ -188,7 +204,9 @@ class WXOInferenceBackend:
|
|
|
188
204
|
citations = parse_citations()
|
|
189
205
|
retrieval_context = parsed_search_results()
|
|
190
206
|
citations_title = conversational_search.get("citations_title", "")
|
|
191
|
-
response_length_option = conversational_search.get(
|
|
207
|
+
response_length_option = conversational_search.get(
|
|
208
|
+
"response_length_option", ""
|
|
209
|
+
)
|
|
192
210
|
text = conversational_search.get("text", "")
|
|
193
211
|
|
|
194
212
|
confidence_scores = ConversationalConfidenceThresholdScore(
|
|
@@ -261,7 +279,9 @@ class WXOInferenceBackend:
|
|
|
261
279
|
)
|
|
262
280
|
)
|
|
263
281
|
end_time = time.time()
|
|
264
|
-
call_tracker.tool_call.append(
|
|
282
|
+
call_tracker.tool_call.append(
|
|
283
|
+
end_time - start_time
|
|
284
|
+
)
|
|
265
285
|
start_time = end_time
|
|
266
286
|
elif step_detail["type"] == "tool_call":
|
|
267
287
|
# in step details, we could have [tool_response, tool_call]
|
|
@@ -279,7 +299,9 @@ class WXOInferenceBackend:
|
|
|
279
299
|
)
|
|
280
300
|
)
|
|
281
301
|
end_time = time.time()
|
|
282
|
-
call_tracker.tool_call.append(
|
|
302
|
+
call_tracker.tool_call.append(
|
|
303
|
+
end_time - start_time
|
|
304
|
+
)
|
|
283
305
|
start_time = end_time
|
|
284
306
|
elif step_detail["type"] == "tool_response":
|
|
285
307
|
content = json.dumps(step_detail)
|
|
@@ -293,7 +315,9 @@ class WXOInferenceBackend:
|
|
|
293
315
|
)
|
|
294
316
|
)
|
|
295
317
|
end_time = time.time()
|
|
296
|
-
call_tracker.tool_response.append(
|
|
318
|
+
call_tracker.tool_response.append(
|
|
319
|
+
end_time - start_time
|
|
320
|
+
)
|
|
297
321
|
start_time = end_time
|
|
298
322
|
elif content_field := delta.get("content"):
|
|
299
323
|
for val in content_field:
|
|
@@ -312,7 +336,9 @@ class WXOInferenceBackend:
|
|
|
312
336
|
chunk=event,
|
|
313
337
|
)
|
|
314
338
|
end_time = time.time()
|
|
315
|
-
call_tracker.generic.append(
|
|
339
|
+
call_tracker.generic.append(
|
|
340
|
+
end_time - start_time
|
|
341
|
+
)
|
|
316
342
|
start_time = end_time
|
|
317
343
|
|
|
318
344
|
# NOTE: The event here that is parsed is part of the "message.created" event
|
|
@@ -336,10 +362,14 @@ class WXOInferenceBackend:
|
|
|
336
362
|
"""
|
|
337
363
|
|
|
338
364
|
last_message = json.loads(messages[-1].content)
|
|
339
|
-
tool_call_id = last_message.get(
|
|
365
|
+
tool_call_id = last_message.get(
|
|
366
|
+
"tool_call_id", None
|
|
367
|
+
)
|
|
340
368
|
assert tool_call_id is not None
|
|
341
|
-
conversational_search_metadata =
|
|
342
|
-
|
|
369
|
+
conversational_search_metadata = (
|
|
370
|
+
ConversationSearchMetadata(
|
|
371
|
+
tool_call_id=tool_call_id
|
|
372
|
+
)
|
|
343
373
|
)
|
|
344
374
|
conversational_search = (
|
|
345
375
|
self.parse_conversational_search_response(
|
|
@@ -347,7 +377,9 @@ class WXOInferenceBackend:
|
|
|
347
377
|
metadata=conversational_search_metadata,
|
|
348
378
|
)
|
|
349
379
|
)
|
|
350
|
-
conversational_search_data.append(
|
|
380
|
+
conversational_search_data.append(
|
|
381
|
+
conversational_search
|
|
382
|
+
)
|
|
351
383
|
messages.append(
|
|
352
384
|
Message(
|
|
353
385
|
role=role,
|
|
@@ -436,7 +468,10 @@ class WXOInferenceBackend:
|
|
|
436
468
|
content = json.dumps(tool_json)
|
|
437
469
|
# TO-DO: review do we even need the get messages for retry loop anymore?
|
|
438
470
|
if msg_content := entry.get("content"):
|
|
439
|
-
if
|
|
471
|
+
if (
|
|
472
|
+
msg_content[0].get("response_type")
|
|
473
|
+
== "conversational_search"
|
|
474
|
+
):
|
|
440
475
|
continue
|
|
441
476
|
messages.append(
|
|
442
477
|
Message(
|
|
@@ -451,7 +486,9 @@ class WXOInferenceBackend:
|
|
|
451
486
|
content = json.dumps(step_detail)
|
|
452
487
|
messages.append(
|
|
453
488
|
Message(
|
|
454
|
-
role=role,
|
|
489
|
+
role=role,
|
|
490
|
+
content=content,
|
|
491
|
+
type=content_type,
|
|
455
492
|
)
|
|
456
493
|
)
|
|
457
494
|
else:
|
|
@@ -459,7 +496,9 @@ class WXOInferenceBackend:
|
|
|
459
496
|
content_type = ContentType.tool_response
|
|
460
497
|
messages.append(
|
|
461
498
|
Message(
|
|
462
|
-
role=role,
|
|
499
|
+
role=role,
|
|
500
|
+
content=content,
|
|
501
|
+
type=content_type,
|
|
463
502
|
)
|
|
464
503
|
)
|
|
465
504
|
if content_field := entry.get("content"):
|
|
@@ -468,12 +507,19 @@ class WXOInferenceBackend:
|
|
|
468
507
|
if val["response_type"] == ContentType.text:
|
|
469
508
|
messages.append(
|
|
470
509
|
Message(
|
|
471
|
-
role=role,
|
|
510
|
+
role=role,
|
|
511
|
+
content=val["text"],
|
|
512
|
+
type=ContentType.text,
|
|
472
513
|
)
|
|
473
514
|
)
|
|
474
|
-
if
|
|
475
|
-
|
|
476
|
-
|
|
515
|
+
if (
|
|
516
|
+
val["response_type"]
|
|
517
|
+
== ContentType.conversational_search
|
|
518
|
+
):
|
|
519
|
+
conversational_search_metadata = (
|
|
520
|
+
ConversationSearchMetadata(
|
|
521
|
+
tool_call_id=tool_call_id
|
|
522
|
+
)
|
|
477
523
|
)
|
|
478
524
|
messages.append(
|
|
479
525
|
Message(
|
|
@@ -538,8 +584,12 @@ class WXOInferenceBackend:
|
|
|
538
584
|
class EvaluationController:
|
|
539
585
|
|
|
540
586
|
MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
|
|
541
|
-
MESSAGE_SIMILARITY_THRESHOLD = float(
|
|
542
|
-
|
|
587
|
+
MESSAGE_SIMILARITY_THRESHOLD = float(
|
|
588
|
+
os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)
|
|
589
|
+
) # if any two consecutive messages are >98% similar, the inference loop will be terminated
|
|
590
|
+
MAX_REPEATING_MESSAGES = int(
|
|
591
|
+
os.getenv("MAX_REPEATING_MESSAGES", 3)
|
|
592
|
+
) # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
|
|
543
593
|
|
|
544
594
|
def __init__(
|
|
545
595
|
self,
|
|
@@ -554,11 +604,20 @@ class EvaluationController:
|
|
|
554
604
|
|
|
555
605
|
if self.repeating_output_detection:
|
|
556
606
|
# Use deque for efficient O(1) operations
|
|
557
|
-
self.recent_user_messages = deque(
|
|
558
|
-
|
|
607
|
+
self.recent_user_messages = deque(
|
|
608
|
+
maxlen=self.MAX_REPEATING_MESSAGES
|
|
609
|
+
)
|
|
610
|
+
self.recent_assistant_messages = deque(
|
|
611
|
+
maxlen=self.MAX_REPEATING_MESSAGES
|
|
612
|
+
)
|
|
559
613
|
|
|
560
614
|
def run(
|
|
561
|
-
self,
|
|
615
|
+
self,
|
|
616
|
+
task_n,
|
|
617
|
+
story,
|
|
618
|
+
agent_name: str,
|
|
619
|
+
starting_user_input: str = None,
|
|
620
|
+
attack_instructions: str = None,
|
|
562
621
|
) -> Tuple[List[Message], List[CallTracker], List[ConversationalSearch]]:
|
|
563
622
|
step = 0
|
|
564
623
|
thread_id = None
|
|
@@ -570,7 +629,9 @@ class EvaluationController:
|
|
|
570
629
|
while step < self.MAX_CONVERSATION_STEPS:
|
|
571
630
|
if step == 0 and starting_user_input:
|
|
572
631
|
user_input = Message(
|
|
573
|
-
role="user",
|
|
632
|
+
role="user",
|
|
633
|
+
content=starting_user_input,
|
|
634
|
+
type=ContentType.text,
|
|
574
635
|
)
|
|
575
636
|
else:
|
|
576
637
|
if self.config.enable_manual_user_input == True:
|
|
@@ -582,7 +643,9 @@ class EvaluationController:
|
|
|
582
643
|
)
|
|
583
644
|
else: # llm
|
|
584
645
|
user_input = self.llm_user.generate_user_input(
|
|
585
|
-
story,
|
|
646
|
+
story,
|
|
647
|
+
conversation_history,
|
|
648
|
+
attack_instructions=attack_instructions,
|
|
586
649
|
)
|
|
587
650
|
if self.config.enable_verbose_logging:
|
|
588
651
|
rich.print(
|
|
@@ -592,26 +655,33 @@ class EvaluationController:
|
|
|
592
655
|
|
|
593
656
|
if self._is_end(user_input):
|
|
594
657
|
break
|
|
595
|
-
|
|
658
|
+
|
|
596
659
|
if self.repeating_output_detection:
|
|
597
660
|
self.recent_user_messages.append(user_input.content)
|
|
598
661
|
|
|
599
662
|
conversation_history.append(user_input)
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
663
|
+
|
|
664
|
+
(
|
|
665
|
+
messages,
|
|
666
|
+
thread_id,
|
|
667
|
+
conversational_search_data,
|
|
668
|
+
) = self.wxo_inference_backend.stream_messages(
|
|
669
|
+
user_input,
|
|
670
|
+
agent_name=agent_name,
|
|
671
|
+
thread_id=thread_id,
|
|
672
|
+
call_tracker=call_tracker,
|
|
608
673
|
)
|
|
609
674
|
if not messages:
|
|
610
|
-
raise RuntimeError(
|
|
611
|
-
|
|
675
|
+
raise RuntimeError(
|
|
676
|
+
f"[Task-{task_n}] No messages is produced. Exiting task."
|
|
677
|
+
)
|
|
678
|
+
|
|
612
679
|
for message in messages:
|
|
613
680
|
if self.repeating_output_detection:
|
|
614
|
-
if
|
|
681
|
+
if (
|
|
682
|
+
message.role == Roles.ASSISTANT
|
|
683
|
+
and message.type == ContentType.text
|
|
684
|
+
):
|
|
615
685
|
self.recent_assistant_messages.append(message.content)
|
|
616
686
|
|
|
617
687
|
if self.config.enable_verbose_logging:
|
|
@@ -621,11 +691,17 @@ class EvaluationController:
|
|
|
621
691
|
)
|
|
622
692
|
|
|
623
693
|
conversation_history.extend(messages)
|
|
624
|
-
conversational_search_history_data.extend(
|
|
625
|
-
|
|
694
|
+
conversational_search_history_data.extend(
|
|
695
|
+
conversational_search_data
|
|
696
|
+
)
|
|
697
|
+
|
|
626
698
|
step += 1
|
|
627
|
-
return
|
|
628
|
-
|
|
699
|
+
return (
|
|
700
|
+
conversation_history,
|
|
701
|
+
call_tracker,
|
|
702
|
+
conversational_search_history_data,
|
|
703
|
+
)
|
|
704
|
+
|
|
629
705
|
def _is_looping(self, messages: deque) -> bool:
|
|
630
706
|
"""Checks whether the user or assistant is stuck in a loop.
|
|
631
707
|
Args:
|
|
@@ -634,7 +710,7 @@ class EvaluationController:
|
|
|
634
710
|
bool: True if stuck in a loop, False otherwise.
|
|
635
711
|
"""
|
|
636
712
|
sim_count = 0
|
|
637
|
-
|
|
713
|
+
|
|
638
714
|
if len(messages) >= self.MAX_REPEATING_MESSAGES:
|
|
639
715
|
oldest_cached_message = messages[0]
|
|
640
716
|
for i, old_message in enumerate(messages):
|
|
@@ -642,11 +718,16 @@ class EvaluationController:
|
|
|
642
718
|
continue
|
|
643
719
|
if oldest_cached_message == old_message:
|
|
644
720
|
sim_count += 1
|
|
645
|
-
elif
|
|
721
|
+
elif (
|
|
722
|
+
calculate_word_overlap_similarity_score(
|
|
723
|
+
oldest_cached_message, old_message
|
|
724
|
+
)
|
|
725
|
+
> self.MESSAGE_SIMILARITY_THRESHOLD
|
|
726
|
+
):
|
|
646
727
|
sim_count += 1
|
|
647
|
-
|
|
728
|
+
|
|
648
729
|
return sim_count >= self.MAX_REPEATING_MESSAGES - 1
|
|
649
|
-
|
|
730
|
+
|
|
650
731
|
def _is_end(self, current_user_input: Message) -> bool:
|
|
651
732
|
"""
|
|
652
733
|
Check if the user input indicates the end of the conversation.
|
|
@@ -664,14 +745,16 @@ class EvaluationController:
|
|
|
664
745
|
# Check if the user message contains 'END'
|
|
665
746
|
if "END" in current_user_message_content:
|
|
666
747
|
return True
|
|
667
|
-
|
|
748
|
+
|
|
668
749
|
if self.repeating_output_detection:
|
|
669
750
|
# Check for repeating user or assistant messages
|
|
670
|
-
if
|
|
671
|
-
self.
|
|
751
|
+
if self._is_looping(self.recent_user_messages) or self._is_looping(
|
|
752
|
+
self.recent_assistant_messages
|
|
753
|
+
):
|
|
672
754
|
return True
|
|
673
|
-
|
|
674
|
-
return False
|
|
755
|
+
|
|
756
|
+
return False # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
|
|
757
|
+
|
|
675
758
|
|
|
676
759
|
def get_wxo_client(
|
|
677
760
|
service_url: str, tenant_name: str, token: str = None
|
|
@@ -684,7 +767,9 @@ def get_wxo_client(
|
|
|
684
767
|
|
|
685
768
|
if __name__ == "__main__":
|
|
686
769
|
wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
|
|
687
|
-
auth_config_path =
|
|
770
|
+
auth_config_path = (
|
|
771
|
+
f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
|
|
772
|
+
)
|
|
688
773
|
with open(auth_config_path, "r") as f:
|
|
689
774
|
auth_config = yaml.safe_load(f)
|
|
690
775
|
tenant_name = "local"
|
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
from
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
2
3
|
from wxo_agentic_evaluation.prompt.template_render import (
|
|
3
4
|
KeywordMatchingTemplateRenderer,
|
|
4
5
|
SemanticMatchingTemplateRenderer,
|
|
5
6
|
)
|
|
6
|
-
from
|
|
7
|
+
from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
class LLMMatcher:
|
|
@@ -26,7 +27,7 @@ class LLMMatcher:
|
|
|
26
27
|
prompt = self.keyword_template.render(
|
|
27
28
|
keywords_text=keywords_text, response_text=response_text
|
|
28
29
|
)
|
|
29
|
-
output:str = self.llm_client.query(prompt)
|
|
30
|
+
output: str = self.llm_client.query(prompt)
|
|
30
31
|
result = output.strip().lower()
|
|
31
32
|
return result.startswith("true")
|
|
32
33
|
|
|
@@ -1,12 +1,15 @@
|
|
|
1
|
-
from typing import List
|
|
2
1
|
import json
|
|
2
|
+
from typing import List
|
|
3
3
|
|
|
4
|
-
from wxo_agentic_evaluation.
|
|
4
|
+
from wxo_agentic_evaluation.metrics.llm_as_judge import (
|
|
5
|
+
AnswerRelevancy,
|
|
6
|
+
Faithfulness,
|
|
7
|
+
)
|
|
5
8
|
from wxo_agentic_evaluation.prompt.template_render import (
|
|
6
|
-
FaithfulnessTemplateRenderer,
|
|
7
9
|
AnswerRelevancyTemplateRenderer,
|
|
10
|
+
FaithfulnessTemplateRenderer,
|
|
8
11
|
)
|
|
9
|
-
from wxo_agentic_evaluation.
|
|
12
|
+
from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
|
|
10
13
|
|
|
11
14
|
|
|
12
15
|
class LLMJudge:
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
from typing import List, TypeVar
|
|
2
|
-
|
|
3
|
-
from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
|
|
2
|
+
|
|
4
3
|
from wxo_agentic_evaluation.prompt.template_render import JinjaTemplateRenderer
|
|
4
|
+
from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
|
|
5
|
+
from wxo_agentic_evaluation.type import ContentType, Message
|
|
5
6
|
|
|
6
7
|
T = TypeVar("T", bound=JinjaTemplateRenderer)
|
|
7
8
|
|
|
@@ -17,7 +18,10 @@ class LLMUser:
|
|
|
17
18
|
)
|
|
18
19
|
|
|
19
20
|
def generate_user_input(
|
|
20
|
-
self,
|
|
21
|
+
self,
|
|
22
|
+
user_story,
|
|
23
|
+
conversation_history: List[Message],
|
|
24
|
+
attack_instructions: str = None,
|
|
21
25
|
) -> Message | None:
|
|
22
26
|
# the tool response is already summarized, we don't need that to take over the chat history context window
|
|
23
27
|
prompt_input = self.prompt_template.render(
|