ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA +35 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/RECORD +65 -60
- wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
- wxo_agentic_evaluation/analytics/tools/main.py +18 -7
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +69 -48
- wxo_agentic_evaluation/annotate.py +6 -4
- wxo_agentic_evaluation/arg_configs.py +9 -3
- wxo_agentic_evaluation/batch_annotate.py +78 -25
- wxo_agentic_evaluation/data_annotator.py +18 -13
- wxo_agentic_evaluation/description_quality_checker.py +20 -14
- wxo_agentic_evaluation/evaluation.py +42 -0
- wxo_agentic_evaluation/evaluation_package.py +117 -70
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
- wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
- wxo_agentic_evaluation/external_agent/types.py +12 -5
- wxo_agentic_evaluation/inference_backend.py +183 -79
- wxo_agentic_evaluation/llm_matching.py +4 -3
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_user.py +7 -3
- wxo_agentic_evaluation/main.py +175 -67
- wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
- wxo_agentic_evaluation/metrics/metrics.py +26 -12
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +176 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
- wxo_agentic_evaluation/prompt/template_render.py +32 -11
- wxo_agentic_evaluation/quick_eval.py +49 -23
- wxo_agentic_evaluation/record_chat.py +70 -33
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
- wxo_agentic_evaluation/resource_map.py +2 -1
- wxo_agentic_evaluation/service_instance.py +103 -21
- wxo_agentic_evaluation/service_provider/__init__.py +33 -13
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +216 -34
- wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
- wxo_agentic_evaluation/service_provider/provider.py +0 -1
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
- wxo_agentic_evaluation/tool_planner.py +128 -44
- wxo_agentic_evaluation/type.py +12 -9
- wxo_agentic_evaluation/utils/__init__.py +1 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
- wxo_agentic_evaluation/utils/rich_utils.py +23 -9
- wxo_agentic_evaluation/utils/utils.py +83 -52
- ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,6 @@
|
|
|
1
|
+
from typing import Any, List, Literal, Mapping, Union
|
|
2
|
+
|
|
1
3
|
from pydantic import BaseModel
|
|
2
|
-
from typing import List, Union, Literal, Mapping, Any
|
|
3
4
|
|
|
4
5
|
|
|
5
6
|
class ThinkingStepDetails(BaseModel):
|
|
@@ -25,7 +26,9 @@ class ToolResponseStepDetails(BaseModel):
|
|
|
25
26
|
tool_call_id: str
|
|
26
27
|
|
|
27
28
|
|
|
28
|
-
StepDetails = Union[
|
|
29
|
+
StepDetails = Union[
|
|
30
|
+
ThinkingStepDetails, ToolCallsStepDetails, ToolResponseStepDetails
|
|
31
|
+
]
|
|
29
32
|
|
|
30
33
|
|
|
31
34
|
class DeltaMessageChoice(BaseModel):
|
|
@@ -59,8 +62,12 @@ class ThreadRunStepDeltaData(BaseEventData):
|
|
|
59
62
|
|
|
60
63
|
|
|
61
64
|
class UniversalData(BaseEventData):
|
|
62
|
-
object: Union[
|
|
63
|
-
|
|
65
|
+
object: Union[
|
|
66
|
+
Literal["thread.message.delta"],
|
|
67
|
+
Literal["thread.run.step.delta"],
|
|
68
|
+
Literal["thread.run.step.created"],
|
|
69
|
+
Literal["thread.run.step.completed"],
|
|
70
|
+
]
|
|
64
71
|
choices: List[ThreadMessageDeltaChoice]
|
|
65
72
|
choices: List[Union[ThreadMessageDeltaChoice, dict]]
|
|
66
73
|
|
|
@@ -68,4 +75,4 @@ class UniversalData(BaseEventData):
|
|
|
68
75
|
class SchemaValidationResults(BaseModel):
|
|
69
76
|
success: bool
|
|
70
77
|
logged_events: List[str]
|
|
71
|
-
messages: List[Mapping[Any, Any]]
|
|
78
|
+
messages: List[Mapping[Any, Any]]
|
|
@@ -1,61 +1,68 @@
|
|
|
1
|
-
import requests
|
|
2
|
-
import os
|
|
3
|
-
import yaml
|
|
4
1
|
import json
|
|
5
|
-
import
|
|
2
|
+
import os
|
|
6
3
|
import time
|
|
7
|
-
from pydantic import BaseModel
|
|
8
|
-
from typing import List, Generator, Dict, Tuple, Mapping, Any
|
|
9
|
-
from enum import Enum
|
|
10
4
|
from collections import deque
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
import requests
|
|
9
|
+
import rich
|
|
10
|
+
import urllib3
|
|
11
|
+
import yaml
|
|
12
|
+
from pydantic import BaseModel
|
|
13
|
+
from urllib3.exceptions import InsecureRequestWarning
|
|
11
14
|
|
|
15
|
+
from wxo_agentic_evaluation.arg_configs import TestConfig
|
|
16
|
+
from wxo_agentic_evaluation.llm_user import LLMUser
|
|
17
|
+
from wxo_agentic_evaluation.service_instance import get_env_settings, tenant_setup
|
|
18
|
+
from wxo_agentic_evaluation.service_provider.watsonx_provider import (
|
|
19
|
+
WatsonXProvider,
|
|
20
|
+
)
|
|
12
21
|
from wxo_agentic_evaluation.type import (
|
|
13
22
|
ContentType,
|
|
14
|
-
|
|
23
|
+
ConversationalConfidenceThresholdScore,
|
|
15
24
|
ConversationalSearch,
|
|
16
25
|
ConversationalSearchCitations,
|
|
17
26
|
ConversationalSearchResultMetadata,
|
|
18
|
-
ConversationalConfidenceThresholdScore,
|
|
19
27
|
ConversationalSearchResults,
|
|
20
28
|
ConversationSearchMetadata,
|
|
29
|
+
Message,
|
|
21
30
|
)
|
|
22
|
-
from wxo_agentic_evaluation.llm_user import LLMUser
|
|
23
|
-
from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
|
|
24
|
-
from wxo_agentic_evaluation.arg_configs import TestConfig
|
|
25
|
-
from wxo_agentic_evaluation.service_instance import tenant_setup
|
|
26
31
|
from wxo_agentic_evaluation.utils.utils import (
|
|
32
|
+
Tokenizer,
|
|
27
33
|
is_saas_url,
|
|
28
34
|
safe_divide,
|
|
29
|
-
Tokenizer
|
|
30
35
|
)
|
|
31
36
|
|
|
32
37
|
tokenizer = Tokenizer()
|
|
33
38
|
|
|
39
|
+
|
|
34
40
|
class Roles(Enum):
|
|
35
41
|
ASSISTANT = "assistant"
|
|
36
42
|
USER = "user"
|
|
37
43
|
|
|
38
|
-
|
|
44
|
+
|
|
45
|
+
def calculate_word_overlap_similarity_score(
|
|
46
|
+
first_message_text: str, second_message_text: str
|
|
47
|
+
) -> float:
|
|
39
48
|
"""Calculate the word overlap similarity score between the .content field of two Message objects.
|
|
40
49
|
Args:
|
|
41
50
|
first_message_text (str): The .content field of the first message.
|
|
42
51
|
second_message_text (str): The .content field of the second message.
|
|
43
52
|
"""
|
|
44
|
-
|
|
53
|
+
|
|
45
54
|
words_in_first_message = tokenizer(first_message_text)
|
|
46
55
|
words_in_second_message = tokenizer(second_message_text)
|
|
47
56
|
|
|
48
57
|
# Calculate the number of common words
|
|
49
58
|
common_words = set(words_in_first_message) & set(words_in_second_message)
|
|
50
59
|
unique_words = set(words_in_first_message + words_in_second_message)
|
|
51
|
-
|
|
60
|
+
|
|
52
61
|
unique_words_count = len(unique_words)
|
|
53
62
|
common_words_count = len(common_words)
|
|
54
63
|
|
|
55
|
-
return safe_divide(
|
|
56
|
-
|
|
57
|
-
unique_words_count
|
|
58
|
-
)
|
|
64
|
+
return safe_divide(common_words_count, unique_words_count)
|
|
65
|
+
|
|
59
66
|
|
|
60
67
|
def is_transfer_response(step_detail: Dict):
|
|
61
68
|
# this is not very reliable
|
|
@@ -73,10 +80,23 @@ class CallTracker(BaseModel):
|
|
|
73
80
|
|
|
74
81
|
|
|
75
82
|
class WXOClient:
|
|
76
|
-
def __init__(self, service_url, api_key):
|
|
83
|
+
def __init__(self, service_url, api_key, env: Optional[Dict[str, Any]] = None):
|
|
77
84
|
self.service_url = service_url
|
|
78
85
|
self.api_key = api_key
|
|
79
86
|
|
|
87
|
+
ov = os.getenv("WO_SSL_VERIFY")
|
|
88
|
+
if ov and ov.strip().lower() in ("true", "false"):
|
|
89
|
+
self._verify_ssl = ov.strip().lower() == "true"
|
|
90
|
+
else:
|
|
91
|
+
v, bs = (env.get("verify") if env else None), (env.get("bypass_ssl") if env else None)
|
|
92
|
+
self._verify_ssl = False if (
|
|
93
|
+
(bs is True) or (isinstance(bs, str) and bs.strip().lower() == "true") or
|
|
94
|
+
(v is None) or (isinstance(v, str) and v.strip().lower() in {"none", "null"})
|
|
95
|
+
) else (v if isinstance(v, bool) else True)
|
|
96
|
+
|
|
97
|
+
if not self._verify_ssl:
|
|
98
|
+
urllib3.disable_warnings(InsecureRequestWarning)
|
|
99
|
+
|
|
80
100
|
def _get_headers(self) -> dict:
|
|
81
101
|
headers = {}
|
|
82
102
|
if self.api_key:
|
|
@@ -86,12 +106,21 @@ class WXOClient:
|
|
|
86
106
|
def post(self, payload: dict, path: str, stream=False):
|
|
87
107
|
url = f"{self.service_url}/{path}"
|
|
88
108
|
return requests.post(
|
|
89
|
-
url=url,
|
|
109
|
+
url=url,
|
|
110
|
+
headers=self._get_headers(),
|
|
111
|
+
json=payload,
|
|
112
|
+
stream=stream,
|
|
113
|
+
verify=self._verify_ssl,
|
|
90
114
|
)
|
|
91
115
|
|
|
92
116
|
def get(self, path: str, params: dict = None):
|
|
93
117
|
url = f"{self.service_url}/{path}"
|
|
94
|
-
return requests.get(
|
|
118
|
+
return requests.get(
|
|
119
|
+
url,
|
|
120
|
+
params=params,
|
|
121
|
+
headers=self._get_headers(),
|
|
122
|
+
verify=self._verify_ssl,
|
|
123
|
+
)
|
|
95
124
|
|
|
96
125
|
|
|
97
126
|
class WXOInferenceBackend:
|
|
@@ -135,7 +164,9 @@ class WXOInferenceBackend:
|
|
|
135
164
|
else:
|
|
136
165
|
path = "v1/orchestrate/runs?stream=true"
|
|
137
166
|
|
|
138
|
-
response: requests.Response = self.wxo_client.post(
|
|
167
|
+
response: requests.Response = self.wxo_client.post(
|
|
168
|
+
payload, path, stream=True
|
|
169
|
+
)
|
|
139
170
|
import json
|
|
140
171
|
|
|
141
172
|
for chunk in self._parse_events(response):
|
|
@@ -188,7 +219,9 @@ class WXOInferenceBackend:
|
|
|
188
219
|
citations = parse_citations()
|
|
189
220
|
retrieval_context = parsed_search_results()
|
|
190
221
|
citations_title = conversational_search.get("citations_title", "")
|
|
191
|
-
response_length_option = conversational_search.get(
|
|
222
|
+
response_length_option = conversational_search.get(
|
|
223
|
+
"response_length_option", ""
|
|
224
|
+
)
|
|
192
225
|
text = conversational_search.get("text", "")
|
|
193
226
|
|
|
194
227
|
confidence_scores = ConversationalConfidenceThresholdScore(
|
|
@@ -261,7 +294,9 @@ class WXOInferenceBackend:
|
|
|
261
294
|
)
|
|
262
295
|
)
|
|
263
296
|
end_time = time.time()
|
|
264
|
-
call_tracker.tool_call.append(
|
|
297
|
+
call_tracker.tool_call.append(
|
|
298
|
+
end_time - start_time
|
|
299
|
+
)
|
|
265
300
|
start_time = end_time
|
|
266
301
|
elif step_detail["type"] == "tool_call":
|
|
267
302
|
# in step details, we could have [tool_response, tool_call]
|
|
@@ -279,7 +314,9 @@ class WXOInferenceBackend:
|
|
|
279
314
|
)
|
|
280
315
|
)
|
|
281
316
|
end_time = time.time()
|
|
282
|
-
call_tracker.tool_call.append(
|
|
317
|
+
call_tracker.tool_call.append(
|
|
318
|
+
end_time - start_time
|
|
319
|
+
)
|
|
283
320
|
start_time = end_time
|
|
284
321
|
elif step_detail["type"] == "tool_response":
|
|
285
322
|
content = json.dumps(step_detail)
|
|
@@ -293,7 +330,9 @@ class WXOInferenceBackend:
|
|
|
293
330
|
)
|
|
294
331
|
)
|
|
295
332
|
end_time = time.time()
|
|
296
|
-
call_tracker.tool_response.append(
|
|
333
|
+
call_tracker.tool_response.append(
|
|
334
|
+
end_time - start_time
|
|
335
|
+
)
|
|
297
336
|
start_time = end_time
|
|
298
337
|
elif content_field := delta.get("content"):
|
|
299
338
|
for val in content_field:
|
|
@@ -312,7 +351,9 @@ class WXOInferenceBackend:
|
|
|
312
351
|
chunk=event,
|
|
313
352
|
)
|
|
314
353
|
end_time = time.time()
|
|
315
|
-
call_tracker.generic.append(
|
|
354
|
+
call_tracker.generic.append(
|
|
355
|
+
end_time - start_time
|
|
356
|
+
)
|
|
316
357
|
start_time = end_time
|
|
317
358
|
|
|
318
359
|
# NOTE: The event here that is parsed is part of the "message.created" event
|
|
@@ -336,10 +377,14 @@ class WXOInferenceBackend:
|
|
|
336
377
|
"""
|
|
337
378
|
|
|
338
379
|
last_message = json.loads(messages[-1].content)
|
|
339
|
-
tool_call_id = last_message.get(
|
|
380
|
+
tool_call_id = last_message.get(
|
|
381
|
+
"tool_call_id", None
|
|
382
|
+
)
|
|
340
383
|
assert tool_call_id is not None
|
|
341
|
-
conversational_search_metadata =
|
|
342
|
-
|
|
384
|
+
conversational_search_metadata = (
|
|
385
|
+
ConversationSearchMetadata(
|
|
386
|
+
tool_call_id=tool_call_id
|
|
387
|
+
)
|
|
343
388
|
)
|
|
344
389
|
conversational_search = (
|
|
345
390
|
self.parse_conversational_search_response(
|
|
@@ -347,7 +392,9 @@ class WXOInferenceBackend:
|
|
|
347
392
|
metadata=conversational_search_metadata,
|
|
348
393
|
)
|
|
349
394
|
)
|
|
350
|
-
conversational_search_data.append(
|
|
395
|
+
conversational_search_data.append(
|
|
396
|
+
conversational_search
|
|
397
|
+
)
|
|
351
398
|
messages.append(
|
|
352
399
|
Message(
|
|
353
400
|
role=role,
|
|
@@ -436,7 +483,10 @@ class WXOInferenceBackend:
|
|
|
436
483
|
content = json.dumps(tool_json)
|
|
437
484
|
# TO-DO: review do we even need the get messages for retry loop anymore?
|
|
438
485
|
if msg_content := entry.get("content"):
|
|
439
|
-
if
|
|
486
|
+
if (
|
|
487
|
+
msg_content[0].get("response_type")
|
|
488
|
+
== "conversational_search"
|
|
489
|
+
):
|
|
440
490
|
continue
|
|
441
491
|
messages.append(
|
|
442
492
|
Message(
|
|
@@ -451,7 +501,9 @@ class WXOInferenceBackend:
|
|
|
451
501
|
content = json.dumps(step_detail)
|
|
452
502
|
messages.append(
|
|
453
503
|
Message(
|
|
454
|
-
role=role,
|
|
504
|
+
role=role,
|
|
505
|
+
content=content,
|
|
506
|
+
type=content_type,
|
|
455
507
|
)
|
|
456
508
|
)
|
|
457
509
|
else:
|
|
@@ -459,7 +511,9 @@ class WXOInferenceBackend:
|
|
|
459
511
|
content_type = ContentType.tool_response
|
|
460
512
|
messages.append(
|
|
461
513
|
Message(
|
|
462
|
-
role=role,
|
|
514
|
+
role=role,
|
|
515
|
+
content=content,
|
|
516
|
+
type=content_type,
|
|
463
517
|
)
|
|
464
518
|
)
|
|
465
519
|
if content_field := entry.get("content"):
|
|
@@ -468,12 +522,19 @@ class WXOInferenceBackend:
|
|
|
468
522
|
if val["response_type"] == ContentType.text:
|
|
469
523
|
messages.append(
|
|
470
524
|
Message(
|
|
471
|
-
role=role,
|
|
525
|
+
role=role,
|
|
526
|
+
content=val["text"],
|
|
527
|
+
type=ContentType.text,
|
|
472
528
|
)
|
|
473
529
|
)
|
|
474
|
-
if
|
|
475
|
-
|
|
476
|
-
|
|
530
|
+
if (
|
|
531
|
+
val["response_type"]
|
|
532
|
+
== ContentType.conversational_search
|
|
533
|
+
):
|
|
534
|
+
conversational_search_metadata = (
|
|
535
|
+
ConversationSearchMetadata(
|
|
536
|
+
tool_call_id=tool_call_id
|
|
537
|
+
)
|
|
477
538
|
)
|
|
478
539
|
messages.append(
|
|
479
540
|
Message(
|
|
@@ -538,8 +599,12 @@ class WXOInferenceBackend:
|
|
|
538
599
|
class EvaluationController:
|
|
539
600
|
|
|
540
601
|
MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
|
|
541
|
-
MESSAGE_SIMILARITY_THRESHOLD = float(
|
|
542
|
-
|
|
602
|
+
MESSAGE_SIMILARITY_THRESHOLD = float(
|
|
603
|
+
os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)
|
|
604
|
+
) # if any two consecutive messages are >98% similar, the inference loop will be terminated
|
|
605
|
+
MAX_REPEATING_MESSAGES = int(
|
|
606
|
+
os.getenv("MAX_REPEATING_MESSAGES", 3)
|
|
607
|
+
) # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
|
|
543
608
|
|
|
544
609
|
def __init__(
|
|
545
610
|
self,
|
|
@@ -554,11 +619,20 @@ class EvaluationController:
|
|
|
554
619
|
|
|
555
620
|
if self.repeating_output_detection:
|
|
556
621
|
# Use deque for efficient O(1) operations
|
|
557
|
-
self.recent_user_messages = deque(
|
|
558
|
-
|
|
622
|
+
self.recent_user_messages = deque(
|
|
623
|
+
maxlen=self.MAX_REPEATING_MESSAGES
|
|
624
|
+
)
|
|
625
|
+
self.recent_assistant_messages = deque(
|
|
626
|
+
maxlen=self.MAX_REPEATING_MESSAGES
|
|
627
|
+
)
|
|
559
628
|
|
|
560
629
|
def run(
|
|
561
|
-
self,
|
|
630
|
+
self,
|
|
631
|
+
task_n,
|
|
632
|
+
story,
|
|
633
|
+
agent_name: str,
|
|
634
|
+
starting_user_input: str = None,
|
|
635
|
+
attack_instructions: str = None,
|
|
562
636
|
) -> Tuple[List[Message], List[CallTracker], List[ConversationalSearch]]:
|
|
563
637
|
step = 0
|
|
564
638
|
thread_id = None
|
|
@@ -570,7 +644,9 @@ class EvaluationController:
|
|
|
570
644
|
while step < self.MAX_CONVERSATION_STEPS:
|
|
571
645
|
if step == 0 and starting_user_input:
|
|
572
646
|
user_input = Message(
|
|
573
|
-
role="user",
|
|
647
|
+
role="user",
|
|
648
|
+
content=starting_user_input,
|
|
649
|
+
type=ContentType.text,
|
|
574
650
|
)
|
|
575
651
|
else:
|
|
576
652
|
if self.config.enable_manual_user_input == True:
|
|
@@ -582,7 +658,9 @@ class EvaluationController:
|
|
|
582
658
|
)
|
|
583
659
|
else: # llm
|
|
584
660
|
user_input = self.llm_user.generate_user_input(
|
|
585
|
-
story,
|
|
661
|
+
story,
|
|
662
|
+
conversation_history,
|
|
663
|
+
attack_instructions=attack_instructions,
|
|
586
664
|
)
|
|
587
665
|
if self.config.enable_verbose_logging:
|
|
588
666
|
rich.print(
|
|
@@ -592,26 +670,33 @@ class EvaluationController:
|
|
|
592
670
|
|
|
593
671
|
if self._is_end(user_input):
|
|
594
672
|
break
|
|
595
|
-
|
|
673
|
+
|
|
596
674
|
if self.repeating_output_detection:
|
|
597
675
|
self.recent_user_messages.append(user_input.content)
|
|
598
676
|
|
|
599
677
|
conversation_history.append(user_input)
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
678
|
+
|
|
679
|
+
(
|
|
680
|
+
messages,
|
|
681
|
+
thread_id,
|
|
682
|
+
conversational_search_data,
|
|
683
|
+
) = self.wxo_inference_backend.stream_messages(
|
|
684
|
+
user_input,
|
|
685
|
+
agent_name=agent_name,
|
|
686
|
+
thread_id=thread_id,
|
|
687
|
+
call_tracker=call_tracker,
|
|
608
688
|
)
|
|
609
689
|
if not messages:
|
|
610
|
-
raise RuntimeError(
|
|
611
|
-
|
|
690
|
+
raise RuntimeError(
|
|
691
|
+
f"[Task-{task_n}] No messages is produced. Exiting task."
|
|
692
|
+
)
|
|
693
|
+
|
|
612
694
|
for message in messages:
|
|
613
695
|
if self.repeating_output_detection:
|
|
614
|
-
if
|
|
696
|
+
if (
|
|
697
|
+
message.role == Roles.ASSISTANT
|
|
698
|
+
and message.type == ContentType.text
|
|
699
|
+
):
|
|
615
700
|
self.recent_assistant_messages.append(message.content)
|
|
616
701
|
|
|
617
702
|
if self.config.enable_verbose_logging:
|
|
@@ -621,11 +706,17 @@ class EvaluationController:
|
|
|
621
706
|
)
|
|
622
707
|
|
|
623
708
|
conversation_history.extend(messages)
|
|
624
|
-
conversational_search_history_data.extend(
|
|
625
|
-
|
|
709
|
+
conversational_search_history_data.extend(
|
|
710
|
+
conversational_search_data
|
|
711
|
+
)
|
|
712
|
+
|
|
626
713
|
step += 1
|
|
627
|
-
return
|
|
628
|
-
|
|
714
|
+
return (
|
|
715
|
+
conversation_history,
|
|
716
|
+
call_tracker,
|
|
717
|
+
conversational_search_history_data,
|
|
718
|
+
)
|
|
719
|
+
|
|
629
720
|
def _is_looping(self, messages: deque) -> bool:
|
|
630
721
|
"""Checks whether the user or assistant is stuck in a loop.
|
|
631
722
|
Args:
|
|
@@ -634,7 +725,7 @@ class EvaluationController:
|
|
|
634
725
|
bool: True if stuck in a loop, False otherwise.
|
|
635
726
|
"""
|
|
636
727
|
sim_count = 0
|
|
637
|
-
|
|
728
|
+
|
|
638
729
|
if len(messages) >= self.MAX_REPEATING_MESSAGES:
|
|
639
730
|
oldest_cached_message = messages[0]
|
|
640
731
|
for i, old_message in enumerate(messages):
|
|
@@ -642,11 +733,16 @@ class EvaluationController:
|
|
|
642
733
|
continue
|
|
643
734
|
if oldest_cached_message == old_message:
|
|
644
735
|
sim_count += 1
|
|
645
|
-
elif
|
|
736
|
+
elif (
|
|
737
|
+
calculate_word_overlap_similarity_score(
|
|
738
|
+
oldest_cached_message, old_message
|
|
739
|
+
)
|
|
740
|
+
> self.MESSAGE_SIMILARITY_THRESHOLD
|
|
741
|
+
):
|
|
646
742
|
sim_count += 1
|
|
647
|
-
|
|
743
|
+
|
|
648
744
|
return sim_count >= self.MAX_REPEATING_MESSAGES - 1
|
|
649
|
-
|
|
745
|
+
|
|
650
746
|
def _is_end(self, current_user_input: Message) -> bool:
|
|
651
747
|
"""
|
|
652
748
|
Check if the user input indicates the end of the conversation.
|
|
@@ -664,27 +760,35 @@ class EvaluationController:
|
|
|
664
760
|
# Check if the user message contains 'END'
|
|
665
761
|
if "END" in current_user_message_content:
|
|
666
762
|
return True
|
|
667
|
-
|
|
763
|
+
|
|
668
764
|
if self.repeating_output_detection:
|
|
669
765
|
# Check for repeating user or assistant messages
|
|
670
|
-
if
|
|
671
|
-
self.
|
|
766
|
+
if self._is_looping(self.recent_user_messages) or self._is_looping(
|
|
767
|
+
self.recent_assistant_messages
|
|
768
|
+
):
|
|
672
769
|
return True
|
|
673
|
-
|
|
674
|
-
return False
|
|
770
|
+
|
|
771
|
+
return False # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
|
|
772
|
+
|
|
675
773
|
|
|
676
774
|
def get_wxo_client(
|
|
677
|
-
service_url: str, tenant_name: str, token: str = None
|
|
775
|
+
service_url: Optional[str], tenant_name: str, token: Optional[str] = None
|
|
678
776
|
) -> WXOClient:
|
|
679
|
-
if not token:
|
|
680
|
-
token = tenant_setup(service_url, tenant_name)
|
|
681
|
-
wxo_client = WXOClient(service_url=service_url, api_key=token)
|
|
682
|
-
return wxo_client
|
|
683
777
|
|
|
778
|
+
token, resolved_url, env = tenant_setup(service_url, tenant_name)
|
|
779
|
+
service_url = service_url or resolved_url
|
|
780
|
+
|
|
781
|
+
if not (service_url and str(service_url).strip()):
|
|
782
|
+
raise ValueError(f"service_url not provided and not found in config for tenant '{tenant_name}'")
|
|
783
|
+
|
|
784
|
+
wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
|
|
785
|
+
return wxo_client
|
|
684
786
|
|
|
685
787
|
if __name__ == "__main__":
|
|
686
788
|
wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
|
|
687
|
-
auth_config_path =
|
|
789
|
+
auth_config_path = (
|
|
790
|
+
f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
|
|
791
|
+
)
|
|
688
792
|
with open(auth_config_path, "r") as f:
|
|
689
793
|
auth_config = yaml.safe_load(f)
|
|
690
794
|
tenant_name = "local"
|
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
from
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
2
3
|
from wxo_agentic_evaluation.prompt.template_render import (
|
|
3
4
|
KeywordMatchingTemplateRenderer,
|
|
4
5
|
SemanticMatchingTemplateRenderer,
|
|
5
6
|
)
|
|
6
|
-
from
|
|
7
|
+
from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
class LLMMatcher:
|
|
@@ -26,7 +27,7 @@ class LLMMatcher:
|
|
|
26
27
|
prompt = self.keyword_template.render(
|
|
27
28
|
keywords_text=keywords_text, response_text=response_text
|
|
28
29
|
)
|
|
29
|
-
output:str = self.llm_client.query(prompt)
|
|
30
|
+
output: str = self.llm_client.query(prompt)
|
|
30
31
|
result = output.strip().lower()
|
|
31
32
|
return result.startswith("true")
|
|
32
33
|
|
|
@@ -1,12 +1,15 @@
|
|
|
1
|
-
from typing import List
|
|
2
1
|
import json
|
|
2
|
+
from typing import List
|
|
3
3
|
|
|
4
|
-
from wxo_agentic_evaluation.
|
|
4
|
+
from wxo_agentic_evaluation.metrics.llm_as_judge import (
|
|
5
|
+
AnswerRelevancy,
|
|
6
|
+
Faithfulness,
|
|
7
|
+
)
|
|
5
8
|
from wxo_agentic_evaluation.prompt.template_render import (
|
|
6
|
-
FaithfulnessTemplateRenderer,
|
|
7
9
|
AnswerRelevancyTemplateRenderer,
|
|
10
|
+
FaithfulnessTemplateRenderer,
|
|
8
11
|
)
|
|
9
|
-
from wxo_agentic_evaluation.
|
|
12
|
+
from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
|
|
10
13
|
|
|
11
14
|
|
|
12
15
|
class LLMJudge:
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
from typing import List, TypeVar
|
|
2
|
-
|
|
3
|
-
from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
|
|
2
|
+
|
|
4
3
|
from wxo_agentic_evaluation.prompt.template_render import JinjaTemplateRenderer
|
|
4
|
+
from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
|
|
5
|
+
from wxo_agentic_evaluation.type import ContentType, Message
|
|
5
6
|
|
|
6
7
|
T = TypeVar("T", bound=JinjaTemplateRenderer)
|
|
7
8
|
|
|
@@ -17,7 +18,10 @@ class LLMUser:
|
|
|
17
18
|
)
|
|
18
19
|
|
|
19
20
|
def generate_user_input(
|
|
20
|
-
self,
|
|
21
|
+
self,
|
|
22
|
+
user_story,
|
|
23
|
+
conversation_history: List[Message],
|
|
24
|
+
attack_instructions: str = None,
|
|
21
25
|
) -> Message | None:
|
|
22
26
|
# the tool response is already summarized, we don't need that to take over the chat history context window
|
|
23
27
|
prompt_input = self.prompt_template.render(
|