ibm-watsonx-orchestrate-evaluation-framework 1.1.6__py3-none-any.whl → 1.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +42 -36
- wxo_agentic_evaluation/analyze_run.py +49 -32
- wxo_agentic_evaluation/arg_configs.py +30 -2
- wxo_agentic_evaluation/data_annotator.py +22 -4
- wxo_agentic_evaluation/description_quality_checker.py +20 -4
- wxo_agentic_evaluation/evaluation_package.py +189 -15
- wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
- wxo_agentic_evaluation/external_agent/types.py +1 -1
- wxo_agentic_evaluation/inference_backend.py +64 -34
- wxo_agentic_evaluation/llm_matching.py +92 -2
- wxo_agentic_evaluation/llm_user.py +2 -2
- wxo_agentic_evaluation/main.py +147 -38
- wxo_agentic_evaluation/metrics/__init__.py +5 -1
- wxo_agentic_evaluation/metrics/evaluations.py +124 -0
- wxo_agentic_evaluation/metrics/metrics.py +24 -3
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/template_render.py +16 -0
- wxo_agentic_evaluation/quick_eval.py +17 -3
- wxo_agentic_evaluation/record_chat.py +17 -6
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +44 -14
- wxo_agentic_evaluation/red_teaming/attack_generator.py +31 -12
- wxo_agentic_evaluation/red_teaming/attack_list.py +23 -24
- wxo_agentic_evaluation/red_teaming/attack_runner.py +36 -19
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +42 -16
- wxo_agentic_evaluation/service_instance.py +5 -3
- wxo_agentic_evaluation/service_provider/__init__.py +129 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
- wxo_agentic_evaluation/type.py +14 -4
- wxo_agentic_evaluation/utils/__init__.py +43 -5
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/utils.py +14 -9
- wxo_agentic_evaluation/wxo_client.py +2 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0
|
@@ -1,9 +1,47 @@
|
|
|
1
1
|
import json
|
|
2
|
-
|
|
3
|
-
|
|
2
|
+
import os
|
|
3
|
+
import tempfile
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from wxo_agentic_evaluation.utils.open_ai_tool_extractor import (
|
|
7
|
+
ToolExtractionOpenAIFormat,
|
|
8
|
+
)
|
|
4
9
|
from wxo_agentic_evaluation.utils.parsers import ReferencelessEvalParser
|
|
10
|
+
from wxo_agentic_evaluation.utils.utils import (
|
|
11
|
+
N_A,
|
|
12
|
+
TestCaseResources,
|
|
13
|
+
add_line_seperator,
|
|
14
|
+
list_run_files,
|
|
15
|
+
load_run_metrics,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def json_dump(output_path, obj):
    """
    Atomically dump JSON to `output_path`.

    - Writes to a temporary file in the destination directory first
    - Then atomically replaces the target file with `os.replace`
    - Prevents corrupted/half-written JSON if the process is interrupted

    Args:
        output_path: Destination path (str or path-like). Missing parent
            directories are created.
        obj: Object to serialize with `json.dump` (indent=4, non-ASCII
            characters written as-is, UTF-8 encoded).

    Raises:
        Whatever `json.dump` or the underlying file operations raise. On any
        failure the temporary file is removed (best effort) and an existing
        `output_path` is left untouched.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # The temp file must live in the same directory as the target so that
    # os.replace() is a same-filesystem rename.
    # NOTE(review): mkstemp creates the file readable/writable by the owner
    # only, so the final file may end up with tighter permissions than a
    # plain open() would give — confirm this is acceptable for consumers.
    fd, tmp_path = tempfile.mkstemp(
        dir=output_path.parent,
        prefix=output_path.stem,
        suffix=".tmp",
        text=True,
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(obj, f, indent=4, ensure_ascii=False)
            # Push the data to disk before the rename makes it visible.
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp_path, output_path)
    except BaseException:
        # BaseException (not just Exception) so that KeyboardInterrupt and
        # SystemExit also clean up the temp file instead of leaking it.
        try:
            os.remove(tmp_path)
        except OSError:
            # Best effort: the temp file may already be gone.
            pass
        raise
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Evaluation discovery mechanism.
|
|
3
|
+
|
|
4
|
+
This module provides functionality for discovering classes that inherit from Evaluation.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import importlib.util
|
|
8
|
+
import inspect
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def find_evaluation_subclasses(directory: str, base_class_name="Evaluation"):
    """
    Dynamically import Python files under 'directory' and find classes that
    inherit from a class named `base_class_name` (default 'Evaluation').

    Files whose names start with "__" (e.g. ``__init__.py``) are ignored.
    Files that raise an ``Exception`` while being imported are reported with
    ``print`` and skipped. The base class is matched by name against the
    class's ancestors, so the class named `base_class_name` itself is not
    returned unless it also inherits from a same-named class.

    Args:
        directory: Root directory that is walked recursively.
        base_class_name: ``__name__`` of the base class to look for.

    Returns:
        A list of unique, non-abstract class objects, in deterministic order
        (directories and files are visited in sorted order).
    """
    subclasses = []
    # ids of classes already collected; the classes stay alive in
    # `subclasses`, so their ids cannot be recycled while we are running.
    seen_ids = set()

    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a stable order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".py") or file.startswith("__"):
                continue

            filepath = os.path.join(root, file)
            module_name = os.path.splitext(os.path.basename(filepath))[0]

            spec = importlib.util.spec_from_file_location(
                module_name, filepath
            )
            if not (spec and spec.loader):
                continue

            module = importlib.util.module_from_spec(spec)
            try:
                spec.loader.exec_module(module)
            except Exception as e:
                print(f"Skipping {filepath} due to import error: {e}")
                continue

            # Inspect for subclasses. getmembers() yields one entry per bound
            # name, so an aliased class would otherwise be collected twice.
            for _, obj in inspect.getmembers(module, inspect.isclass):
                if id(obj) in seen_ids:
                    continue
                inherits_base = any(
                    base.__name__ == base_class_name
                    for base in obj.__mro__[1:]
                )
                if inherits_base and not inspect.isabstract(obj):
                    seen_ids.add(id(obj))
                    subclasses.append(obj)

    return subclasses
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from functools import lru_cache
|
|
3
|
+
|
|
4
|
+
from wxo_agentic_evaluation.arg_configs import AuthConfig
|
|
5
|
+
from wxo_agentic_evaluation.service_provider import USE_GATEWAY_MODEL_PROVIDER
|
|
6
|
+
from wxo_agentic_evaluation.wxo_client import get_wxo_client
|
|
7
|
+
|
|
8
|
+
# Default WXO connection settings. The environment variables WXO_URL,
# WXO_TENANT and WXO_TOKEN are read once, when this module is imported;
# unset variables fall back to the literal defaults given below.
WXO_AUTH_CONFIG_DEFAULTS = AuthConfig(
    url=os.getenv("WXO_URL", "http://localhost:4321"),
    tenant_name=os.getenv("WXO_TENANT", "wxo-dev"),
    token=os.getenv("WXO_TOKEN", None),
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@lru_cache(maxsize=1)
def _get_cached_wxo_client():
    """
    Return the WXO client built from `WXO_AUTH_CONFIG_DEFAULTS`.

    The function takes no arguments and is wrapped in `lru_cache`, so
    `get_wxo_client` is invoked on the first call and the same result is
    handed back afterwards.
    """
    # TODO: remove this once the client is implemented as a Singleton.
    defaults = WXO_AUTH_CONFIG_DEFAULTS
    url = defaults.url
    tenant = defaults.tenant_name
    token = defaults.token
    return get_wxo_client(url, tenant, token)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_provider_kwargs(**base_kwargs: dict) -> dict:
    """
    Return provider keyword arguments, adding gateway credentials if needed.

    `base_kwargs` is returned unchanged when the gateway model provider is
    disabled, or when the caller already supplied both "instance_url" and
    "token". Otherwise a new dict is returned in which "instance_url" and
    "token" are set from the cached WXO client's `service_url` and `api_key`
    (replacing either key if only one of them was supplied).
    """
    caller_has_credentials = (
        "instance_url" in base_kwargs and "token" in base_kwargs
    )
    if caller_has_credentials or not USE_GATEWAY_MODEL_PROVIDER:
        return base_kwargs

    client = _get_cached_wxo_client()

    merged = dict(base_kwargs)
    merged["instance_url"] = client.service_url
    merged["token"] = client.api_key
    return merged
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
from wxo_agentic_evaluation.type import ContentType, Message
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ParsedMessages(BaseModel):
    """
    A parsed history of messages.

    Offers convenience accessors for the first user text message and the
    last assistant text message in `messages`.
    """

    messages: list[Message] = Field(description="The list of messages")

    @property
    def user_input(self) -> Optional[str]:
        """Content of the first text message with role "user", else None."""
        return next(
            (
                str(msg.content)
                for msg in self.messages
                if msg.role == "user" and msg.type == ContentType.text
            ),
            None,
        )

    @property
    def agent_response(self) -> Optional[str]:
        """Content of the last text message with role "assistant", else None."""
        return next(
            (
                str(msg.content)
                for msg in reversed(self.messages)
                if msg.role == "assistant" and msg.type == ContentType.text
            ),
            None,
        )
|
|
@@ -1,15 +1,15 @@
|
|
|
1
|
+
import csv
|
|
1
2
|
import glob
|
|
2
3
|
import json
|
|
3
4
|
import os
|
|
4
5
|
import re
|
|
5
|
-
import csv
|
|
6
6
|
from collections import defaultdict
|
|
7
7
|
from pathlib import Path
|
|
8
|
-
from typing import
|
|
8
|
+
from typing import Any, List, Mapping, Optional, Tuple, Union
|
|
9
9
|
from urllib.parse import urlparse
|
|
10
10
|
|
|
11
|
-
import yaml
|
|
12
11
|
import rich
|
|
12
|
+
import yaml
|
|
13
13
|
from rich import box, print
|
|
14
14
|
from rich.console import Console, Group
|
|
15
15
|
from rich.panel import Panel
|
|
@@ -19,15 +19,15 @@ from rich.table import Table
|
|
|
19
19
|
|
|
20
20
|
from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness
|
|
21
21
|
from wxo_agentic_evaluation.metrics.metrics import (
|
|
22
|
+
EnhancedAnalyzeMetrics,
|
|
22
23
|
KnowledgeBaseMetricSummary,
|
|
23
24
|
ReferenceLessEvalMetrics,
|
|
24
25
|
ToolCallAndRoutingMetrics,
|
|
25
|
-
EnhancedAnalyzeMetrics,
|
|
26
26
|
)
|
|
27
27
|
from wxo_agentic_evaluation.type import (
|
|
28
28
|
ConversationalConfidenceThresholdScore,
|
|
29
|
-
Message,
|
|
30
29
|
ExtendedMessage,
|
|
30
|
+
Message,
|
|
31
31
|
)
|
|
32
32
|
|
|
33
33
|
console = Console()
|
|
@@ -37,6 +37,7 @@ RUN_FILE_RE = re.compile(
|
|
|
37
37
|
)
|
|
38
38
|
N_A = "N/A"
|
|
39
39
|
|
|
40
|
+
|
|
40
41
|
class AttackResultsTable:
|
|
41
42
|
def __init__(self, attack_results: dict):
|
|
42
43
|
self.table = Table(
|
|
@@ -164,9 +165,11 @@ class TestCaseResources:
|
|
|
164
165
|
|
|
165
166
|
|
|
166
167
|
class AgentMetricsTable:
|
|
167
|
-
def __init__(self, data):
|
|
168
|
+
def __init__(self, data, title: Optional[str] = None):
|
|
169
|
+
if title is None:
|
|
170
|
+
title = "Agent Metrics"
|
|
168
171
|
self.table = Table(
|
|
169
|
-
title=
|
|
172
|
+
title=title,
|
|
170
173
|
box=box.ROUNDED,
|
|
171
174
|
show_lines=True,
|
|
172
175
|
)
|
|
@@ -187,7 +190,9 @@ class AgentMetricsTable:
|
|
|
187
190
|
console.print(self.table)
|
|
188
191
|
|
|
189
192
|
|
|
190
|
-
def create_table(
|
|
193
|
+
def create_table(
|
|
194
|
+
data: List[dict], title: Optional[str] = None
|
|
195
|
+
) -> AgentMetricsTable:
|
|
191
196
|
"""
|
|
192
197
|
Generate a Rich table from a list of dictionaries.
|
|
193
198
|
Returns the AgentMetricsTable instance.
|
|
@@ -199,7 +204,7 @@ def create_table(data: List[dict]) -> AgentMetricsTable:
|
|
|
199
204
|
print("create_table() received an empty dataset. No table generated.")
|
|
200
205
|
return None
|
|
201
206
|
|
|
202
|
-
return AgentMetricsTable(data)
|
|
207
|
+
return AgentMetricsTable(data, title=title)
|
|
203
208
|
|
|
204
209
|
|
|
205
210
|
def safe_divide(nom, denom):
|
|
File without changes
|