ibm-watsonx-orchestrate-evaluation-framework 1.1.6__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (42)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +42 -36
  3. wxo_agentic_evaluation/analyze_run.py +49 -32
  4. wxo_agentic_evaluation/arg_configs.py +30 -2
  5. wxo_agentic_evaluation/data_annotator.py +22 -4
  6. wxo_agentic_evaluation/description_quality_checker.py +20 -4
  7. wxo_agentic_evaluation/evaluation_package.py +189 -15
  8. wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
  9. wxo_agentic_evaluation/external_agent/types.py +1 -1
  10. wxo_agentic_evaluation/inference_backend.py +64 -34
  11. wxo_agentic_evaluation/llm_matching.py +92 -2
  12. wxo_agentic_evaluation/llm_user.py +2 -2
  13. wxo_agentic_evaluation/main.py +147 -38
  14. wxo_agentic_evaluation/metrics/__init__.py +5 -1
  15. wxo_agentic_evaluation/metrics/evaluations.py +124 -0
  16. wxo_agentic_evaluation/metrics/metrics.py +24 -3
  17. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  18. wxo_agentic_evaluation/prompt/template_render.py +16 -0
  19. wxo_agentic_evaluation/quick_eval.py +17 -3
  20. wxo_agentic_evaluation/record_chat.py +17 -6
  21. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +44 -14
  22. wxo_agentic_evaluation/red_teaming/attack_generator.py +31 -12
  23. wxo_agentic_evaluation/red_teaming/attack_list.py +23 -24
  24. wxo_agentic_evaluation/red_teaming/attack_runner.py +36 -19
  25. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +42 -16
  26. wxo_agentic_evaluation/service_instance.py +5 -3
  27. wxo_agentic_evaluation/service_provider/__init__.py +129 -9
  28. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  29. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
  30. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  31. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  32. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  33. wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
  34. wxo_agentic_evaluation/type.py +14 -4
  35. wxo_agentic_evaluation/utils/__init__.py +43 -5
  36. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  37. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  38. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  39. wxo_agentic_evaluation/utils/utils.py +14 -9
  40. wxo_agentic_evaluation/wxo_client.py +2 -1
  41. {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
  42. {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,47 @@
1
1
  import json
2
- from wxo_agentic_evaluation.utils.utils import TestCaseResources, add_line_seperator, list_run_files, load_run_metrics, N_A
3
- from wxo_agentic_evaluation.utils.open_ai_tool_extractor import ToolExtractionOpenAIFormat
2
+ import os
3
+ import tempfile
4
+ from pathlib import Path
5
+
6
+ from wxo_agentic_evaluation.utils.open_ai_tool_extractor import (
7
+ ToolExtractionOpenAIFormat,
8
+ )
4
9
  from wxo_agentic_evaluation.utils.parsers import ReferencelessEvalParser
10
+ from wxo_agentic_evaluation.utils.utils import (
11
+ N_A,
12
+ TestCaseResources,
13
+ add_line_seperator,
14
+ list_run_files,
15
+ load_run_metrics,
16
+ )
17
+
18
+
19
def json_dump(output_path, obj):
    """
    Atomically dump JSON to `output_path`.

    - Creates the parent directory if it does not exist
    - Writes to a temporary file in the same directory first
    - Then atomically replaces the target file with `os.replace`
    - Prevents corrupted/half-written JSON if the process is interrupted

    Args:
        output_path: Destination path (``str`` or ``Path``).
        obj: JSON-serializable object to write.

    Raises:
        TypeError / ValueError: If `obj` cannot be serialized by ``json.dump``.
        OSError: If the directory or file cannot be created or written.

    NOTE: `tempfile.mkstemp` creates the file readable and writable only by
    the creating user, so the final file keeps those permissions after the
    replace.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # The temp file must live in the target directory: os.replace is only
    # atomic when source and destination are on the same filesystem.
    fd, tmp_path = tempfile.mkstemp(
        dir=output_path.parent,
        prefix=output_path.stem,
        suffix=".tmp",
        text=True,
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(obj, f, indent=4, ensure_ascii=False)
            # Push the data to disk before the rename makes it visible.
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp_path, output_path)
    except BaseException:
        # BaseException (not just Exception) so that KeyboardInterrupt and
        # SystemExit also remove the temp file instead of leaving an orphan
        # "*.tmp" next to the target.
        try:
            os.remove(tmp_path)
        except OSError:
            # Best-effort cleanup; the original error is what matters.
            pass
        raise
@@ -0,0 +1,47 @@
1
+ """
2
+ Evaluation discovery mechanism.
3
+
4
+ This module provides functionality for discovering classes that inherit from Evaluation.
5
+ """
6
+
7
+ import importlib.util
8
+ import inspect
9
+ import os
10
+
11
+
12
def find_evaluation_subclasses(directory: str, base_class_name="Evaluation"):
    """
    Dynamically import Python files under 'directory' and find classes that
    inherit from a class named `base_class_name` (default 'Evaluation').

    Only classes *defined in* a scanned file are returned. Classes that a file
    merely imports are ignored, so the same class is not reported once per
    importing file. Files and sub-directories are visited in sorted order so
    the result order is deterministic.

    Files whose name starts with "__" (e.g. ``__init__.py``) are skipped, and
    files that raise on import are reported with a message and skipped.

    NOTE(security): every matching ``.py`` file is executed on import. Only
    point this at directories whose contents are trusted.

    Args:
        directory: Root directory to walk recursively.
        base_class_name: Name of the base class to look for in each class's MRO.

    Returns:
        A list of non-abstract class objects.
    """
    subclasses = []

    for root, dirs, files in os.walk(directory):
        # Sorting `dirs` in place steers os.walk's traversal order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".py") or file.startswith("__"):
                continue

            filepath = os.path.join(root, file)
            module_name = os.path.splitext(file)[0]

            spec = importlib.util.spec_from_file_location(module_name, filepath)
            if not (spec and spec.loader):
                continue

            module = importlib.util.module_from_spec(spec)
            try:
                spec.loader.exec_module(module)
            except Exception as e:
                print(f"Skipping {filepath} due to import error: {e}")
                continue

            # Inspect for subclasses
            for _, obj in inspect.getmembers(module, inspect.isclass):
                # Skip classes that were only imported into this module.
                if obj.__module__ != module.__name__:
                    continue
                if inspect.isabstract(obj):
                    continue
                # __mro__[1:] excludes the class itself, so the base class
                # is never reported as its own subclass.
                if any(
                    base.__name__ == base_class_name
                    for base in obj.__mro__[1:]
                ):
                    subclasses.append(obj)

    return subclasses
@@ -0,0 +1,39 @@
1
+ import os
2
+ from functools import lru_cache
3
+
4
+ from wxo_agentic_evaluation.arg_configs import AuthConfig
5
+ from wxo_agentic_evaluation.service_provider import USE_GATEWAY_MODEL_PROVIDER
6
+ from wxo_agentic_evaluation.wxo_client import get_wxo_client
7
+
8
# Default auth settings, read once at import time from the WXO_URL,
# WXO_TENANT and WXO_TOKEN environment variables. The token has no fallback
# and is None when WXO_TOKEN is unset.
WXO_AUTH_CONFIG_DEFAULTS = AuthConfig(
    url=os.environ.get("WXO_URL", "http://localhost:4321"),
    tenant_name=os.environ.get("WXO_TENANT", "wxo-dev"),
    token=os.environ.get("WXO_TOKEN"),
)
13
+
14
+
15
@lru_cache(maxsize=1)
def _get_cached_wxo_client():
    """Return the WXO client built from `WXO_AUTH_CONFIG_DEFAULTS`.

    The function takes no arguments, so `lru_cache(maxsize=1)` memoizes the
    single result: the client is created on the first successful call and
    reused afterwards.
    """
    # TODO: remove this once the client is implemented as a Singleton.
    defaults = WXO_AUTH_CONFIG_DEFAULTS
    return get_wxo_client(defaults.url, defaults.tenant_name, defaults.token)
23
+
24
+
25
def get_provider_kwargs(**base_kwargs: dict) -> dict:
    """Return provider keyword arguments, adding gateway credentials if needed.

    `base_kwargs` is returned unchanged when the gateway model provider is
    disabled, or when the caller already supplied both "instance_url" and
    "token". Otherwise a new dict is returned in which "instance_url" and
    "token" are taken from the cached WXO client. If only one of the two keys
    was supplied, it is overwritten as well.
    """
    has_both_credentials = (
        "instance_url" in base_kwargs and "token" in base_kwargs
    )
    if not USE_GATEWAY_MODEL_PROVIDER or has_both_credentials:
        return base_kwargs

    client = _get_cached_wxo_client()

    enriched = dict(base_kwargs)
    enriched["instance_url"] = client.service_url
    enriched["token"] = client.api_key
    return enriched
@@ -0,0 +1,30 @@
1
+ from typing import Optional
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from wxo_agentic_evaluation.type import ContentType, Message
6
+
7
+
8
class ParsedMessages(BaseModel):
    """
    A parsed history of messages, with helpers to pull out the first user
    text message and the latest assistant text message.
    """

    messages: list[Message] = Field(description="The list of messages")

    @property
    def user_input(self) -> Optional[str]:
        """Content of the first text message with role "user", else None."""
        first_user_text = next(
            (
                msg
                for msg in self.messages
                if msg.role == "user" and msg.type == ContentType.text
            ),
            None,
        )
        if first_user_text is None:
            return None
        return str(first_user_text.content)

    @property
    def agent_response(self) -> Optional[str]:
        """Content of the last text message with role "assistant", else None."""
        last_assistant_text = next(
            (
                msg
                for msg in reversed(self.messages)
                if msg.role == "assistant" and msg.type == ContentType.text
            ),
            None,
        )
        if last_assistant_text is None:
            return None
        return str(last_assistant_text.content)
@@ -1,15 +1,15 @@
1
+ import csv
1
2
  import glob
2
3
  import json
3
4
  import os
4
5
  import re
5
- import csv
6
6
  from collections import defaultdict
7
7
  from pathlib import Path
8
- from typing import List, Optional, Union, Mapping, Tuple, Any
8
+ from typing import Any, List, Mapping, Optional, Tuple, Union
9
9
  from urllib.parse import urlparse
10
10
 
11
- import yaml
12
11
  import rich
12
+ import yaml
13
13
  from rich import box, print
14
14
  from rich.console import Console, Group
15
15
  from rich.panel import Panel
@@ -19,15 +19,15 @@ from rich.table import Table
19
19
 
20
20
  from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness
21
21
  from wxo_agentic_evaluation.metrics.metrics import (
22
+ EnhancedAnalyzeMetrics,
22
23
  KnowledgeBaseMetricSummary,
23
24
  ReferenceLessEvalMetrics,
24
25
  ToolCallAndRoutingMetrics,
25
- EnhancedAnalyzeMetrics,
26
26
  )
27
27
  from wxo_agentic_evaluation.type import (
28
28
  ConversationalConfidenceThresholdScore,
29
- Message,
30
29
  ExtendedMessage,
30
+ Message,
31
31
  )
32
32
 
33
33
  console = Console()
@@ -37,6 +37,7 @@ RUN_FILE_RE = re.compile(
37
37
  )
38
38
  N_A = "N/A"
39
39
 
40
+
40
41
  class AttackResultsTable:
41
42
  def __init__(self, attack_results: dict):
42
43
  self.table = Table(
@@ -164,9 +165,11 @@ class TestCaseResources:
164
165
 
165
166
 
166
167
  class AgentMetricsTable:
167
- def __init__(self, data):
168
+ def __init__(self, data, title: Optional[str] = None):
169
+ if title is None:
170
+ title = "Agent Metrics"
168
171
  self.table = Table(
169
- title="Agent Metrics",
172
+ title=title,
170
173
  box=box.ROUNDED,
171
174
  show_lines=True,
172
175
  )
@@ -187,7 +190,9 @@ class AgentMetricsTable:
187
190
  console.print(self.table)
188
191
 
189
192
 
190
- def create_table(data: List[dict]) -> AgentMetricsTable:
193
+ def create_table(
194
+ data: List[dict], title: Optional[str] = None
195
+ ) -> AgentMetricsTable:
191
196
  """
192
197
  Generate a Rich table from a list of dictionaries.
193
198
  Returns the AgentMetricsTable instance.
@@ -199,7 +204,7 @@ def create_table(data: List[dict]) -> AgentMetricsTable:
199
204
  print("create_table() received an empty dataset. No table generated.")
200
205
  return None
201
206
 
202
- return AgentMetricsTable(data)
207
+ return AgentMetricsTable(data, title=title)
203
208
 
204
209
 
205
210
  def safe_divide(nom, denom):
@@ -1,8 +1,9 @@
1
1
  import os
2
+ from typing import Any, Dict, Optional
3
+
2
4
  import requests
3
5
  import urllib3
4
6
  from urllib3.exceptions import InsecureRequestWarning
5
- from typing import Dict, Any, Optional
6
7
 
7
8
  from wxo_agentic_evaluation.service_instance import tenant_setup
8
9