ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/annotate.py

@@ -1,10 +1,12 @@
-from wxo_agentic_evaluation.type import Message, EvaluationData
-from wxo_agentic_evaluation.arg_configs import TestCaseGenerationConfig
-from wxo_agentic_evaluation.data_annotator import DataAnnotator
 import json
+import os
 from pprint import pprint
+
 from jsonargparse import CLI
-
+
+from wxo_agentic_evaluation.arg_configs import TestCaseGenerationConfig
+from wxo_agentic_evaluation.data_annotator import DataAnnotator
+from wxo_agentic_evaluation.type import Message, OrchestrateDataset
 
 
 def main(config: TestCaseGenerationConfig):
@@ -15,7 +17,7 @@ def main(config: TestCaseGenerationConfig):
         messages.append(Message.model_validate(entry))
 
     with open(config.seed_data_path, "r") as f:
-        evaluation_data =
+        evaluation_data = OrchestrateDataset(**json.load(f))
 
     # Generate annonated dataset
     annotator = DataAnnotator(
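The functional change in this file is that the seed file is now parsed into OrchestrateDataset rather than EvaluationData. A minimal sketch of the new loading path, assuming the package is installed; the file name "seed_data.json" is a placeholder (annotate.main reads the real path from config.seed_data_path):

import json

from wxo_agentic_evaluation.type import OrchestrateDataset

# Placeholder seed file; the dataset model comes from the import change above.
with open("seed_data.json", "r") as f:
    evaluation_data = OrchestrateDataset(**json.load(f))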
wxo_agentic_evaluation/arg_configs.py

@@ -1,16 +1,22 @@
 import os
 from dataclasses import dataclass, field
-from
+from enum import StrEnum
+from typing import List, Optional, Union
+
 from wxo_agentic_evaluation import __file__
 
 root_dir = os.path.dirname(__file__)
-LLAMA_USER_PROMPT_PATH = os.path.join(
-
+LLAMA_USER_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "llama_user_prompt.jinja2"
+)
+KEYWORDS_GENERATION_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "keywords_generation_prompt.jinja2"
+)
 
 
 @dataclass
 class AuthConfig:
-    url: str
+    url: Optional[str] = None
     tenant_name: str = "local"
     token: str = None
 
@@ -25,7 +31,33 @@ class LLMUserConfig:
 @dataclass
 class ProviderConfig:
     model_id: str = field(default="meta-llama/llama-3-405b-instruct")
-    provider: str = field(
+    provider: str = field(
+        default_factory=lambda: (
+            "gateway"
+            if os.getenv("USE_GATEWAY_MODEL_PROVIDER", "").lower() == "true"
+            else "watsonx"
+        )
+    )
+    embedding_model_id: str = field(
+        default="sentence-transformers/all-minilm-l6-v2"
+    )
+
+
+@dataclass
+class CustomMetricsConfig:
+    paths: Optional[list[str]] = field(default=None)
+    llmaaj_config: ProviderConfig = field(default_factory=ProviderConfig)
+
+
+@dataclass
+class ExtractorsConfig:
+    paths: Optional[list[str]] = field(default=None)
+
+
+
+class ControllerConfig:
+    enable_verbose_logging: bool = True
+    enable_manual_user_input: bool = False
 
 
 @dataclass
@@ -36,16 +68,59 @@ class TestConfig:
     wxo_lite_version: str
     provider_config: ProviderConfig = field(default_factory=ProviderConfig)
     llm_user_config: LLMUserConfig = field(default_factory=LLMUserConfig)
+    custom_metrics_config: CustomMetricsConfig = field(
+        default_factory=CustomMetricsConfig
+    )
+    extractors_config: ExtractorsConfig = field(default_factory=ExtractorsConfig)
     enable_verbose_logging: bool = True
     enable_manual_user_input: bool = False
     skip_available_results: bool = False
     data_annotation_run: bool = False
     num_workers: int = 2
+    n_runs: int = 1
+    similarity_threshold: float = 0.8
+    enable_fuzzy_matching: bool = False
+    strict_topological_matching: bool = True
+    enable_recursive_search: bool = False
+    skip_legacy_evaluation: bool = False  # Skip legacy evaluation and only run user/agent simulations
+
+
+@dataclass
+class AttackConfig:
+    attack_paths: List[str]
+    output_dir: str
+    auth_config: AuthConfig
+    provider_config: ProviderConfig = field(default_factory=ProviderConfig)
+    llm_user_config: LLMUserConfig = field(default_factory=LLMUserConfig)
+    enable_verbose_logging: bool = True
+    enable_manual_user_input: bool = False
+    num_workers: int = 2
+    skip_available_results: bool = True
+
+
+@dataclass
+class AttackGeneratorConfig:
+    attacks_list: Union[List[str], str]
+    datasets_path: Union[List[str], str]
+    agents_list_or_path: Union[List[str], str]
+    target_agent_name: str
+    auth_config: AuthConfig
+    output_dir: str = None
+    max_variants: int = None
+
+
+class AnalyzeMode(StrEnum):
+    default = "default"
+    enhanced = "enhanced"
 
 
 @dataclass
 class AnalyzeConfig:
     data_path: str
+    tool_definition_path: Optional[str] = None
+    mode: str = AnalyzeMode.default
+    num_workers: int = 10
+    run: int = -1
 
 
 @dataclass
@@ -74,6 +149,12 @@ class ChatRecordingConfig:
     service_url: str = "http://localhost:4321"
     tenant_name: str = "local"
     token: str = None
+    max_retries: int = 5
+
+
+@dataclass
+class QuickEvalConfig(TestConfig):
+    tools_path: str = None
 
 
 @dataclass
@@ -83,3 +164,14 @@ class BatchAnnotateConfig:
     stories_path: str
     output_dir: str
     num_variants: int = 2
+
+
+@dataclass
+class CompareRunsConfig:
+    reference_file_location: str
+    experiment_file_location: str
+    csv_output: Optional[str] = None
+    column_stats_csv: Optional[str] = (
+        "column_by_column_summary_stats_comparison.csv"
+    )
+    verbose: bool = False
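A short sketch of how the new configuration pieces compose, assuming the package is importable; the paths and worker count are placeholder values, and the gateway/watsonx switch follows the USE_GATEWAY_MODEL_PROVIDER default_factory added above:

import os

from wxo_agentic_evaluation.arg_configs import (
    AnalyzeConfig,
    AnalyzeMode,
    CompareRunsConfig,
    ProviderConfig,
)

# ProviderConfig.provider resolves at construction time: "gateway" only when the
# env var is the string "true" (case-insensitive), otherwise "watsonx".
os.environ["USE_GATEWAY_MODEL_PROVIDER"] = "true"
print(ProviderConfig().provider)  # gateway

analyze = AnalyzeConfig(data_path="results/", mode=AnalyzeMode.enhanced, num_workers=4)
compare = CompareRunsConfig(
    reference_file_location="reference_summary.csv",
    experiment_file_location="experiment_summary.csv",
    verbose=True,
)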
wxo_agentic_evaluation/base_user.py

@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+from typing import List
+
+from wxo_agentic_evaluation.type import Message
+
+
+class BaseUserSimulator(ABC):
+    """Abstract base class for user simulators."""
+
+    @abstractmethod
+    def generate_user_input(
+        self, user_story: str, conversation_history: List[Message], **kwargs
+    ) -> Message:
+        """
+        Generate user input based on the user story and conversation history.
+
+        Args:
+            user_story: The user's story or goal
+            conversation_history: List of previous messages in the conversation
+            **kwargs: Additional parameters specific to the simulator implementation
+
+        Returns:
+            Message: The generated user input message
+        """
+        pass
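A minimal sketch of a concrete simulator built on this base class. The subclass name, the scripted replies, and the assumption that Message accepts role/content keyword arguments are illustrative, not taken from the package:

from typing import List

from wxo_agentic_evaluation.base_user import BaseUserSimulator
from wxo_agentic_evaluation.type import Message


class ScriptedUserSimulator(BaseUserSimulator):
    """Hypothetical simulator that replays a fixed list of user turns."""

    def __init__(self, replies: List[str]):
        self.replies = replies

    def generate_user_input(
        self, user_story: str, conversation_history: List[Message], **kwargs
    ) -> Message:
        # Count the user turns so far and replay the next scripted reply.
        turn = sum(1 for m in conversation_history if getattr(m, "role", None) == "user")
        content = self.replies[turn] if turn < len(self.replies) else "That is all, thanks."
        return Message(role="user", content=content)  # assumes role/content fields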
wxo_agentic_evaluation/batch_annotate.py

@@ -1,22 +1,28 @@
-import json
 import ast
 import csv
+import json
 import os
 from pathlib import Path
+
 from jsonargparse import CLI
 
-from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.prompt.template_render import BatchTestCaseGeneratorTemplateRenderer
-from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
 from wxo_agentic_evaluation import __file__
+from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
+from wxo_agentic_evaluation.prompt.template_render import (
+    BatchTestCaseGeneratorTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
 
 root_dir = os.path.dirname(__file__)
-BATCH_TEST_CASE_GENERATOR_PROMPT_PATH = os.path.join(
+BATCH_TEST_CASE_GENERATOR_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "batch_testcase_prompt.jinja2"
+)
 EXAMPLE_PATH = os.path.join(root_dir, "prompt", "examples", "data_simple.json")
 
 
-def parse_tools_with_filter(
-
+def parse_tools_with_filter(
+    agent_name: str, tools_path: Path, allowed_tool_names: list[str]
+) -> tuple[dict, list[dict]]:
     if not allowed_tool_names:
         raise ValueError("Allowed tool list cannot be empty.")
 
@@ -29,7 +35,9 @@ def parse_tools_with_filter(agent_name: str, tools_path: Path, allowed_tool_name
     elif tools_path.is_dir():
         files_to_parse.extend(tools_path.glob("**/*.py"))
     else:
-        raise ValueError(
+        raise ValueError(
+            f"Tools path {tools_path} is neither a file nor directory"
+        )
 
     for file_path in files_to_parse:
         try:
@@ -41,21 +49,29 @@ def parse_tools_with_filter(agent_name: str, tools_path: Path, allowed_tool_name
             # Process only module-level functions
             for node in parsed_code.body:
                 if isinstance(node, ast.FunctionDef):
-                    tool_data.append(
-
-
-
-
+                    tool_data.append(
+                        {
+                            "Function Name": node.name,
+                            "Arguments": [arg.arg for arg in node.args.args],
+                            "Docstring": ast.get_docstring(node),
+                        }
+                    )
 
         except Exception as e:
             print(f"Warning: Failed to parse {file_path}: {str(e)}")
             continue
 
     # Filter tools based on allowed names
-    filtered_tools = [
+    filtered_tools = [
+        tool
+        for tool in tool_data
+        if tool["Function Name"] in allowed_tool_names
+    ]
 
     if not filtered_tools:
-        print(
+        print(
+            f"Warning: No matching tools found. Available tools: {[t['Function Name'] for t in tool_data]}"
+        )
 
     return {"name": agent_name}, filtered_tools
 
@@ -75,8 +91,17 @@ def load_example(example_path: Path):
 
 
 # Step 4: Prompt builder for N test cases from a given story
-def build_prompt_for_story(
-
+def build_prompt_for_story(
+    agent,
+    tools,
+    tool_inputs,
+    example_case: dict,
+    story: str,
+    num_variants: int = 2,
+):
+    renderer = BatchTestCaseGeneratorTemplateRenderer(
+        BATCH_TEST_CASE_GENERATOR_PROMPT_PATH
+    )
 
     tool_blocks = "\n".join(
         f"- Tool: {t['Function Name']}\n  Description: {t['Docstring']}\n  Args: {', '.join(t['Arguments']) or 'None'}"
@@ -93,16 +118,43 @@ def build_prompt_for_story(agent, tools, tool_inputs, example_case: dict, story:
     )
     return prompt
 
+
 # Step 5: Send prompt to LLM and save test cases
-def generate_multiple_in_one(
+def generate_multiple_in_one(
+    prompt,
+    output_dir,
+    starting_index,
+    model_id="meta-llama/llama-3-405b-instruct",
+    # model_id="gpt-4o",
+):
     output_dir.mkdir(parents=True, exist_ok=True)
 
+    # Legacy provider (e.g., watsonx)
     provider = get_provider(
         model_id=model_id,
-        params={
+        params={
+            "min_new_tokens": 50,
+            "decoding_method": "greedy",
+            "max_new_tokens": 3000,
+        },
+        use_portkey_provider=False,
     )
-
-
+    response = provider.chat(prompt).text
+
+    # # OpenAI provider
+    # provider = get_provider(provider="openai", model_id=model_id, api_key=os.getenv("OPENAI_API_KEY"))
+    # response = provider.chat(prompt).choices[0].message.content
+
+    # # Azure OpenAI provider
+    # provider = get_provider(
+    #     provider = "azure-openai",
+    #     azure_model_name = model_id,
+    #     azure_deployment_id = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
+    #     azure_resource_name = os.getenv("AZURE_OPENAI_RESOURCE_NAME"),
+    #     azure_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
+    #     api_key = f"Bearer {os.getenv('AZURE_OPENAI_API_KEY')}"
+    # )
+    # response = provider.chat(prompt).choices[0].message.content
 
     try:
         raw_text = response
@@ -124,8 +176,19 @@ def generate_multiple_in_one(prompt, output_dir, starting_index, model_id="meta-
         print("Raw text:\n", raw_text)
         print("Error:", str(e))
 
-
-
+
+def generate_test_cases_from_stories(
+    agent_name: str,
+    stories: list[str],
+    tools_path: Path,
+    snapshot_path: Path,
+    output_dir: Path,
+    allowed_tools: list[str],
+    num_variants: int = 2,
+):
+    agent, tools = parse_tools_with_filter(
+        agent_name, tools_path, allowed_tools
+    )
     tool_inputs = extract_inputs_from_snapshot(snapshot_path)
     example_json = load_example(Path(EXAMPLE_PATH))
 
@@ -134,23 +197,29 @@ def generate_test_cases_from_stories(agent_name: str, stories: list[str], tools_
         print(f"\n Generating test cases for story {idx}: {story}")
 
         prompt = build_prompt_for_story(
-            agent,
+            agent,
+            tools,
+            tool_inputs,
+            example_json,
+            story,
+            num_variants=num_variants,
         )
 
         generate_multiple_in_one(
            prompt=prompt,
            output_dir=output_dir,
-            starting_index=test_case_counter
+            starting_index=test_case_counter,
        )
 
        test_case_counter += num_variants
 
+
 def main(config: BatchAnnotateConfig):
     stories_path = Path(config.stories_path)
 
     stories = []
     agent_name = None
-    with stories_path.open("r", encoding="utf-8", newline=
+    with stories_path.open("r", encoding="utf-8", newline="") as f:
         csv_reader = csv.DictReader(f)
         for row in csv_reader:
             stories.append(row["story"])
@@ -168,8 +237,9 @@ def main(config: BatchAnnotateConfig):
         snapshot_path,
         output_dir,
         config.allowed_tools,
-        num_variants=config.num_variants
+        num_variants=config.num_variants,
     )
 
+
 if __name__ == "__main__":
     main(CLI(BatchAnnotateConfig, as_positional=False))
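A small sketch of calling the reworked helper directly; the agent name, tools directory, and tool names are placeholders, while the signature and the dict keys of each returned tool record come from the diff above:

from pathlib import Path

from wxo_agentic_evaluation.batch_annotate import parse_tools_with_filter

# Placeholders: point tools_path at a .py file or a directory of .py files.
agent, tools = parse_tools_with_filter(
    agent_name="hr_agent",
    tools_path=Path("tools/"),
    allowed_tool_names=["get_employee_profile", "request_time_off"],
)
for tool in tools:
    print(tool["Function Name"], tool["Arguments"], tool["Docstring"])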
wxo_agentic_evaluation/clients.py

@@ -0,0 +1,103 @@
+import copy
+from dataclasses import asdict, dataclass
+
+from wxo_agentic_evaluation.arg_configs import ProviderConfig, TestConfig
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaUserTemplateRenderer,
+)
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import (
+    WXORuntimeAdapter,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.service_provider.provider import Provider
+from wxo_agentic_evaluation.wxo_client import WXOClient, get_wxo_client
+
+
+@dataclass
+class Clients:
+    wxo_client: WXOClient
+    llmaaj_provider: Provider
+    resource_map: ResourceMap
+    inference_backend: WXORuntimeAdapter
+    llm_user: LLMUser
+
+
+def bootstrap_clients(config: TestConfig) -> Clients:
+    """
+    Bootstrap all clients needed for the evaluation.
+
+    Args:
+        config: The test configuration
+
+    Returns:
+        A tuple containing:
+        - wxo_client: The WXO client
+        - llmaaj_provider: The provider for custom metrics
+        - resource_map: The resource map
+        - inference_backend: The inference backend
+        - llm_user: The LLM user
+    """
+    # Initialize WXO client
+    wxo_client = get_wxo_client(
+        config.auth_config.url,
+        config.auth_config.tenant_name,
+        config.auth_config.token,
+    )
+
+    # Initialize provider for custom metrics
+    original_provider_config = config.provider_config
+    provider_config_dict = asdict(original_provider_config)
+
+    provider_kwargs = {
+        "config": ProviderConfig(**provider_config_dict),
+        "model_id": config.llm_user_config.model_id,
+    }
+
+    if provider_config_dict.get("provider", "gateway") == "gateway":
+        provider_kwargs.update(
+            token=config.auth_config.token or wxo_client.api_key,
+            instance_url=wxo_client.service_url,
+        )
+        config.auth_config.token = (
+            config.auth_config.token or wxo_client.api_key
+        )
+        config.auth_config.url = (
+            config.auth_config.url or wxo_client.service_url
+        )
+
+    # Initialize resource map
+    resource_map = ResourceMap(wxo_client)
+
+    # Initialize inference backend
+    inference_backend = WXORuntimeAdapter(wxo_client=wxo_client)
+
+    # Initialize LLM user
+    llm_user = LLMUser(
+        wai_client=get_provider(**provider_kwargs),
+        template=LlamaUserTemplateRenderer(
+            config.llm_user_config.prompt_config
+        ),
+        user_response_style=config.llm_user_config.user_response_style,
+    )
+
+    llamaj_provider_kwargs = copy.deepcopy(provider_kwargs)
+    llamaj_config_dict = asdict(llamaj_provider_kwargs["config"])
+
+    llamaj_config_dict["model_id"] = (
+        config.custom_metrics_config.llmaaj_config.model_id
+    )
+    llamaj_config_dict["embedding_model_id"] = (
+        config.custom_metrics_config.llmaaj_config.embedding_model_id
+    )
+    llamaj_provider_kwargs["config"] = ProviderConfig(**llamaj_config_dict)
+    llmaaj_provider = get_provider(**llamaj_provider_kwargs)
+
+    return Clients(
+        wxo_client=wxo_client,
+        llmaaj_provider=llmaaj_provider,
+        resource_map=resource_map,
+        inference_backend=inference_backend,
+        llm_user=llm_user,
+    )
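A sketch of wiring the new bootstrap helper into an evaluation script, assuming a fully populated TestConfig is already available (TestConfig has more required fields than the hunks above show):

from wxo_agentic_evaluation.arg_configs import TestConfig
from wxo_agentic_evaluation.clients import Clients, bootstrap_clients


def build_eval_clients(config: TestConfig) -> Clients:
    # One call resolves the WXO client, the runtime adapter, the simulated LLM user,
    # and the LLM-as-a-judge provider used by custom metrics.
    clients = bootstrap_clients(config)
    print(type(clients.inference_backend).__name__)  # WXORuntimeAdapter
    return clients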
wxo_agentic_evaluation/compare_runs/__init__.py: file without changes
wxo_agentic_evaluation/compare_runs/compare_2_runs.py

@@ -0,0 +1,74 @@
+import csv
+import os
+import statistics
+import sys
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional, Union
+
+from jsonargparse import CLI
+
+from wxo_agentic_evaluation.arg_configs import CompareRunsConfig
+from wxo_agentic_evaluation.compare_runs.diff import DiffResults
+from wxo_agentic_evaluation.compare_runs.model import EvaluationResult
+from wxo_agentic_evaluation.utils.utils import create_table, read_file
+
+
+def main(config: CompareRunsConfig):
+    """Main function to compare two run result files."""
+    # Extract values from config
+    reference_file = config.reference_file_location
+    experiment_file = config.experiment_file_location
+    csv_output = config.csv_output
+    column_stats_csv = config.column_stats_csv
+    verbose = config.verbose
+
+    try:
+        # Read the files
+        obj1 = read_file(reference_file)
+        obj2 = read_file(experiment_file)
+
+        # Create evaluation results
+        result1 = EvaluationResult.from_csv(obj1)
+        result2 = EvaluationResult.from_csv(obj2)
+
+        # Create diff results
+        diff_results = DiffResults(result1, result2)
+
+        # Display summary statistics
+        summary_stats = diff_results.summary_statistics()
+        summary_table = create_table(summary_stats, title="Summary Statistics")
+        print(
+            "\nALL metrics are computed on OVERLAPPING test cases, ie cases that exist in both the Reference and Experiment runs\n"
+        )
+        print(
+            "If Experiment - Reference is Positive, that's an increase in the metric. If Experiment - Reference is Negative, that's a decrease in the metric.\n"
+        )
+        summary_table.print()
+
+        # Display exclusive tests
+        if verbose:
+            diff_results.display_exclusive_tests()
+
+        # Display test cases with differing summary match and success status
+        diff_results.display_differing_summary_matches()
+
+        # Display tabular diff
+        diff_results.compute_tabular_diff(verbose=verbose)
+
+        # Write results to CSV if specified
+        if csv_output:
+            diff_results.to_csv(csv_output)
+
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    args = CLI(CompareRunsConfig, as_positional=False)
+    sys.exit(main(args))
+
+# Made with Bob