ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,12 @@
- from wxo_agentic_evaluation.type import Message, EvaluationData
- from wxo_agentic_evaluation.arg_configs import TestCaseGenerationConfig
- from wxo_agentic_evaluation.data_annotator import DataAnnotator
  import json
+ import os
  from pprint import pprint
+
  from jsonargparse import CLI
- import os
+
+ from wxo_agentic_evaluation.arg_configs import TestCaseGenerationConfig
+ from wxo_agentic_evaluation.data_annotator import DataAnnotator
+ from wxo_agentic_evaluation.type import Message, OrchestrateDataset


  def main(config: TestCaseGenerationConfig):
@@ -15,7 +17,7 @@ def main(config: TestCaseGenerationConfig):
  messages.append(Message.model_validate(entry))

  with open(config.seed_data_path, "r") as f:
- evaluation_data = EvaluationData(**json.load(f))
+ evaluation_data = OrchestrateDataset(**json.load(f))

  # Generate annonated dataset
  annotator = DataAnnotator(
@@ -1,16 +1,22 @@
  import os
  from dataclasses import dataclass, field
- from typing import List
+ from enum import StrEnum
+ from typing import List, Optional, Union
+
  from wxo_agentic_evaluation import __file__

  root_dir = os.path.dirname(__file__)
- LLAMA_USER_PROMPT_PATH = os.path.join(root_dir, "prompt", "llama_user_prompt.jinja2")
- KEYWORDS_GENERATION_PROMPT_PATH = os.path.join(root_dir, "prompt", "keywords_generation_prompt.jinja2")
+ LLAMA_USER_PROMPT_PATH = os.path.join(
+ root_dir, "prompt", "llama_user_prompt.jinja2"
+ )
+ KEYWORDS_GENERATION_PROMPT_PATH = os.path.join(
+ root_dir, "prompt", "keywords_generation_prompt.jinja2"
+ )


  @dataclass
  class AuthConfig:
- url: str
+ url: Optional[str] = None
  tenant_name: str = "local"
  token: str = None

@@ -25,7 +31,33 @@ class LLMUserConfig:
  @dataclass
  class ProviderConfig:
  model_id: str = field(default="meta-llama/llama-3-405b-instruct")
- provider: str = field(default="watsonx")
+ provider: str = field(
+ default_factory=lambda: (
+ "gateway"
+ if os.getenv("USE_GATEWAY_MODEL_PROVIDER", "").lower() == "true"
+ else "watsonx"
+ )
+ )
+ embedding_model_id: str = field(
+ default="sentence-transformers/all-minilm-l6-v2"
+ )
+
+
+ @dataclass
+ class CustomMetricsConfig:
+ paths: Optional[list[str]] = field(default=None)
+ llmaaj_config: ProviderConfig = field(default_factory=ProviderConfig)
+
+
+ @dataclass
+ class ExtractorsConfig:
+ paths: Optional[list[str]] = field(default=None)
+
+
+
+ class ControllerConfig:
+ enable_verbose_logging: bool = True
+ enable_manual_user_input: bool = False


  @dataclass
@@ -36,16 +68,59 @@ class TestConfig:
  wxo_lite_version: str
  provider_config: ProviderConfig = field(default_factory=ProviderConfig)
  llm_user_config: LLMUserConfig = field(default_factory=LLMUserConfig)
+ custom_metrics_config: CustomMetricsConfig = field(
+ default_factory=CustomMetricsConfig
+ )
+ extractors_config: ExtractorsConfig = field(default_factory=ExtractorsConfig)
  enable_verbose_logging: bool = True
  enable_manual_user_input: bool = False
  skip_available_results: bool = False
  data_annotation_run: bool = False
  num_workers: int = 2
+ n_runs: int = 1
+ similarity_threshold: float = 0.8
+ enable_fuzzy_matching: bool = False
+ strict_topological_matching: bool = True
+ enable_recursive_search: bool = False
+ skip_legacy_evaluation: bool = False # Skip legacy evaluation and only run user/agent simulations
+
+
+ @dataclass
+ class AttackConfig:
+ attack_paths: List[str]
+ output_dir: str
+ auth_config: AuthConfig
+ provider_config: ProviderConfig = field(default_factory=ProviderConfig)
+ llm_user_config: LLMUserConfig = field(default_factory=LLMUserConfig)
+ enable_verbose_logging: bool = True
+ enable_manual_user_input: bool = False
+ num_workers: int = 2
+ skip_available_results: bool = True
+
+
+ @dataclass
+ class AttackGeneratorConfig:
+ attacks_list: Union[List[str], str]
+ datasets_path: Union[List[str], str]
+ agents_list_or_path: Union[List[str], str]
+ target_agent_name: str
+ auth_config: AuthConfig
+ output_dir: str = None
+ max_variants: int = None
+
+
+ class AnalyzeMode(StrEnum):
+ default = "default"
+ enhanced = "enhanced"


  @dataclass
  class AnalyzeConfig:
  data_path: str
+ tool_definition_path: Optional[str] = None
+ mode: str = AnalyzeMode.default
+ num_workers: int = 10
+ run: int = -1


  @dataclass
@@ -74,6 +149,12 @@ class ChatRecordingConfig:
  service_url: str = "http://localhost:4321"
  tenant_name: str = "local"
  token: str = None
+ max_retries: int = 5
+
+
+ @dataclass
+ class QuickEvalConfig(TestConfig):
+ tools_path: str = None


  @dataclass
@@ -83,3 +164,14 @@ class BatchAnnotateConfig:
  stories_path: str
  output_dir: str
  num_variants: int = 2
+
+
+ @dataclass
+ class CompareRunsConfig:
+ reference_file_location: str
+ experiment_file_location: str
+ csv_output: Optional[str] = None
+ column_stats_csv: Optional[str] = (
+ "column_by_column_summary_stats_comparison.csv"
+ )
+ verbose: bool = False
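The hunks above appear to correspond to `wxo_agentic_evaluation/arg_configs.py` in the file list. As a quick orientation, here is a minimal sketch of instantiating two of the new config dataclasses, assuming the field names exactly as shown in the diff; the file paths and the environment-variable value are illustrative only:

```python
import os

from wxo_agentic_evaluation.arg_configs import CompareRunsConfig, ProviderConfig

# The provider default is resolved from USE_GATEWAY_MODEL_PROVIDER at
# construction time (via default_factory), so set it before instantiating.
os.environ["USE_GATEWAY_MODEL_PROVIDER"] = "true"
provider_config = ProviderConfig()  # provider resolves to "gateway"
print(provider_config.provider, provider_config.embedding_model_id)

# CompareRunsConfig only requires the two run-result file locations;
# csv_output and column_stats_csv keep their defaults.
compare_config = CompareRunsConfig(
    reference_file_location="reference_run.csv",    # illustrative path
    experiment_file_location="experiment_run.csv",  # illustrative path
    verbose=True,
)
```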
@@ -0,0 +1,25 @@
+ from abc import ABC, abstractmethod
+ from typing import List
+
+ from wxo_agentic_evaluation.type import Message
+
+
+ class BaseUserSimulator(ABC):
+ """Abstract base class for user simulators."""
+
+ @abstractmethod
+ def generate_user_input(
+ self, user_story: str, conversation_history: List[Message], **kwargs
+ ) -> Message:
+ """
+ Generate user input based on the user story and conversation history.
+
+ Args:
+ user_story: The user's story or goal
+ conversation_history: List of previous messages in the conversation
+ **kwargs: Additional parameters specific to the simulator implementation
+
+ Returns:
+ Message: The generated user input message
+ """
+ pass
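The new `base_user.py` module above defines only the abstract contract. Below is a minimal sketch of a concrete simulator written against that contract; the `ScriptedUser` class is hypothetical, and the `Message(role=..., content=...)` constructor arguments are assumptions not confirmed by this diff:

```python
from typing import List

from wxo_agentic_evaluation.base_user import BaseUserSimulator
from wxo_agentic_evaluation.type import Message


class ScriptedUser(BaseUserSimulator):
    """Replays a fixed list of utterances instead of calling an LLM."""

    def __init__(self, utterances: List[str]):
        self.utterances = list(utterances)
        self._next = 0

    def generate_user_input(
        self, user_story: str, conversation_history: List[Message], **kwargs
    ) -> Message:
        # Ignore the story/history and replay the next scripted utterance,
        # falling back to a generic closing message when the script runs out.
        if self._next < len(self.utterances):
            text = self.utterances[self._next]
            self._next += 1
        else:
            text = "That is all, thank you."
        # NOTE: the Message field names (role, content) are assumed for
        # illustration; check wxo_agentic_evaluation.type for the real schema.
        return Message(role="user", content=text)
```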
@@ -1,22 +1,28 @@
- import json
  import ast
  import csv
+ import json
  import os
  from pathlib import Path
+
  from jsonargparse import CLI

- from wxo_agentic_evaluation.service_provider import get_provider
- from wxo_agentic_evaluation.prompt.template_render import BatchTestCaseGeneratorTemplateRenderer
- from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
  from wxo_agentic_evaluation import __file__
+ from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
+ from wxo_agentic_evaluation.prompt.template_render import (
+ BatchTestCaseGeneratorTemplateRenderer,
+ )
+ from wxo_agentic_evaluation.service_provider import get_provider

  root_dir = os.path.dirname(__file__)
- BATCH_TEST_CASE_GENERATOR_PROMPT_PATH = os.path.join(root_dir, "prompt", "batch_testcase_prompt.jinja2")
+ BATCH_TEST_CASE_GENERATOR_PROMPT_PATH = os.path.join(
+ root_dir, "prompt", "batch_testcase_prompt.jinja2"
+ )
  EXAMPLE_PATH = os.path.join(root_dir, "prompt", "examples", "data_simple.json")


- def parse_tools_with_filter(agent_name: str, tools_path: Path, allowed_tool_names: list[str]) -> tuple[
- dict, list[dict]]:
+ def parse_tools_with_filter(
+ agent_name: str, tools_path: Path, allowed_tool_names: list[str]
+ ) -> tuple[dict, list[dict]]:
  if not allowed_tool_names:
  raise ValueError("Allowed tool list cannot be empty.")

@@ -29,7 +35,9 @@ def parse_tools_with_filter(agent_name: str, tools_path: Path, allowed_tool_name
  elif tools_path.is_dir():
  files_to_parse.extend(tools_path.glob("**/*.py"))
  else:
- raise ValueError(f"Tools path {tools_path} is neither a file nor directory")
+ raise ValueError(
+ f"Tools path {tools_path} is neither a file nor directory"
+ )

  for file_path in files_to_parse:
  try:
@@ -41,21 +49,29 @@ def parse_tools_with_filter(agent_name: str, tools_path: Path, allowed_tool_name
  # Process only module-level functions
  for node in parsed_code.body:
  if isinstance(node, ast.FunctionDef):
- tool_data.append({
- "Function Name": node.name,
- "Arguments": [arg.arg for arg in node.args.args],
- "Docstring": ast.get_docstring(node)
- })
+ tool_data.append(
+ {
+ "Function Name": node.name,
+ "Arguments": [arg.arg for arg in node.args.args],
+ "Docstring": ast.get_docstring(node),
+ }
+ )

  except Exception as e:
  print(f"Warning: Failed to parse {file_path}: {str(e)}")
  continue

  # Filter tools based on allowed names
- filtered_tools = [tool for tool in tool_data if tool["Function Name"] in allowed_tool_names]
+ filtered_tools = [
+ tool
+ for tool in tool_data
+ if tool["Function Name"] in allowed_tool_names
+ ]

  if not filtered_tools:
- print(f"Warning: No matching tools found. Available tools: {[t['Function Name'] for t in tool_data]}")
+ print(
+ f"Warning: No matching tools found. Available tools: {[t['Function Name'] for t in tool_data]}"
+ )

  return {"name": agent_name}, filtered_tools

@@ -75,8 +91,17 @@ def load_example(example_path: Path):


  # Step 4: Prompt builder for N test cases from a given story
- def build_prompt_for_story(agent, tools, tool_inputs, example_case: dict, story: str, num_variants: int = 2):
- renderer = BatchTestCaseGeneratorTemplateRenderer(BATCH_TEST_CASE_GENERATOR_PROMPT_PATH)
+ def build_prompt_for_story(
+ agent,
+ tools,
+ tool_inputs,
+ example_case: dict,
+ story: str,
+ num_variants: int = 2,
+ ):
+ renderer = BatchTestCaseGeneratorTemplateRenderer(
+ BATCH_TEST_CASE_GENERATOR_PROMPT_PATH
+ )

  tool_blocks = "\n".join(
  f"- Tool: {t['Function Name']}\n Description: {t['Docstring']}\n Args: {', '.join(t['Arguments']) or 'None'}"
@@ -93,16 +118,43 @@ def build_prompt_for_story(agent, tools, tool_inputs, example_case: dict, story:
  )
  return prompt

+
  # Step 5: Send prompt to LLM and save test cases
- def generate_multiple_in_one(prompt, output_dir, starting_index, model_id="meta-llama/llama-3-405b-instruct", ):
+ def generate_multiple_in_one(
+ prompt,
+ output_dir,
+ starting_index,
+ model_id="meta-llama/llama-3-405b-instruct",
+ # model_id="gpt-4o",
+ ):
  output_dir.mkdir(parents=True, exist_ok=True)

+ # Legacy provider (e.g., watsonx)
  provider = get_provider(
  model_id=model_id,
- params={"min_new_tokens": 50, "decoding_method": "greedy", "max_new_tokens": 3000},
+ params={
+ "min_new_tokens": 50,
+ "decoding_method": "greedy",
+ "max_new_tokens": 3000,
+ },
+ use_portkey_provider=False,
  )
-
- response = provider.query(prompt)
+ response = provider.chat(prompt).text
+
+ # # OpenAI provider
+ # provider = get_provider(provider="openai", model_id=model_id, api_key=os.getenv("OPENAI_API_KEY"))
+ # response = provider.chat(prompt).choices[0].message.content
+
+ # # Azure OpenAI provider
+ # provider = get_provider(
+ # provider = "azure-openai",
+ # azure_model_name = model_id,
+ # azure_deployment_id = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
+ # azure_resource_name = os.getenv("AZURE_OPENAI_RESOURCE_NAME"),
+ # azure_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
+ # api_key = f"Bearer {os.getenv('AZURE_OPENAI_API_KEY')}"
+ # )
+ # response = provider.chat(prompt).choices[0].message.content

  try:
  raw_text = response
@@ -124,8 +176,19 @@ def generate_multiple_in_one(prompt, output_dir, starting_index, model_id="meta-
  print("Raw text:\n", raw_text)
  print("Error:", str(e))

- def generate_test_cases_from_stories(agent_name: str, stories: list[str], tools_path: Path, snapshot_path: Path, output_dir: Path, allowed_tools: list[str], num_variants: int = 2):
- agent, tools = parse_tools_with_filter(agent_name, tools_path, allowed_tools)
+
+ def generate_test_cases_from_stories(
+ agent_name: str,
+ stories: list[str],
+ tools_path: Path,
+ snapshot_path: Path,
+ output_dir: Path,
+ allowed_tools: list[str],
+ num_variants: int = 2,
+ ):
+ agent, tools = parse_tools_with_filter(
+ agent_name, tools_path, allowed_tools
+ )
  tool_inputs = extract_inputs_from_snapshot(snapshot_path)
  example_json = load_example(Path(EXAMPLE_PATH))

@@ -134,23 +197,29 @@ def generate_test_cases_from_stories(agent_name: str, stories: list[str], tools_
  print(f"\n Generating test cases for story {idx}: {story}")

  prompt = build_prompt_for_story(
- agent, tools, tool_inputs, example_json, story, num_variants=num_variants
+ agent,
+ tools,
+ tool_inputs,
+ example_json,
+ story,
+ num_variants=num_variants,
  )

  generate_multiple_in_one(
  prompt=prompt,
  output_dir=output_dir,
- starting_index=test_case_counter
+ starting_index=test_case_counter,
  )

  test_case_counter += num_variants

+
  def main(config: BatchAnnotateConfig):
  stories_path = Path(config.stories_path)

  stories = []
  agent_name = None
- with stories_path.open("r", encoding="utf-8", newline='') as f:
+ with stories_path.open("r", encoding="utf-8", newline="") as f:
  csv_reader = csv.DictReader(f)
  for row in csv_reader:
  stories.append(row["story"])
@@ -168,8 +237,9 @@ def main(config: BatchAnnotateConfig):
  snapshot_path,
  output_dir,
  config.allowed_tools,
- num_variants=config.num_variants
+ num_variants=config.num_variants,
  )

+
  if __name__ == "__main__":
  main(CLI(BatchAnnotateConfig, as_positional=False))
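For orientation, a short usage sketch of the reformatted `parse_tools_with_filter` helper above, assuming it is importable from `wxo_agentic_evaluation.batch_annotate` as listed in the changed files; the agent name, directory, and tool name are placeholders:

```python
from pathlib import Path

from wxo_agentic_evaluation.batch_annotate import parse_tools_with_filter

# Scan ./tools/**/*.py for module-level functions and keep only the named ones.
agent, tools = parse_tools_with_filter(
    agent_name="hr_agent",                        # placeholder agent name
    tools_path=Path("tools"),                     # placeholder directory of tool modules
    allowed_tool_names=["get_timeoff_schedule"],  # placeholder tool name
)
for tool in tools:
    print(tool["Function Name"], tool["Arguments"])
```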
@@ -0,0 +1,103 @@
+ import copy
+ from dataclasses import asdict, dataclass
+
+ from wxo_agentic_evaluation.arg_configs import ProviderConfig, TestConfig
+ from wxo_agentic_evaluation.llm_user import LLMUser
+ from wxo_agentic_evaluation.prompt.template_render import (
+ LlamaUserTemplateRenderer,
+ )
+ from wxo_agentic_evaluation.resource_map import ResourceMap
+ from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import (
+ WXORuntimeAdapter,
+ )
+ from wxo_agentic_evaluation.service_provider import get_provider
+ from wxo_agentic_evaluation.service_provider.provider import Provider
+ from wxo_agentic_evaluation.wxo_client import WXOClient, get_wxo_client
+
+
+ @dataclass
+ class Clients:
+ wxo_client: WXOClient
+ llmaaj_provider: Provider
+ resource_map: ResourceMap
+ inference_backend: WXORuntimeAdapter
+ llm_user: LLMUser
+
+
+ def bootstrap_clients(config: TestConfig) -> Clients:
+ """
+ Bootstrap all clients needed for the evaluation.
+
+ Args:
+ config: The test configuration
+
+ Returns:
+ A tuple containing:
+ - wxo_client: The WXO client
+ - llmaaj_provider: The provider for custom metrics
+ - resource_map: The resource map
+ - inference_backend: The inference backend
+ - llm_user: The LLM user
+ """
+ # Initialize WXO client
+ wxo_client = get_wxo_client(
+ config.auth_config.url,
+ config.auth_config.tenant_name,
+ config.auth_config.token,
+ )
+
+ # Initialize provider for custom metrics
+ original_provider_config = config.provider_config
+ provider_config_dict = asdict(original_provider_config)
+
+ provider_kwargs = {
+ "config": ProviderConfig(**provider_config_dict),
+ "model_id": config.llm_user_config.model_id,
+ }
+
+ if provider_config_dict.get("provider", "gateway") == "gateway":
+ provider_kwargs.update(
+ token=config.auth_config.token or wxo_client.api_key,
+ instance_url=wxo_client.service_url,
+ )
+ config.auth_config.token = (
+ config.auth_config.token or wxo_client.api_key
+ )
+ config.auth_config.url = (
+ config.auth_config.url or wxo_client.service_url
+ )
+
+ # Initialize resource map
+ resource_map = ResourceMap(wxo_client)
+
+ # Initialize inference backend
+ inference_backend = WXORuntimeAdapter(wxo_client=wxo_client)
+
+ # Initialize LLM user
+ llm_user = LLMUser(
+ wai_client=get_provider(**provider_kwargs),
+ template=LlamaUserTemplateRenderer(
+ config.llm_user_config.prompt_config
+ ),
+ user_response_style=config.llm_user_config.user_response_style,
+ )
+
+ llamaj_provider_kwargs = copy.deepcopy(provider_kwargs)
+ llamaj_config_dict = asdict(llamaj_provider_kwargs["config"])
+
+ llamaj_config_dict["model_id"] = (
+ config.custom_metrics_config.llmaaj_config.model_id
+ )
+ llamaj_config_dict["embedding_model_id"] = (
+ config.custom_metrics_config.llmaaj_config.embedding_model_id
+ )
+ llamaj_provider_kwargs["config"] = ProviderConfig(**llamaj_config_dict)
+ llmaaj_provider = get_provider(**llamaj_provider_kwargs)
+
+ return Clients(
+ wxo_client=wxo_client,
+ llmaaj_provider=llmaaj_provider,
+ resource_map=resource_map,
+ inference_backend=inference_backend,
+ llm_user=llm_user,
+ )
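The new `clients.py` module above centralizes client construction. Here is a minimal consumption sketch, assuming a `TestConfig` is built elsewhere (for example by the package's `jsonargparse`-based CLIs); only `Clients` fields shown in the diff are touched:

```python
from wxo_agentic_evaluation.arg_configs import TestConfig
from wxo_agentic_evaluation.clients import bootstrap_clients


def run_with_clients(config: TestConfig) -> None:
    # bootstrap_clients wires up the WXO client, resource map, runtime
    # adapter, LLM user, and the LLM-as-a-judge provider in one call.
    clients = bootstrap_clients(config)

    print(type(clients.wxo_client).__name__)
    print(type(clients.inference_backend).__name__)
    print(type(clients.llm_user).__name__)
    print(type(clients.llmaaj_provider).__name__)
```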
File without changes
@@ -0,0 +1,74 @@
+ import csv
+ import os
+ import statistics
+ import sys
+ from collections import defaultdict
+ from dataclasses import dataclass, field
+ from typing import Any, Callable, Dict, List, Optional, Union
+
+ from jsonargparse import CLI
+
+ from wxo_agentic_evaluation.arg_configs import CompareRunsConfig
+ from wxo_agentic_evaluation.compare_runs.diff import DiffResults
+ from wxo_agentic_evaluation.compare_runs.model import EvaluationResult
+ from wxo_agentic_evaluation.utils.utils import create_table, read_file
+
+
+ def main(config: CompareRunsConfig):
+ """Main function to compare two run result files."""
+ # Extract values from config
+ reference_file = config.reference_file_location
+ experiment_file = config.experiment_file_location
+ csv_output = config.csv_output
+ column_stats_csv = config.column_stats_csv
+ verbose = config.verbose
+
+ try:
+ # Read the files
+ obj1 = read_file(reference_file)
+ obj2 = read_file(experiment_file)
+
+ # Create evaluation results
+ result1 = EvaluationResult.from_csv(obj1)
+ result2 = EvaluationResult.from_csv(obj2)
+
+ # Create diff results
+ diff_results = DiffResults(result1, result2)
+
+ # Display summary statistics
+ summary_stats = diff_results.summary_statistics()
+ summary_table = create_table(summary_stats, title="Summary Statistics")
+ print(
+ "\nALL metrics are computed on OVERLAPPING test cases, ie cases that exist in both the Reference and Experiment runs\n"
+ )
+ print(
+ "If Experiment - Reference is Positive, that's an increase in the metric. If Experiment - Reference is Negative, that's a decrease in the metric.\n"
+ )
+ summary_table.print()
+
+ # Display exclusive tests
+ if verbose:
+ diff_results.display_exclusive_tests()
+
+ # Display test cases with differing summary match and success status
+ diff_results.display_differing_summary_matches()
+
+ # Display tabular diff
+ diff_results.compute_tabular_diff(verbose=verbose)
+
+ # Write results to CSV if specified
+ if csv_output:
+ diff_results.to_csv(csv_output)
+
+ except Exception as e:
+ print(f"Error: {e}", file=sys.stderr)
+ return 1
+
+ return 0
+
+
+ if __name__ == "__main__":
+ args = CLI(CompareRunsConfig, as_positional=False)
+ sys.exit(main(args))
+
+ # Made with Bob
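Finally, a sketch of driving the new comparison script programmatically instead of through the `jsonargparse` CLI in its `__main__` block; the CSV paths are placeholders:

```python
import sys

from wxo_agentic_evaluation.arg_configs import CompareRunsConfig
from wxo_agentic_evaluation.compare_runs.compare_2_runs import main

config = CompareRunsConfig(
    reference_file_location="runs/reference_summary.csv",    # placeholder
    experiment_file_location="runs/experiment_summary.csv",  # placeholder
    csv_output="runs/diff_output.csv",                       # placeholder
    verbose=True,
)

# main() returns 0 on success and 1 on error, mirroring its CLI exit codes.
sys.exit(main(config))
```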