ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/annotate.py (+2 -2)

@@ -6,7 +6,7 @@ from jsonargparse import CLI
 
 from wxo_agentic_evaluation.arg_configs import TestCaseGenerationConfig
 from wxo_agentic_evaluation.data_annotator import DataAnnotator
-from wxo_agentic_evaluation.type import EvaluationData, Message
+from wxo_agentic_evaluation.type import Message, OrchestrateDataset
 
 
 def main(config: TestCaseGenerationConfig):
@@ -17,7 +17,7 @@ def main(config: TestCaseGenerationConfig):
         messages.append(Message.model_validate(entry))
 
     with open(config.seed_data_path, "r") as f:
-        evaluation_data = EvaluationData(**json.load(f))
+        evaluation_data = OrchestrateDataset(**json.load(f))
 
     # Generate annonated dataset
     annotator = DataAnnotator(
wxo_agentic_evaluation/arg_configs.py (+60 -2)

@@ -1,5 +1,6 @@
 import os
 from dataclasses import dataclass, field
+from enum import StrEnum
 from typing import List, Optional, Union
 
 from wxo_agentic_evaluation import __file__
@@ -30,7 +31,33 @@ class LLMUserConfig:
 @dataclass
 class ProviderConfig:
     model_id: str = field(default="meta-llama/llama-3-405b-instruct")
-    provider: str = field(default="watsonx")
+    provider: str = field(
+        default_factory=lambda: (
+            "gateway"
+            if os.getenv("USE_GATEWAY_MODEL_PROVIDER", "").lower() == "true"
+            else "watsonx"
+        )
+    )
+    embedding_model_id: str = field(
+        default="sentence-transformers/all-minilm-l6-v2"
+    )
+
+
+@dataclass
+class CustomMetricsConfig:
+    paths: Optional[list[str]] = field(default=None)
+    llmaaj_config: ProviderConfig = field(default_factory=ProviderConfig)
+
+
+@dataclass
+class ExtractorsConfig:
+    paths: Optional[list[str]] = field(default=None)
+
+
+
+class ControllerConfig:
+    enable_verbose_logging: bool = True
+    enable_manual_user_input: bool = False
 
 
 @dataclass
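The provider default is now resolved through default_factory, so the USE_GATEWAY_MODEL_PROVIDER environment variable is consulted each time a ProviderConfig is instantiated. A minimal sketch of the behaviour this implies (not taken from the package documentation):

    import os

    from wxo_agentic_evaluation.arg_configs import ProviderConfig

    # Flag unset (or anything other than "true"): the default stays "watsonx".
    print(ProviderConfig().provider)   # "watsonx"

    # The factory re-reads the environment on every instantiation,
    # so newly created configs pick up the gateway provider.
    os.environ["USE_GATEWAY_MODEL_PROVIDER"] = "true"
    print(ProviderConfig().provider)   # "gateway"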
@@ -41,11 +68,21 @@ class TestConfig:
     wxo_lite_version: str
     provider_config: ProviderConfig = field(default_factory=ProviderConfig)
     llm_user_config: LLMUserConfig = field(default_factory=LLMUserConfig)
+    custom_metrics_config: CustomMetricsConfig = field(
+        default_factory=CustomMetricsConfig
+    )
+    extractors_config: ExtractorsConfig = field(default_factory=ExtractorsConfig)
     enable_verbose_logging: bool = True
     enable_manual_user_input: bool = False
     skip_available_results: bool = False
     data_annotation_run: bool = False
     num_workers: int = 2
+    n_runs: int = 1
+    similarity_threshold: float = 0.8
+    enable_fuzzy_matching: bool = False
+    strict_topological_matching: bool = True
+    enable_recursive_search: bool = False
+    skip_legacy_evaluation: bool = False  # Skip legacy evaluation and only run user/agent simulations
 
 
 @dataclass
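TestConfig now carries custom_metrics_config and extractors_config sections alongside the new matching and multi-run knobs. A hedged sketch of how the new sub-configs could be populated; the paths and directory names are illustrative placeholders, not values shipped with the package:

    from wxo_agentic_evaluation.arg_configs import (
        CustomMetricsConfig,
        ExtractorsConfig,
        ProviderConfig,
    )

    # Hypothetical locations of user-supplied metric and extractor modules.
    custom_metrics = CustomMetricsConfig(
        paths=["./my_custom_metrics"],
        llmaaj_config=ProviderConfig(model_id="meta-llama/llama-3-405b-instruct"),
    )
    extractors = ExtractorsConfig(paths=["./my_extractors"])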
@@ -58,22 +95,32 @@ class AttackConfig:
     enable_verbose_logging: bool = True
     enable_manual_user_input: bool = False
     num_workers: int = 2
+    skip_available_results: bool = True
 
 
 @dataclass
 class AttackGeneratorConfig:
     attacks_list: Union[List[str], str]
     datasets_path: Union[List[str], str]
-    agents_path: str
+    agents_list_or_path: Union[List[str], str]
     target_agent_name: str
+    auth_config: AuthConfig
     output_dir: str = None
     max_variants: int = None
 
 
+class AnalyzeMode(StrEnum):
+    default = "default"
+    enhanced = "enhanced"
+
+
 @dataclass
 class AnalyzeConfig:
     data_path: str
     tool_definition_path: Optional[str] = None
+    mode: str = AnalyzeMode.default
+    num_workers: int = 10
+    run: int = -1
 
 
 @dataclass
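AnalyzeConfig picks up a mode switch backed by the new AnalyzeMode enum plus worker and run selectors. A small sketch; the data path is illustrative, and the reading of run=-1 as "the most recent run" is an assumption:

    from wxo_agentic_evaluation.arg_configs import AnalyzeConfig, AnalyzeMode

    config = AnalyzeConfig(
        data_path="./test_output",     # illustrative results location
        mode=AnalyzeMode.enhanced,     # AnalyzeMode.default keeps the previous behaviour
        num_workers=10,
        run=-1,                        # assumed to mean "latest run"
    )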
@@ -117,3 +164,14 @@ class BatchAnnotateConfig:
     stories_path: str
     output_dir: str
     num_variants: int = 2
+
+
+@dataclass
+class CompareRunsConfig:
+    reference_file_location: str
+    experiment_file_location: str
+    csv_output: Optional[str] = None
+    column_stats_csv: Optional[str] = (
+        "column_by_column_summary_stats_comparison.csv"
+    )
+    verbose: bool = False
wxo_agentic_evaluation/base_user.py (+25 -0)

@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+from typing import List
+
+from wxo_agentic_evaluation.type import Message
+
+
+class BaseUserSimulator(ABC):
+    """Abstract base class for user simulators."""
+
+    @abstractmethod
+    def generate_user_input(
+        self, user_story: str, conversation_history: List[Message], **kwargs
+    ) -> Message:
+        """
+        Generate user input based on the user story and conversation history.
+
+        Args:
+            user_story: The user's story or goal
+            conversation_history: List of previous messages in the conversation
+            **kwargs: Additional parameters specific to the simulator implementation
+
+        Returns:
+            Message: The generated user input message
+        """
+        pass
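A minimal concrete simulator built on this base class, shown only to illustrate the contract. The class name is hypothetical, and it assumes Message is a pydantic model that accepts role and content fields (the real schema lives in wxo_agentic_evaluation/type.py):

    from typing import List

    from wxo_agentic_evaluation.base_user import BaseUserSimulator
    from wxo_agentic_evaluation.type import Message


    class ScriptedUserSimulator(BaseUserSimulator):
        """Hypothetical simulator that replays a fixed script instead of calling an LLM."""

        def __init__(self, utterances: List[str]):
            self._utterances = utterances

        def generate_user_input(
            self, user_story: str, conversation_history: List[Message], **kwargs
        ) -> Message:
            # Advance one scripted line per user turn, repeating the last line when the script runs out.
            turn = min(len(conversation_history) // 2, len(self._utterances) - 1)
            # Assumes Message(role=..., content=...); adjust to the actual Message schema.
            return Message.model_validate(
                {"role": "user", "content": self._utterances[turn]}
            )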
wxo_agentic_evaluation/batch_annotate.py (+19 -2)

@@ -125,9 +125,11 @@ def generate_multiple_in_one(
     output_dir,
     starting_index,
     model_id="meta-llama/llama-3-405b-instruct",
+    # model_id="gpt-4o",
 ):
     output_dir.mkdir(parents=True, exist_ok=True)
 
+    # Legacy provider (e.g., watsonx)
     provider = get_provider(
         model_id=model_id,
         params={
@@ -135,9 +137,24 @@ def generate_multiple_in_one(
             "decoding_method": "greedy",
             "max_new_tokens": 3000,
         },
+        use_portkey_provider=False,
     )
-
-    response = provider.query(prompt)
+    response = provider.chat(prompt).text
+
+    # # OpenAI provider
+    # provider = get_provider(provider="openai", model_id=model_id, api_key=os.getenv("OPENAI_API_KEY"))
+    # response = provider.chat(prompt).choices[0].message.content
+
+    # # Azure OpenAI provider
+    # provider = get_provider(
+    # provider = "azure-openai",
+    # azure_model_name = model_id,
+    # azure_deployment_id = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
+    # azure_resource_name = os.getenv("AZURE_OPENAI_RESOURCE_NAME"),
+    # azure_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
+    # api_key = f"Bearer {os.getenv('AZURE_OPENAI_API_KEY')}"
+    # )
+    # response = provider.chat(prompt).choices[0].message.content
 
     try:
         raw_text = response
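The substantive change in this hunk is a provider API migration on the default path: get_provider gains a use_portkey_provider flag and the old provider.query(prompt) call becomes provider.chat(prompt).text, while the commented-out blocks sketch OpenAI and Azure OpenAI alternatives that read choices[0].message.content instead. A condensed view of the active path, with a placeholder prompt and the params abridged to what the hunk shows:

    from wxo_agentic_evaluation.service_provider import get_provider

    prompt = "..."  # placeholder prompt text
    provider = get_provider(
        model_id="meta-llama/llama-3-405b-instruct",
        params={"decoding_method": "greedy", "max_new_tokens": 3000},
        use_portkey_provider=False,
    )
    response = provider.chat(prompt).text  # previously: provider.query(prompt)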
wxo_agentic_evaluation/clients.py (+103 -0)

@@ -0,0 +1,103 @@
+import copy
+from dataclasses import asdict, dataclass
+
+from wxo_agentic_evaluation.arg_configs import ProviderConfig, TestConfig
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaUserTemplateRenderer,
+)
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import (
+    WXORuntimeAdapter,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.service_provider.provider import Provider
+from wxo_agentic_evaluation.wxo_client import WXOClient, get_wxo_client
+
+
+@dataclass
+class Clients:
+    wxo_client: WXOClient
+    llmaaj_provider: Provider
+    resource_map: ResourceMap
+    inference_backend: WXORuntimeAdapter
+    llm_user: LLMUser
+
+
+def bootstrap_clients(config: TestConfig) -> Clients:
+    """
+    Bootstrap all clients needed for the evaluation.
+
+    Args:
+        config: The test configuration
+
+    Returns:
+        A tuple containing:
+        - wxo_client: The WXO client
+        - llmaaj_provider: The provider for custom metrics
+        - resource_map: The resource map
+        - inference_backend: The inference backend
+        - llm_user: The LLM user
+    """
+    # Initialize WXO client
+    wxo_client = get_wxo_client(
+        config.auth_config.url,
+        config.auth_config.tenant_name,
+        config.auth_config.token,
+    )
+
+    # Initialize provider for custom metrics
+    original_provider_config = config.provider_config
+    provider_config_dict = asdict(original_provider_config)
+
+    provider_kwargs = {
+        "config": ProviderConfig(**provider_config_dict),
+        "model_id": config.llm_user_config.model_id,
+    }
+
+    if provider_config_dict.get("provider", "gateway") == "gateway":
+        provider_kwargs.update(
+            token=config.auth_config.token or wxo_client.api_key,
+            instance_url=wxo_client.service_url,
+        )
+        config.auth_config.token = (
+            config.auth_config.token or wxo_client.api_key
+        )
+        config.auth_config.url = (
+            config.auth_config.url or wxo_client.service_url
+        )
+
+    # Initialize resource map
+    resource_map = ResourceMap(wxo_client)
+
+    # Initialize inference backend
+    inference_backend = WXORuntimeAdapter(wxo_client=wxo_client)
+
+    # Initialize LLM user
+    llm_user = LLMUser(
+        wai_client=get_provider(**provider_kwargs),
+        template=LlamaUserTemplateRenderer(
+            config.llm_user_config.prompt_config
+        ),
+        user_response_style=config.llm_user_config.user_response_style,
+    )
+
+    llamaj_provider_kwargs = copy.deepcopy(provider_kwargs)
+    llamaj_config_dict = asdict(llamaj_provider_kwargs["config"])
+
+    llamaj_config_dict["model_id"] = (
+        config.custom_metrics_config.llmaaj_config.model_id
+    )
+    llamaj_config_dict["embedding_model_id"] = (
+        config.custom_metrics_config.llmaaj_config.embedding_model_id
+    )
+    llamaj_provider_kwargs["config"] = ProviderConfig(**llamaj_config_dict)
+    llmaaj_provider = get_provider(**llamaj_provider_kwargs)
+
+    return Clients(
+        wxo_client=wxo_client,
+        llmaaj_provider=llmaaj_provider,
+        resource_map=resource_map,
+        inference_backend=inference_backend,
+        llm_user=llm_user,
+    )
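The new clients.py appears to centralize client construction that main.py previously performed inline (main.py shrinks by roughly 230 lines in this release). A sketch of the intended call pattern, based only on the signatures above; the wrapper function is illustrative:

    from wxo_agentic_evaluation.arg_configs import TestConfig
    from wxo_agentic_evaluation.clients import Clients, bootstrap_clients


    def start_run(config: TestConfig) -> Clients:
        # One call wires up every dependency the evaluation loop needs.
        clients = bootstrap_clients(config)
        # Handles exposed by the Clients dataclass:
        #   clients.wxo_client         - WXO client built from config.auth_config
        #   clients.resource_map       - ResourceMap over that client
        #   clients.inference_backend  - WXORuntimeAdapter used to run conversations
        #   clients.llm_user           - LLMUser driving the simulated user
        #   clients.llmaaj_provider    - judge provider for custom metrics
        return clients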
wxo_agentic_evaluation/compare_runs/__init__.py (+0 -0)

File added with no content changes (an empty package __init__.py).
wxo_agentic_evaluation/compare_runs/compare_2_runs.py (+74 -0)

@@ -0,0 +1,74 @@
+import csv
+import os
+import statistics
+import sys
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional, Union
+
+from jsonargparse import CLI
+
+from wxo_agentic_evaluation.arg_configs import CompareRunsConfig
+from wxo_agentic_evaluation.compare_runs.diff import DiffResults
+from wxo_agentic_evaluation.compare_runs.model import EvaluationResult
+from wxo_agentic_evaluation.utils.utils import create_table, read_file
+
+
+def main(config: CompareRunsConfig):
+    """Main function to compare two run result files."""
+    # Extract values from config
+    reference_file = config.reference_file_location
+    experiment_file = config.experiment_file_location
+    csv_output = config.csv_output
+    column_stats_csv = config.column_stats_csv
+    verbose = config.verbose
+
+    try:
+        # Read the files
+        obj1 = read_file(reference_file)
+        obj2 = read_file(experiment_file)
+
+        # Create evaluation results
+        result1 = EvaluationResult.from_csv(obj1)
+        result2 = EvaluationResult.from_csv(obj2)
+
+        # Create diff results
+        diff_results = DiffResults(result1, result2)
+
+        # Display summary statistics
+        summary_stats = diff_results.summary_statistics()
+        summary_table = create_table(summary_stats, title="Summary Statistics")
+        print(
+            "\nALL metrics are computed on OVERLAPPING test cases, ie cases that exist in both the Reference and Experiment runs\n"
+        )
+        print(
+            "If Experiment - Reference is Positive, that's an increase in the metric. If Experiment - Reference is Negative, that's a decrease in the metric.\n"
+        )
+        summary_table.print()
+
+        # Display exclusive tests
+        if verbose:
+            diff_results.display_exclusive_tests()
+
+        # Display test cases with differing summary match and success status
+        diff_results.display_differing_summary_matches()
+
+        # Display tabular diff
+        diff_results.compute_tabular_diff(verbose=verbose)
+
+        # Write results to CSV if specified
+        if csv_output:
+            diff_results.to_csv(csv_output)
+
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    args = CLI(CompareRunsConfig, as_positional=False)
+    sys.exit(main(args))
+
+# Made with Bob
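Because the entry point wraps CompareRunsConfig with jsonargparse's CLI (as_positional=False), the config fields should surface as flags such as --reference_file_location when the module is run directly. The programmatic equivalent below is grounded in the signatures above; the file names are illustrative:

    from wxo_agentic_evaluation.arg_configs import CompareRunsConfig
    from wxo_agentic_evaluation.compare_runs.compare_2_runs import main

    exit_code = main(
        CompareRunsConfig(
            reference_file_location="baseline_results.csv",    # illustrative
            experiment_file_location="candidate_results.csv",  # illustrative
            csv_output="run_diff.csv",
            verbose=True,
        )
    )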