ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/red_teaming/attack_runner.py
@@ -0,0 +1,204 @@
+ import dataclasses
+ import glob
+ import json
+ import os
+ import traceback
+ from concurrent.futures import ThreadPoolExecutor
+
+ import rich
+ import yaml
+ from jsonargparse import CLI
+ from rich.progress import Progress
+
+ from wxo_agentic_evaluation.arg_configs import AttackConfig
+ from wxo_agentic_evaluation.evaluation_controller.evaluation_controller import (
+     AttackEvaluationController,
+     EvaluationController,
+ )
+ from wxo_agentic_evaluation.llm_user import LLMUser
+ from wxo_agentic_evaluation.prompt.template_render import (
+     LlamaUserTemplateRenderer,
+ )
+ from wxo_agentic_evaluation.red_teaming.attack_evaluator import (
+     AttackEvaluator,
+     evaluate_all_attacks,
+ )
+ from wxo_agentic_evaluation.resource_map import ResourceMap
+ from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import (
+     WXORuntimeAdapter,
+ )
+ from wxo_agentic_evaluation.service_provider import (
+     USE_GATEWAY_MODEL_PROVIDER,
+     get_provider,
+ )
+ from wxo_agentic_evaluation.type import AttackData
+ from wxo_agentic_evaluation.utils import json_dump
+ from wxo_agentic_evaluation.wxo_client import get_wxo_client
+
+
+ def process_attack(
+     task_n, attack_path, config, inference_backend, llm_user, resource_map
+ ):
+     attack_filename = os.path.basename(attack_path).replace(".json", "")
+     with open(attack_path, "r") as f:
+         attack: AttackData = AttackData.model_validate(json.load(f))
+
+     attack_evaluator = AttackEvaluator(config, resource_map, attack_filename)
+
+     evaluation_controller = AttackEvaluationController(
+         runtime=inference_backend,
+         llm_user=llm_user,
+         config=config,
+         attack_data=attack,
+         attack_evaluator=attack_evaluator,
+     )
+     rich.print(
+         f"[bold magenta]Running attack: {attack_filename}[/bold magenta]"
+     )
+     history, _, _ = evaluation_controller.run(
+         task_n,
+         attack.story,
+         agent_name=attack.agent,
+         starting_user_input=attack.starting_sentence,
+         attack_instructions=attack.attack_data.attack_instructions,
+     )
+     result = list()
+     for message in history:
+         result.append(message.model_dump())
+
+     json_dump(
+         os.path.join(
+             config.output_dir, "messages", attack_filename + ".messages.json"
+         ),
+         result,
+     )
+
+     # Ensure there's a results dir and write a failure record if none was created
+     result_path = os.path.join(
+         config.output_dir, "results", attack_filename + ".result.json"
+     )
+     if not os.path.exists(result_path):
+         # attack evaluator should have written a success on early termination; if not, mark as failed
+         json_dump(
+             result_path,
+             {
+                 "attack_filename": attack_filename,
+                 "success": False,
+                 "attack_category": str(attack.attack_data.attack_category),
+                 "attack_name": getattr(attack.attack_data, "attack_name", ""),
+                 "attack_type": getattr(attack.attack_data, "attack_type", ""),
+             },
+         )
+
+     return result
+
+
+ def run_attacks(config: AttackConfig):
+     executor = ThreadPoolExecutor(max_workers=config.num_workers)
+     wxo_client = get_wxo_client(
+         config.auth_config.url,
+         config.auth_config.tenant_name,
+         config.auth_config.token,
+     )
+     resource_map = ResourceMap(wxo_client)
+     inference_backend = WXORuntimeAdapter(wxo_client=wxo_client)
+     provider_kwargs = {}
+     if USE_GATEWAY_MODEL_PROVIDER:
+         provider_kwargs.update(
+             instance_url=wxo_client.service_url,
+             token=wxo_client.api_key,
+         )
+     llm_user = LLMUser(
+         wai_client=get_provider(
+             model_id=config.llm_user_config.model_id, **provider_kwargs
+         ),
+         template=LlamaUserTemplateRenderer(
+             config.llm_user_config.prompt_config
+         ),
+         user_response_style=config.llm_user_config.user_response_style,
+     )
+
+     print(
+         f"Running red teaming attacks with tenant {config.auth_config.tenant_name}"
+     )
+     for folder in ["messages", "results", "evaluations"]:
+         os.makedirs(os.path.join(config.output_dir, folder), exist_ok=True)
+
+     available_res = set()
+     if config.skip_available_results:
+         available_res = set(
+             [
+                 os.path.basename(f).replace(".result", "")
+                 for f in glob.glob(
+                     os.path.join(config.output_dir, "results", "*.result.json")
+                 )
+             ]
+         )
+
+     results_list = []
+     attack_paths = []
+     for path in config.attack_paths:
+         if os.path.isdir(path):
+             path = os.path.join(path, "*.json")
+         attack_paths.extend(sorted(glob.glob(path)))
+
+     futures = []
+     task_n = 0
+
+     for attack_path in attack_paths:
+         if not attack_path.endswith(".json") or attack_path.endswith(
+             "agent.json"
+         ):
+             continue
+
+         if config.skip_available_results:
+             if os.path.basename(attack_path) in available_res:
+                 print(
+                     f"Skipping attack {os.path.basename(attack_path)} as results already exist."
+                 )
+                 continue
+
+         future = executor.submit(
+             process_attack,
+             task_n,
+             attack_path,
+             config,
+             inference_backend,
+             llm_user,
+             resource_map,
+         )
+
+         futures.append((attack_path, future))
+         task_n += 1
+
+     if futures:
+         with Progress() as progress:
+             task1 = progress.add_task(
+                 f"[purple]Running {len(futures)} attacks...", total=len(futures)
+             )
+             for attack_path, future in futures:
+                 try:
+                     results_list.extend(future.result())
+                 except Exception as e:
+                     rich.print(f"Attack {attack_path} fails with {e}")
+                     traceback.print_exc()
+                 finally:
+                     progress.update(task1, advance=1)
+
+     attack_results = evaluate_all_attacks(config, resource_map)
+
+     with open(
+         os.path.join(config.output_dir, "config.yml"), "w", encoding="utf-8"
+     ) as f:
+         yaml.safe_dump(dataclasses.asdict(config), f)
+
+     with open(
+         os.path.join(config.output_dir, "attacks_results.json"), "w"
+     ) as f:
+         json.dump(attack_results, f, indent=2)
+
+     print(f"Attack results saved to {config.output_dir}")
+
+
+ if __name__ == "__main__":
+     run_attacks(CLI(AttackConfig, as_positional=False))
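
For orientation: the module above is normally driven by the jsonargparse CLI at its foot. The sketch below drives run_attacks() programmatically instead; it is illustrative, not part of the package. The flag names mirror the AttackConfig fields the code reads, but the authoritative schema lives in wxo_agentic_evaluation/arg_configs.py, the auth values are placeholders, and jsonargparse will report any further required fields.

# Sketch only: field names inferred from run_attacks() above; credentials are
# placeholders, and any field not shown in the diff is an assumption.
from jsonargparse import CLI

from wxo_agentic_evaluation.arg_configs import AttackConfig
from wxo_agentic_evaluation.red_teaming.attack_runner import run_attacks

config = CLI(
    AttackConfig,
    as_positional=False,
    args=[
        "--output_dir", "attack_runs/demo",
        "--attack_paths", '["./attacks"]',  # a directory expands to dir/*.json
        "--auth_config.url", "https://example.orchestrate.ibm.com",
        "--auth_config.tenant_name", "my-tenant",
        "--auth_config.token", "<api-token>",
    ],
)
run_attacks(config)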
wxo_agentic_evaluation/referenceless_eval/__init__.py
@@ -0,0 +1,3 @@
+ from wxo_agentic_evaluation.referenceless_eval.referenceless_eval import (
+     ReferencelessEvaluation,
+ )
wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py
@@ -0,0 +1,28 @@
+ ### Metric name constants
+ ## General metrics
+ METRIC_GENERAL_HALLUCINATION_CHECK = "general_hallucination_check"
+ METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT = "general_value_format_alignment"
+
+ ## Function selection metrics
+ METRIC_FUNCTION_SELECTION_APPROPRIATENESS = "function_selection_appropriateness"
+ METRIC_AGENTIC_CONSTRAINTS_SATISFACTION = "agentic_constraints_satisfaction"
+
+ ## Parameter metrics
+ METRIC_PARAMETER_VALUE_FORMAT_ALIGNMENT = "parameter_value_format_alignment"
+ METRIC_PARAMETER_HALLUCINATION_CHECK = "parameter_hallucination_check"
+
+ ## Metric category mapping
+ GENERAL_METRICS = [
+     METRIC_GENERAL_HALLUCINATION_CHECK,
+     METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT,
+ ]
+
+ FUNCTION_SELECTION_METRICS = [
+     METRIC_FUNCTION_SELECTION_APPROPRIATENESS,
+     METRIC_AGENTIC_CONSTRAINTS_SATISFACTION,
+ ]
+
+ PARAMETER_METRICS = [
+     METRIC_PARAMETER_VALUE_FORMAT_ALIGNMENT,
+     METRIC_PARAMETER_HALLUCINATION_CHECK,
+ ]
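
The constants above double as a routing table. A small hypothetical helper (not part of the package) showing how the three category lists can map a metric name back to its group:

# Hypothetical helper built on the consts module above.
from wxo_agentic_evaluation.referenceless_eval.function_calling.consts import (
    FUNCTION_SELECTION_METRICS,
    GENERAL_METRICS,
    PARAMETER_METRICS,
)

def metric_category(name: str) -> str:
    """Map a metric-name constant to its category label."""
    if name in GENERAL_METRICS:
        return "general"
    if name in FUNCTION_SELECTION_METRICS:
        return "function_selection"
    if name in PARAMETER_METRICS:
        return "parameter"
    return "unknown"

print(metric_category("general_hallucination_check"))  # -> general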
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py
@@ -0,0 +1,29 @@
+ from abc import ABC
+
+ from wxo_agentic_evaluation.referenceless_eval.metrics.metric import Metric
+ from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import (
+     MetricPrompt,
+ )
+
+
+ class FunctionMetricsPrompt(MetricPrompt, ABC):
+     """
+     Abstract base for function-calling metric prompts.
+     Subclasses must define class attrs:
+       - system_template: str
+       - user_template: str
+     """
+
+     system_template: str
+     user_template: str
+
+     def __init__(self, metric: Metric, task_description: str) -> None:
+         super().__init__(
+             metric=metric,
+             system_template=self.system_template,
+             user_template=self.user_template,
+             system_kwargs_defaults={
+                 "task_description": task_description,
+                 "metric_jsonschema": metric.to_jsonschema(),
+             },
+         )
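
The contract here is small: a concrete prompt only supplies the two class attributes, and FunctionMetricsPrompt wires the task description and the metric's JSON schema into the system template. A hypothetical minimal subclass follows (GeneralMetricsPrompt in the next hunk is the package's real one):

# Hypothetical subclass sketch; the {{ ... }} placeholders are the variables
# FunctionMetricsPrompt injects via system_kwargs_defaults.
from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.base import (
    FunctionMetricsPrompt,
)

class MinimalMetricsPrompt(FunctionMetricsPrompt):
    system_template = "{{ task_description }}\n\nSchema:\n{{ metric_jsonschema }}"
    user_template = "Evaluate this tool call:\n{{ tool_call }}"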
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py
@@ -0,0 +1,49 @@
+ from typing import Any, Dict, List, Union
+
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.base import (
+     FunctionMetricsPrompt,
+ )
+
+ _general_system = (
+     "### Task Description and Role:\n\n"
+     "{{ task_description }}\n\n"
+     "Your output must conform to the following JSON schema, in the same order as the fields appear in the schema:\n"
+     "{{ metric_jsonschema }}"
+ )
+
+ _general_user: str = (
+     "Conversation context:\n"
+     "{{ conversation_context }}\n\n"
+     "Tool Specification:\n"
+     "{{ tool_inventory }}\n\n"
+     "Proposed tool call:\n"
+     "{{ tool_call }}\n\n"
+     "Return a JSON object as specified in the system prompt. You MUST keep the same order of fields in the JSON object as provided in the JSON schema and examples."
+ )
+
+
+ class GeneralMetricsPrompt(FunctionMetricsPrompt):
+     """Prompt builder for general tool-call semantic metrics."""
+
+     system_template = _general_system
+     user_template = _general_user
+
+
+ def get_general_metrics_prompt(
+     prompt: GeneralMetricsPrompt,
+     conversation_context: Union[str, List[Dict[str, str]]],
+     tool_inventory: List[Dict[str, Any]],
+     tool_call: Dict[str, Any],
+ ) -> List[Dict[str, str]]:
+     """
+     Build the messages for a general semantic evaluation.
+
+     Returns the list of chat messages (system -> [few-shot] -> user).
+     """
+     return prompt.build_messages(
+         user_kwargs={
+             "conversation_context": conversation_context,
+             "tool_inventory": tool_inventory,
+             "tool_call": tool_call,
+         }
+     )
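
Putting the two pieces together, a usage sketch. It assumes `metric` is a Metric instance obtained elsewhere (e.g. via metrics/loader.py, whose API is not shown in this diff); the conversation, inventory, and call payloads are illustrative only.

# Usage sketch: `metric` must be a real Metric instance for this to run.
from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_call.general import (
    GeneralMetricsPrompt,
    get_general_metrics_prompt,
)

metric = ...  # a Metric, e.g. loaded through metrics/loader.py (API not in this diff)
prompt = GeneralMetricsPrompt(metric=metric, task_description="Judge the proposed tool call.")
messages = get_general_metrics_prompt(
    prompt,
    conversation_context=[{"role": "user", "content": "Book me a flight to NYC."}],
    tool_inventory=[{"name": "book_flight", "parameters": {"destination": "string"}}],
    tool_call={"name": "book_flight", "arguments": {"destination": "NYC"}},
)
# `messages` is the system -> [few-shot] -> user chat list for the LLM judge.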