ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (33)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/METADATA +1 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/RECORD +33 -29
  3. wxo_agentic_evaluation/analyze_run.py +805 -344
  4. wxo_agentic_evaluation/arg_configs.py +10 -1
  5. wxo_agentic_evaluation/description_quality_checker.py +11 -2
  6. wxo_agentic_evaluation/evaluation_package.py +8 -3
  7. wxo_agentic_evaluation/inference_backend.py +46 -79
  8. wxo_agentic_evaluation/llm_matching.py +14 -2
  9. wxo_agentic_evaluation/main.py +1 -1
  10. wxo_agentic_evaluation/metrics/__init__.py +1 -0
  11. wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
  12. wxo_agentic_evaluation/metrics/metrics.py +43 -1
  13. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  14. wxo_agentic_evaluation/prompt/template_render.py +4 -2
  15. wxo_agentic_evaluation/quick_eval.py +7 -9
  16. wxo_agentic_evaluation/record_chat.py +2 -5
  17. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +139 -100
  18. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -34
  19. wxo_agentic_evaluation/red_teaming/attack_list.py +89 -18
  20. wxo_agentic_evaluation/red_teaming/attack_runner.py +51 -11
  21. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  22. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  23. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  24. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +77 -39
  25. wxo_agentic_evaluation/resource_map.py +3 -1
  26. wxo_agentic_evaluation/service_instance.py +7 -0
  27. wxo_agentic_evaluation/type.py +1 -1
  28. wxo_agentic_evaluation/utils/__init__.py +3 -0
  29. wxo_agentic_evaluation/utils/parsers.py +71 -0
  30. wxo_agentic_evaluation/utils/utils.py +131 -16
  31. wxo_agentic_evaluation/wxo_client.py +80 -0
  32. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/WHEEL +0 -0
  33. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/top_level.txt +0 -0
@@ -12,32 +12,36 @@ from rich.progress import Progress
 
 from wxo_agentic_evaluation.arg_configs import AttackConfig
 from wxo_agentic_evaluation.inference_backend import (
-    EvaluationController,
+    AttackEvaluationController,
     WXOInferenceBackend,
-    get_wxo_client,
 )
+from wxo_agentic_evaluation.wxo_client import get_wxo_client
 from wxo_agentic_evaluation.llm_user import LLMUser
 from wxo_agentic_evaluation.prompt.template_render import (
     LlamaUserTemplateRenderer,
 )
-from wxo_agentic_evaluation.red_teaming.attack_evaluator import AttackEvaluator
+from wxo_agentic_evaluation.red_teaming.attack_evaluator import AttackEvaluator, evaluate_all_attacks
 from wxo_agentic_evaluation.resource_map import ResourceMap
 from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.type import AttackData
 from wxo_agentic_evaluation.utils import json_dump
 
 
-def process_attack(task_n, attack_path, config, inference_backend, llm_user):
-    tc_name = os.path.basename(attack_path).replace(".json", "")
+def process_attack(task_n, attack_path, config, inference_backend, llm_user, resource_map):
+    attack_filename = os.path.basename(attack_path).replace(".json", "")
     with open(attack_path, "r") as f:
         attack: AttackData = AttackData.model_validate(json.load(f))
 
-    evaluation_controller = EvaluationController(
+    attack_evaluator = AttackEvaluator(config, resource_map, attack_filename)
+
+    evaluation_controller = AttackEvaluationController(
         wxo_inference_backend=inference_backend,
         llm_user=llm_user,
         config=config,
+        attack_data=attack,
+        attack_evaluator=attack_evaluator
     )
-    rich.print(f"[bold magenta]Running attack: {tc_name}[/bold magenta]")
+    rich.print(f"[bold magenta]Running attack: {attack_filename}[/bold magenta]")
     history, _, _ = evaluation_controller.run(
         task_n,
         attack.story,
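
The import swap and the new process_attack signature above move evaluation into the run itself: each attack file gets its own AttackEvaluator, which is injected into the new AttackEvaluationController so success can be flagged while the conversation is still in progress (the fallback added in the next hunk refers to this "early termination"). A minimal sketch of the injection pattern, using hypothetical stand-in classes, since the real constructors are only inferred from this hunk:

# Hypothetical stand-ins; the real AttackEvaluationController and
# AttackEvaluator internals are not visible in this diff.
class StubEvaluator:
    def __init__(self, config, resource_map, attack_filename):
        self.attack_filename = attack_filename

    def is_attack_successful(self, message: str) -> bool:
        # Placeholder check standing in for the real per-turn evaluation.
        return "leaked-secret" in message

class StubController:
    def __init__(self, attack_evaluator: StubEvaluator):
        self.attack_evaluator = attack_evaluator

    def run(self, turns: list[str]) -> list[str]:
        history = []
        for turn in turns:
            history.append(turn)
            if self.attack_evaluator.is_attack_successful(turn):
                break  # early termination once the evaluator flags success
        return history

controller = StubController(StubEvaluator(None, None, "attack_01"))
print(controller.run(["hello", "here is the leaked-secret", "never reached"]))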
@@ -50,10 +54,25 @@ def process_attack(task_n, attack_path, config, inference_backend, llm_user):
         result.append(message.model_dump())
 
     json_dump(
-        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+        os.path.join(config.output_dir, "messages", attack_filename + ".messages.json"),
         result,
     )
 
+    # Ensure there's a results dir and write a failure record if none was created
+    result_path = os.path.join(config.output_dir, "results", attack_filename + ".result.json")
+    if not os.path.exists(result_path):
+        # attack evaluator should have written a success on early termination; if not, mark as failed
+        json_dump(
+            result_path,
+            {
+                "attack_filename": attack_filename,
+                "success": False,
+                "attack_category": str(attack.attack_data.attack_category),
+                "attack_name": getattr(attack.attack_data, "attack_name", ""),
+                "attack_type": getattr(attack.attack_data, "attack_type", ""),
+            }
+        )
+
     return result
 
 
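
The block above guarantees that every attack leaves a record under results/: the evaluator is expected to write a success record when it terminates a run early, and if no record exists by the time process_attack finishes, the attack is marked as failed. A self-contained sketch of the same fallback, assuming the record schema shown in the diff (the helper name and example values are hypothetical):

import json
import os

def ensure_failure_record(output_dir: str, attack_filename: str, attack_meta: dict) -> None:
    # Mirrors the fallback in process_attack: only write a record if the
    # evaluator did not already record an outcome for this attack.
    result_path = os.path.join(output_dir, "results", attack_filename + ".result.json")
    if os.path.exists(result_path):
        return
    os.makedirs(os.path.dirname(result_path), exist_ok=True)
    record = {"attack_filename": attack_filename, "success": False, **attack_meta}
    with open(result_path, "w", encoding="utf-8") as f:
        json.dump(record, f, indent=2)

ensure_failure_record(
    "/tmp/red_team_out",
    "attack_01",
    {"attack_category": "prompt_injection", "attack_name": "", "attack_type": ""},
)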
@@ -80,7 +99,21 @@ def run_attacks(config: AttackConfig):
     print(
         f"Running red teaming attacks with tenant {config.auth_config.tenant_name}"
     )
-    os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)
+    for folder in ["messages", "results", "evaluations"]:
+        os.makedirs(os.path.join(config.output_dir, folder), exist_ok=True)
+
+    available_res = set()
+    if config.skip_available_results:
+        available_res = set(
+            [
+                os.path.basename(f).replace(".result", "")
+                for f in glob.glob(
+                    os.path.join(
+                        config.output_dir, "results", "*.result.json"
+                    )
+                )
+            ]
+        )
 
     results_list = []
     attack_paths = []
@@ -98,6 +131,13 @@ def run_attacks(config: AttackConfig):
             ):
                 continue
 
+            if config.skip_available_results:
+                if os.path.basename(attack_path) in available_res:
+                    print(
+                        f"Skipping attack {os.path.basename(attack_path)} as results already exist."
+                    )
+                    continue
+
             future = executor.submit(
                 process_attack,
                 task_n,
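
Together, the two hunks above add resume support: run_attacks now scans results/ for existing *.result.json files, and when config.skip_available_results is set it skips any attack whose record is already present. A naming detail makes the membership test line up: stripping ".result" from "attack_01.result.json" yields "attack_01.json", which matches os.path.basename(attack_path) exactly. A compact sketch of the same logic (the function name and paths are hypothetical):

import glob
import os

def pending_attacks(output_dir: str, attack_paths: list[str]) -> list[str]:
    # Basenames of attacks that already have a result record, e.g.
    # "attack_01.result.json" -> "attack_01.json".
    done = {
        os.path.basename(f).replace(".result", "")
        for f in glob.glob(os.path.join(output_dir, "results", "*.result.json"))
    }
    return [p for p in attack_paths if os.path.basename(p) not in done]

print(pending_attacks("/tmp/red_team_out", ["attacks/attack_01.json", "attacks/attack_02.json"]))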
@@ -105,6 +145,7 @@ def run_attacks(config: AttackConfig):
                 config,
                 inference_backend,
                 llm_user,
+                resource_map
             )
 
             futures.append((attack_path, future))
@@ -124,8 +165,7 @@ def run_attacks(config: AttackConfig):
             finally:
                 progress.update(task1, advance=1)
 
-    attack_evaluator = AttackEvaluator(config, resource_map)
-    attack_results = attack_evaluator.evaluate_attacks()
+    attack_results = evaluate_all_attacks(config, resource_map)
 
     with open(
         os.path.join(config.output_dir, "config.yml"), "w", encoding="utf-8"
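
The final hunk replaces the AttackEvaluator.evaluate_attacks method with a module-level evaluate_all_attacks(config, resource_map), consistent with per-attack evaluation now happening during the run. Its implementation is not shown in this diff; one plausible reading is that it aggregates the per-attack records from results/, as in this hypothetical sketch:

import glob
import json
import os

def aggregate_results(output_dir: str) -> dict:
    # Hypothetical aggregation over the per-attack *.result.json records;
    # the real evaluate_all_attacks lives in attack_evaluator.py.
    records = []
    for path in glob.glob(os.path.join(output_dir, "results", "*.result.json")):
        with open(path, "r", encoding="utf-8") as f:
            records.append(json.load(f))
    successes = sum(1 for r in records if r.get("success"))
    return {
        "total": len(records),
        "successful": successes,
        "success_rate": successes / len(records) if records else 0.0,
    }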