ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (61)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +8 -2
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation_package.py +114 -70
  14. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  15. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  16. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  17. wxo_agentic_evaluation/external_agent/types.py +12 -5
  18. wxo_agentic_evaluation/inference_backend.py +158 -73
  19. wxo_agentic_evaluation/llm_matching.py +4 -3
  20. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  21. wxo_agentic_evaluation/llm_user.py +7 -3
  22. wxo_agentic_evaluation/main.py +175 -67
  23. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  24. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  25. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  26. wxo_agentic_evaluation/quick_eval.py +49 -23
  27. wxo_agentic_evaluation/record_chat.py +70 -33
  28. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  29. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  30. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  38. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  39. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  40. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  41. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  42. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  43. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  44. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  45. wxo_agentic_evaluation/resource_map.py +2 -1
  46. wxo_agentic_evaluation/service_instance.py +24 -11
  47. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  48. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
  49. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  50. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  51. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  52. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  53. wxo_agentic_evaluation/tool_planner.py +128 -44
  54. wxo_agentic_evaluation/type.py +12 -9
  55. wxo_agentic_evaluation/utils/__init__.py +1 -0
  56. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  57. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  58. wxo_agentic_evaluation/utils/utils.py +83 -52
  59. ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
  60. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
  61. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/batch_annotate.py +78 -25

@@ -1,22 +1,28 @@
-import json
 import ast
 import csv
+import json
 import os
 from pathlib import Path
+
 from jsonargparse import CLI
 
-from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.prompt.template_render import BatchTestCaseGeneratorTemplateRenderer
-from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
 from wxo_agentic_evaluation import __file__
+from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
+from wxo_agentic_evaluation.prompt.template_render import (
+    BatchTestCaseGeneratorTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
 
 root_dir = os.path.dirname(__file__)
-BATCH_TEST_CASE_GENERATOR_PROMPT_PATH = os.path.join(root_dir, "prompt", "batch_testcase_prompt.jinja2")
+BATCH_TEST_CASE_GENERATOR_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "batch_testcase_prompt.jinja2"
+)
 EXAMPLE_PATH = os.path.join(root_dir, "prompt", "examples", "data_simple.json")
 
 
-def parse_tools_with_filter(agent_name: str, tools_path: Path, allowed_tool_names: list[str]) -> tuple[
-    dict, list[dict]]:
+def parse_tools_with_filter(
+    agent_name: str, tools_path: Path, allowed_tool_names: list[str]
+) -> tuple[dict, list[dict]]:
     if not allowed_tool_names:
         raise ValueError("Allowed tool list cannot be empty.")
 
@@ -29,7 +35,9 @@ def parse_tools_with_filter(agent_name: str, tools_path: Path, allowed_tool_name
     elif tools_path.is_dir():
         files_to_parse.extend(tools_path.glob("**/*.py"))
     else:
-        raise ValueError(f"Tools path {tools_path} is neither a file nor directory")
+        raise ValueError(
+            f"Tools path {tools_path} is neither a file nor directory"
+        )
 
     for file_path in files_to_parse:
         try:
@@ -41,21 +49,29 @@ def parse_tools_with_filter(agent_name: str, tools_path: Path, allowed_tool_name
             # Process only module-level functions
             for node in parsed_code.body:
                 if isinstance(node, ast.FunctionDef):
-                    tool_data.append({
-                        "Function Name": node.name,
-                        "Arguments": [arg.arg for arg in node.args.args],
-                        "Docstring": ast.get_docstring(node)
-                    })
+                    tool_data.append(
+                        {
+                            "Function Name": node.name,
+                            "Arguments": [arg.arg for arg in node.args.args],
+                            "Docstring": ast.get_docstring(node),
+                        }
+                    )
 
         except Exception as e:
            print(f"Warning: Failed to parse {file_path}: {str(e)}")
            continue
 
     # Filter tools based on allowed names
-    filtered_tools = [tool for tool in tool_data if tool["Function Name"] in allowed_tool_names]
+    filtered_tools = [
+        tool
+        for tool in tool_data
+        if tool["Function Name"] in allowed_tool_names
+    ]
 
     if not filtered_tools:
-        print(f"Warning: No matching tools found. Available tools: {[t['Function Name'] for t in tool_data]}")
+        print(
+            f"Warning: No matching tools found. Available tools: {[t['Function Name'] for t in tool_data]}"
+        )
 
     return {"name": agent_name}, filtered_tools
 
@@ -75,8 +91,17 @@ def load_example(example_path: Path):
 
 
 # Step 4: Prompt builder for N test cases from a given story
-def build_prompt_for_story(agent, tools, tool_inputs, example_case: dict, story: str, num_variants: int = 2):
-    renderer = BatchTestCaseGeneratorTemplateRenderer(BATCH_TEST_CASE_GENERATOR_PROMPT_PATH)
+def build_prompt_for_story(
+    agent,
+    tools,
+    tool_inputs,
+    example_case: dict,
+    story: str,
+    num_variants: int = 2,
+):
+    renderer = BatchTestCaseGeneratorTemplateRenderer(
+        BATCH_TEST_CASE_GENERATOR_PROMPT_PATH
+    )
 
     tool_blocks = "\n".join(
         f"- Tool: {t['Function Name']}\n Description: {t['Docstring']}\n Args: {', '.join(t['Arguments']) or 'None'}"
@@ -93,13 +118,23 @@ def build_prompt_for_story(agent, tools, tool_inputs, example_case: dict, story:
     )
     return prompt
 
+
 # Step 5: Send prompt to LLM and save test cases
-def generate_multiple_in_one(prompt, output_dir, starting_index, model_id="meta-llama/llama-3-405b-instruct", ):
+def generate_multiple_in_one(
+    prompt,
+    output_dir,
+    starting_index,
+    model_id="meta-llama/llama-3-405b-instruct",
+):
     output_dir.mkdir(parents=True, exist_ok=True)
 
     provider = get_provider(
         model_id=model_id,
-        params={"min_new_tokens": 50, "decoding_method": "greedy", "max_new_tokens": 3000},
+        params={
+            "min_new_tokens": 50,
+            "decoding_method": "greedy",
+            "max_new_tokens": 3000,
+        },
     )
 
     response = provider.query(prompt)
@@ -124,8 +159,19 @@ def generate_multiple_in_one(prompt, output_dir, starting_index, model_id="meta-
         print("Raw text:\n", raw_text)
         print("Error:", str(e))
 
-def generate_test_cases_from_stories(agent_name: str, stories: list[str], tools_path: Path, snapshot_path: Path, output_dir: Path, allowed_tools: list[str], num_variants: int = 2):
-    agent, tools = parse_tools_with_filter(agent_name, tools_path, allowed_tools)
+
+def generate_test_cases_from_stories(
+    agent_name: str,
+    stories: list[str],
+    tools_path: Path,
+    snapshot_path: Path,
+    output_dir: Path,
+    allowed_tools: list[str],
+    num_variants: int = 2,
+):
+    agent, tools = parse_tools_with_filter(
+        agent_name, tools_path, allowed_tools
+    )
     tool_inputs = extract_inputs_from_snapshot(snapshot_path)
     example_json = load_example(Path(EXAMPLE_PATH))
 
@@ -134,23 +180,29 @@ def generate_test_cases_from_stories(agent_name: str, stories: list[str], tools_
         print(f"\n Generating test cases for story {idx}: {story}")
 
         prompt = build_prompt_for_story(
-            agent, tools, tool_inputs, example_json, story, num_variants=num_variants
+            agent,
+            tools,
+            tool_inputs,
+            example_json,
+            story,
+            num_variants=num_variants,
         )
 
         generate_multiple_in_one(
            prompt=prompt,
            output_dir=output_dir,
-           starting_index=test_case_counter
+           starting_index=test_case_counter,
        )
 
         test_case_counter += num_variants
 
+
 def main(config: BatchAnnotateConfig):
     stories_path = Path(config.stories_path)
 
     stories = []
     agent_name = None
-    with stories_path.open("r", encoding="utf-8", newline='') as f:
+    with stories_path.open("r", encoding="utf-8", newline="") as f:
         csv_reader = csv.DictReader(f)
         for row in csv_reader:
             stories.append(row["story"])
@@ -168,8 +220,9 @@ def main(config: BatchAnnotateConfig):
         snapshot_path,
         output_dir,
         config.allowed_tools,
-        num_variants=config.num_variants
+        num_variants=config.num_variants,
     )
 
+
 if __name__ == "__main__":
     main(CLI(BatchAnnotateConfig, as_positional=False))
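
For orientation, the expanded signature of generate_test_cases_from_stories diffed above can be driven roughly as follows. This is a minimal sketch based only on the signatures visible in these hunks; the agent name, paths, story text, and tool names are placeholder assumptions, not values shipped with the package.

# Hypothetical driver for the batch test-case generation flow diffed above.
# All concrete values below (agent name, paths, tool names) are placeholders.
from pathlib import Path

from wxo_agentic_evaluation.batch_annotate import generate_test_cases_from_stories

generate_test_cases_from_stories(
    agent_name="hr_agent",
    stories=["An employee asks how many vacation days they have left."],
    tools_path=Path("tools/"),                      # a .py file or a directory of .py tools
    snapshot_path=Path("snapshots/hr_agent.json"),
    output_dir=Path("generated_test_cases/"),
    allowed_tools=["get_vacation_balance"],         # must be non-empty, see parse_tools_with_filter
    num_variants=2,
)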
wxo_agentic_evaluation/data_annotator.py +18 -13

@@ -1,16 +1,16 @@
-from wxo_agentic_evaluation.type import Message, EvaluationData
-from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
-from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.prompt.template_render import (
-    LlamaKeywordsGenerationTemplateRenderer,
-)
-from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
-
 import ast
-import json
 import collections
+import json
 from typing import Dict, List, Optional
 
+from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaKeywordsGenerationTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.type import EvaluationData, Message
+
 ERROR_KEYWORDS = [
     "error",
     "erroneous",
@@ -143,7 +143,9 @@ class DataAnnotator:
         )
         return wrong_tool_response_id
 
-    def _process_tool_call_order(self, wrong_tool_response_id: list[str]) -> list[str]:
+    def _process_tool_call_order(
+        self, wrong_tool_response_id: list[str]
+    ) -> list[str]:
         """Process and order tool calls, skipping failed ones"""
         # gather all call ids that actually got a response
         valid_call_ids = {
@@ -230,7 +232,11 @@ class DataAnnotator:
             if message.role == "assistant":
                 provider = get_provider(
                     model_id=self.keywords_generation_config.model_id,
-                    params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 256},
+                    params={
+                        "min_new_tokens": 0,
+                        "decoding_method": "greedy",
+                        "max_new_tokens": 256,
+                    },
                 )
                 kw_generator = KeywordsGenerationLLM(
                     provider=provider,
@@ -247,14 +253,13 @@ class DataAnnotator:
                }
                goal_details.append(summarize_step)
                break
-
+
         if previous is None:
             goals["summarize"] = []
         elif summarize_step is None:
             goals[previous] = []
         else:
             goals[previous] = ["summarize"]
-
 
     def generate(self) -> Dict:
         """Generate the final dataset"""
wxo_agentic_evaluation/description_quality_checker.py +20 -14

@@ -1,15 +1,18 @@
 import os
+from enum import Enum
 from pathlib import Path
 from typing import List
+
 import rich
-from enum import Enum
 
+from wxo_agentic_evaluation.prompt.template_render import (
+    BadToolDescriptionRenderer,
+)
 from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.prompt.template_render import BadToolDescriptionRenderer
 from wxo_agentic_evaluation.tool_planner import (
-    parse_json_string,
-    extract_tool_signatures,
     MISSING_DOCSTRING_PROMPT,
+    extract_tool_signatures,
+    parse_json_string,
 )
 from wxo_agentic_evaluation.type import ToolDefinition
 from wxo_agentic_evaluation.utils.utils import safe_divide
@@ -34,11 +37,11 @@ class ToolDescriptionIssue(Enum):
 
 
 class DescriptionQualityInspector:
-    DEFAULT_CLASSIFICATION_THRESHOLD = (
-        40.0  # 2/5 issues detected. A higher score indicates a worse description.
-    )
+    DEFAULT_CLASSIFICATION_THRESHOLD = 40.0  # 2/5 issues detected. A higher score indicates a worse description.
     CLASSIFICATION_SCORE_THRESHOLD = float(
-        os.getenv("CLASSIFICATION_SCORE_THRESHOLD", DEFAULT_CLASSIFICATION_THRESHOLD)
+        os.getenv(
+            "CLASSIFICATION_SCORE_THRESHOLD", DEFAULT_CLASSIFICATION_THRESHOLD
+        )
     )
 
     LLM_MODEL_ID = "meta-llama/llama-3-2-90b-vision-instruct"
@@ -67,9 +70,7 @@ class DescriptionQualityInspector:
         self.template = BadToolDescriptionRenderer(
             self.BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH
         )
-        self.cached_response = (
-            None  # this is used in the unit-tests for nuanced analysis of the response.
-        )
+        self.cached_response = None  # this is used in the unit-tests for nuanced analysis of the response.
 
     @staticmethod
     def extract_tool_desc_from_tool_source(
@@ -96,7 +97,8 @@ class DescriptionQualityInspector:
                 tool_name=tool_name,
                 tool_description=(
                     tool_data["Docstring"]
-                    if tool_data["Docstring"] != MISSING_DOCSTRING_PROMPT
+                    if tool_data["Docstring"]
+                    != MISSING_DOCSTRING_PROMPT
                     else None
                 ),
                 tool_params=tool_data["Arguments"],
@@ -131,7 +133,9 @@ class DescriptionQualityInspector:
            return False  # likely some unexpected parsing issue, in this case - flags description as good.
 
        # calculate weighted score
-       final_description_score = self._calculate_score(response_data=response_data)
+       final_description_score = self._calculate_score(
+           response_data=response_data
+       )
 
        return final_description_score >= self.CLASSIFICATION_SCORE_THRESHOLD
 
@@ -146,4 +150,6 @@
            for issue in ToolDescriptionIssue
            if response_data.get(issue.value, "FALSE").upper() == "TRUE"
        )
-       return safe_divide(detected_issues, self.WORST_POSSIBLE_EVAL_OUTCOME) * 100
+       return (
+           safe_divide(detected_issues, self.WORST_POSSIBLE_EVAL_OUTCOME) * 100
+       )
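
The reformatted return in _calculate_score keeps the same arithmetic: the number of detected issues divided by the worst possible outcome, scaled to a percentage, and compared against CLASSIFICATION_SCORE_THRESHOLD (40.0 by default, i.e. 2 of 5 issues). The sketch below reproduces that scoring in isolation; WORST_POSSIBLE_EVAL_OUTCOME = 5 and the zero-denominator behaviour of safe_divide are assumptions inferred from the "2/5 issues" comment, not values read from this diff.

# Standalone illustration of the description-quality scoring shown above.
# WORST_POSSIBLE_EVAL_OUTCOME and safe_divide's zero-denominator behaviour
# are assumptions made for this example.
WORST_POSSIBLE_EVAL_OUTCOME = 5
CLASSIFICATION_SCORE_THRESHOLD = 40.0  # 2/5 issues detected


def safe_divide(numerator: float, denominator: float) -> float:
    return numerator / denominator if denominator else 0.0


def is_bad_description(detected_issues: int) -> bool:
    score = safe_divide(detected_issues, WORST_POSSIBLE_EVAL_OUTCOME) * 100
    return score >= CLASSIFICATION_SCORE_THRESHOLD


print(is_bad_description(2))  # True: 40.0 >= 40.0
print(is_bad_description(1))  # False: 20.0 < 40.0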