ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/analytics/tools/ux.py
@@ -1,19 +1,21 @@
-import rich
 import json
-from rich.layout import Layout
-from rich.table import Table
-from rich.panel import Panel
-from rich.align import Align
-from rich.console import Group
-from wxo_agentic_evaluation.type import Message, ContentType
-from typing import List, Dict, Optional
+from typing import Dict, List, Optional
+
+import rich
 from analytics.tools.types import (
-    ToolDefinitionRecommendation,
-    Priority,
     AgentRecommendation,
     AnalysisResults,
     ErrorPatterns,
+    Priority,
+    ToolDefinitionRecommendation,
 )
+from rich.align import Align
+from rich.console import Group
+from rich.layout import Layout
+from rich.panel import Panel
+from rich.table import Table
+
+from wxo_agentic_evaluation.type import ContentType, Message
 
 
 class ToolErrorDisplayManager:
@@ -24,7 +26,9 @@ class ToolErrorDisplayManager:
     )
 
     def __init__(
-        self, messages: List[Message], error_patterns: Optional[ErrorPatterns] = None
+        self,
+        messages: List[Message],
+        error_patterns: Optional[ErrorPatterns] = None,
     ):
         self.messages = messages
         self.error_patterns = error_patterns or ErrorPatterns()
@@ -44,7 +48,9 @@ class ToolErrorDisplayManager:
         }
 
         validation_error_codes = ["404", "not found", "client error"]
-        unhelpful_resp_threshold = ToolErrorDisplayManager.CHARACTER_THRESHOLD
+        unhelpful_resp_threshold = (
+            ToolErrorDisplayManager.CHARACTER_THRESHOLD
+        )
 
         for failure in failures:
             error_msg = str(failure.error_message).lower()
@@ -55,7 +61,9 @@ class ToolErrorDisplayManager:
             ):
                 failure_counts["unhelpful_responses"] += 1
 
-            if any(err_code in error_msg for err_code in validation_error_codes):
+            if any(
+                err_code in error_msg for err_code in validation_error_codes
+            ):
                 failure_counts["parameter_type_validation"] += 1
 
             if any(x in error_msg for x in ['"[', '{"', '"]', "}"]):
@@ -115,7 +123,9 @@ class ToolErrorDisplayManager:
         tool_def_recs_count = len(tool_def_recs)
 
         # Calculate accurate statistics from analyzed results
-        total_failed_tools = len(all_failures)  # unique tools that failed atleast once
+        total_failed_tools = len(
+            all_failures
+        )  # unique tools that failed atleast once
         total_failure_instances = sum(
             len(failures) for failures in all_failures.values()
         )  # individual failures across all tools, the same tool may have multiple failure instances
@@ -132,18 +142,25 @@ class ToolErrorDisplayManager:
         header_table = Table(show_header=False, box=None)
         header_table.add_row("📊 Test Case:", f"[bold]{base_name}[/bold]")
         header_table.add_row(
-            "🔧 Total Tools Used (unique):", str(len(self._get_all_tools(results)))
+            "🔧 Total Tools Used (unique):",
+            str(len(self._get_all_tools(results))),
+        )
+        header_table.add_row(
+            "❌ Failed Tools (unique):", str(total_failed_tools)
         )
-        header_table.add_row("❌ Failed Tools (unique):", str(total_failed_tools))
         header_table.add_row(
-            "🔥 Total Failure Instances (not unique):", str(total_failure_instances)
+            "🔥 Total Failure Instances (not unique):",
+            str(total_failure_instances),
+        )
+        header_table.add_row(
+            "🔄 Repeated Failures:", str(repeated_failure_tools)
         )
-        header_table.add_row("🔄 Repeated Failures:", str(repeated_failure_tools))
         header_table.add_row(
             "🔨 Tool Definition Recommendations:", str(tool_def_recs_count)
         )
         header_table.add_row(
-            "🤖 Agent Template Recommendations:", str(len(results.recommendations))
+            "🤖 Agent Template Recommendations:",
+            str(len(results.recommendations)),
         )
 
         header_panel = Panel(
@@ -152,8 +169,13 @@ class ToolErrorDisplayManager:
 
         layout = Layout()
         layout.split_row(
-            Layout(self._display_conversation(failed_tool_calls), name="conversation"),
-            Layout(self._create_detailed_analysis_panel(results), name="analysis"),
+            Layout(
+                self._display_conversation(failed_tool_calls),
+                name="conversation",
+            ),
+            Layout(
+                self._create_detailed_analysis_panel(results), name="analysis"
+            ),
         )
 
         rich.print(header_panel)
@@ -202,7 +224,9 @@ class ToolErrorDisplayManager:
             border_style="blue",
         )
 
-    def _create_detailed_analysis_panel(self, results: AnalysisResults) -> Panel:
+    def _create_detailed_analysis_panel(
+        self, results: AnalysisResults
+    ) -> Panel:
         """Creates the analysis panel."""
 
         content = []
@@ -213,7 +237,10 @@ class ToolErrorDisplayManager:
         error_table.add_column("Attempts", justify="center")
         error_table.add_column("Error Type", style="red")
 
-        for tool, failures in results.error_patterns.repeated_failures.items():
+        for (
+            tool,
+            failures,
+        ) in results.error_patterns.repeated_failures.items():
             # Use the analyzed error classification
             error_snippet = str(failures[-1].error_message)[:50] + "..."
             error_table.add_row(tool, str(len(failures)), error_snippet)
@@ -235,12 +262,16 @@ class ToolErrorDisplayManager:
         for category, issues in root_cause_data.items():
             if issues:
                 affected_tools = {issue.tool for issue in issues}
-                tools_str = ", ".join(list(affected_tools)[:3])  # Limit display
+                tools_str = ", ".join(
+                    list(affected_tools)[:3]
+                )  # Limit display
                 if len(affected_tools) > 3:
                     tools_str += f"... (+{len(affected_tools)-3} more)"
 
                 cause_table.add_row(
-                    category.replace("_", " ").title(), str(len(issues)), tools_str
+                    category.replace("_", " ").title(),
+                    str(len(issues)),
+                    tools_str,
                 )
 
         content.append(cause_table)
@@ -263,7 +294,9 @@ class ToolErrorDisplayManager:
         # Show all tools from failures
         for tool in results.error_patterns.all_failures.keys():
             if tool in tools_with_issues:
-                issue_count = len([r for r in tool_def_recs if r.tool == tool])
+                issue_count = len(
+                    [r for r in tool_def_recs if r.tool == tool]
+                )
                 tool_def_table.add_row(
                     tool, f"[red]❌ {issue_count} issue(s)[/red]"
                 )
@@ -319,12 +352,17 @@ class ToolErrorDisplayManager:
 
         # 2. Count total failed tool calls across all test cases
         total_failed_tool_calls = sum(
-            sum(len(failures) for failures in r.error_patterns.all_failures.values())
+            sum(
+                len(failures)
+                for failures in r.error_patterns.all_failures.values()
+            )
             for r in all_results.values()
         )
 
         # 3. Get total tool calls from stored data (we'll add this to results)
-        total_tool_calls = sum(r.total_tool_calls or 0 for r in all_results.values())
+        total_tool_calls = sum(
+            r.total_tool_calls or 0 for r in all_results.values()
+        )
 
         # 4. Calculate successful tool calls and success rate
         successful_tool_calls = total_tool_calls - total_failed_tool_calls
@@ -343,8 +381,12 @@ class ToolErrorDisplayManager:
         # Create failing test cases display
         failing_cases_text = ""
        if failing_test_cases:
-            failing_cases_text = "\n[bold red]📋 Failing Test Cases:[/bold red]\n"
-            for test_case, failed_tool_count in sorted(failing_test_cases.items()):
+            failing_cases_text = (
+                "\n[bold red]📋 Failing Test Cases:[/bold red]\n"
+            )
+            for test_case, failed_tool_count in sorted(
+                failing_test_cases.items()
+            ):
                 failing_cases_text += f" • [red]{test_case}[/red]: [bold]{failed_tool_count}[/bold] failing tool(s)\n"
         else:
             failing_cases_text = (
@@ -380,7 +422,9 @@ class ToolErrorDisplayManager:
         3. Update ground truth data where needed
         """  # disclaimer_text can be embedded here when recommendations are ready
 
-        rich.print(Panel(Align.center(summary_text), border_style="green", padding=1))
+        rich.print(
+            Panel(Align.center(summary_text), border_style="green", padding=1)
+        )
 
     def _prioritize_recommendations(
         self, recommendations: List[AgentRecommendation]