ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (61)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +8 -2
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation_package.py +114 -70
  14. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  15. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  16. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  17. wxo_agentic_evaluation/external_agent/types.py +12 -5
  18. wxo_agentic_evaluation/inference_backend.py +158 -73
  19. wxo_agentic_evaluation/llm_matching.py +4 -3
  20. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  21. wxo_agentic_evaluation/llm_user.py +7 -3
  22. wxo_agentic_evaluation/main.py +175 -67
  23. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  24. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  25. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  26. wxo_agentic_evaluation/quick_eval.py +49 -23
  27. wxo_agentic_evaluation/record_chat.py +70 -33
  28. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  29. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  30. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  38. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  39. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  40. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  41. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  42. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  43. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  44. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  45. wxo_agentic_evaluation/resource_map.py +2 -1
  46. wxo_agentic_evaluation/service_instance.py +24 -11
  47. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  48. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
  49. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  50. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  51. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  52. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  53. wxo_agentic_evaluation/tool_planner.py +128 -44
  54. wxo_agentic_evaluation/type.py +12 -9
  55. wxo_agentic_evaluation/utils/__init__.py +1 -0
  56. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  57. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  58. wxo_agentic_evaluation/utils/utils.py +83 -52
  59. ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA +0 -385
  60. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
  61. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0

wxo_agentic_evaluation/analytics/tools/ux.py

@@ -1,19 +1,21 @@
-import rich
 import json
-from rich.layout import Layout
-from rich.table import Table
-from rich.panel import Panel
-from rich.align import Align
-from rich.console import Group
-from wxo_agentic_evaluation.type import Message, ContentType
-from typing import List, Dict, Optional
+from typing import Dict, List, Optional
+
+import rich
 from analytics.tools.types import (
-    ToolDefinitionRecommendation,
-    Priority,
     AgentRecommendation,
     AnalysisResults,
     ErrorPatterns,
+    Priority,
+    ToolDefinitionRecommendation,
 )
+from rich.align import Align
+from rich.console import Group
+from rich.layout import Layout
+from rich.panel import Panel
+from rich.table import Table
+
+from wxo_agentic_evaluation.type import ContentType, Message
 
 
 class ToolErrorDisplayManager:
@@ -24,7 +26,9 @@ class ToolErrorDisplayManager:
     )
 
     def __init__(
-        self, messages: List[Message], error_patterns: Optional[ErrorPatterns] = None
+        self,
+        messages: List[Message],
+        error_patterns: Optional[ErrorPatterns] = None,
     ):
         self.messages = messages
         self.error_patterns = error_patterns or ErrorPatterns()
@@ -44,7 +48,9 @@ class ToolErrorDisplayManager:
         }
 
         validation_error_codes = ["404", "not found", "client error"]
-        unhelpful_resp_threshold = ToolErrorDisplayManager.CHARACTER_THRESHOLD
+        unhelpful_resp_threshold = (
+            ToolErrorDisplayManager.CHARACTER_THRESHOLD
+        )
 
         for failure in failures:
             error_msg = str(failure.error_message).lower()
@@ -55,7 +61,9 @@ class ToolErrorDisplayManager:
             ):
                 failure_counts["unhelpful_responses"] += 1
 
-            if any(err_code in error_msg for err_code in validation_error_codes):
+            if any(
+                err_code in error_msg for err_code in validation_error_codes
+            ):
                 failure_counts["parameter_type_validation"] += 1
 
             if any(x in error_msg for x in ['"[', '{"', '"]', "}"]):
@@ -115,7 +123,9 @@ class ToolErrorDisplayManager:
         tool_def_recs_count = len(tool_def_recs)
 
         # Calculate accurate statistics from analyzed results
-        total_failed_tools = len(all_failures)  # unique tools that failed atleast once
+        total_failed_tools = len(
+            all_failures
+        )  # unique tools that failed atleast once
         total_failure_instances = sum(
             len(failures) for failures in all_failures.values()
         )  # individual failures across all tools, the same tool may have multiple failure instances
@@ -132,18 +142,25 @@ class ToolErrorDisplayManager:
         header_table = Table(show_header=False, box=None)
         header_table.add_row("📊 Test Case:", f"[bold]{base_name}[/bold]")
         header_table.add_row(
-            "🔧 Total Tools Used (unique):", str(len(self._get_all_tools(results)))
+            "🔧 Total Tools Used (unique):",
+            str(len(self._get_all_tools(results))),
+        )
+        header_table.add_row(
+            "❌ Failed Tools (unique):", str(total_failed_tools)
         )
-        header_table.add_row("❌ Failed Tools (unique):", str(total_failed_tools))
         header_table.add_row(
-            "🔥 Total Failure Instances (not unique):", str(total_failure_instances)
+            "🔥 Total Failure Instances (not unique):",
+            str(total_failure_instances),
+        )
+        header_table.add_row(
+            "🔄 Repeated Failures:", str(repeated_failure_tools)
         )
-        header_table.add_row("🔄 Repeated Failures:", str(repeated_failure_tools))
         header_table.add_row(
             "🔨 Tool Definition Recommendations:", str(tool_def_recs_count)
         )
         header_table.add_row(
-            "🤖 Agent Template Recommendations:", str(len(results.recommendations))
+            "🤖 Agent Template Recommendations:",
+            str(len(results.recommendations)),
         )
 
         header_panel = Panel(
@@ -152,8 +169,13 @@ class ToolErrorDisplayManager:
 
         layout = Layout()
         layout.split_row(
-            Layout(self._display_conversation(failed_tool_calls), name="conversation"),
-            Layout(self._create_detailed_analysis_panel(results), name="analysis"),
+            Layout(
+                self._display_conversation(failed_tool_calls),
+                name="conversation",
+            ),
+            Layout(
+                self._create_detailed_analysis_panel(results), name="analysis"
+            ),
         )
 
         rich.print(header_panel)
@@ -202,7 +224,9 @@ class ToolErrorDisplayManager:
             border_style="blue",
         )
 
-    def _create_detailed_analysis_panel(self, results: AnalysisResults) -> Panel:
+    def _create_detailed_analysis_panel(
+        self, results: AnalysisResults
+    ) -> Panel:
         """Creates the analysis panel."""
 
         content = []
@@ -213,7 +237,10 @@ class ToolErrorDisplayManager:
         error_table.add_column("Attempts", justify="center")
         error_table.add_column("Error Type", style="red")
 
-        for tool, failures in results.error_patterns.repeated_failures.items():
+        for (
+            tool,
+            failures,
+        ) in results.error_patterns.repeated_failures.items():
             # Use the analyzed error classification
             error_snippet = str(failures[-1].error_message)[:50] + "..."
             error_table.add_row(tool, str(len(failures)), error_snippet)
@@ -235,12 +262,16 @@ class ToolErrorDisplayManager:
         for category, issues in root_cause_data.items():
             if issues:
                 affected_tools = {issue.tool for issue in issues}
-                tools_str = ", ".join(list(affected_tools)[:3])  # Limit display
+                tools_str = ", ".join(
+                    list(affected_tools)[:3]
+                )  # Limit display
                 if len(affected_tools) > 3:
                     tools_str += f"... (+{len(affected_tools)-3} more)"
 
                 cause_table.add_row(
-                    category.replace("_", " ").title(), str(len(issues)), tools_str
+                    category.replace("_", " ").title(),
+                    str(len(issues)),
+                    tools_str,
                 )
 
         content.append(cause_table)
@@ -263,7 +294,9 @@ class ToolErrorDisplayManager:
         # Show all tools from failures
         for tool in results.error_patterns.all_failures.keys():
             if tool in tools_with_issues:
-                issue_count = len([r for r in tool_def_recs if r.tool == tool])
+                issue_count = len(
+                    [r for r in tool_def_recs if r.tool == tool]
+                )
                 tool_def_table.add_row(
                     tool, f"[red]❌ {issue_count} issue(s)[/red]"
                 )
@@ -319,12 +352,17 @@ class ToolErrorDisplayManager:
 
         # 2. Count total failed tool calls across all test cases
         total_failed_tool_calls = sum(
-            sum(len(failures) for failures in r.error_patterns.all_failures.values())
+            sum(
+                len(failures)
+                for failures in r.error_patterns.all_failures.values()
+            )
             for r in all_results.values()
        )
 
         # 3. Get total tool calls from stored data (we'll add this to results)
-        total_tool_calls = sum(r.total_tool_calls or 0 for r in all_results.values())
+        total_tool_calls = sum(
+            r.total_tool_calls or 0 for r in all_results.values()
+        )
 
         # 4. Calculate successful tool calls and success rate
         successful_tool_calls = total_tool_calls - total_failed_tool_calls
@@ -343,8 +381,12 @@ class ToolErrorDisplayManager:
         # Create failing test cases display
         failing_cases_text = ""
         if failing_test_cases:
-            failing_cases_text = "\n[bold red]📋 Failing Test Cases:[/bold red]\n"
-            for test_case, failed_tool_count in sorted(failing_test_cases.items()):
+            failing_cases_text = (
+                "\n[bold red]📋 Failing Test Cases:[/bold red]\n"
+            )
+            for test_case, failed_tool_count in sorted(
+                failing_test_cases.items()
+            ):
                 failing_cases_text += f" • [red]{test_case}[/red]: [bold]{failed_tool_count}[/bold] failing tool(s)\n"
         else:
             failing_cases_text = (
@@ -380,7 +422,9 @@ class ToolErrorDisplayManager:
         3. Update ground truth data where needed
         """  # disclaimer_text can be embedded here when recommendations are ready
 
-        rich.print(Panel(Align.center(summary_text), border_style="green", padding=1))
+        rich.print(
+            Panel(Align.center(summary_text), border_style="green", padding=1)
+        )
 
     def _prioritize_recommendations(
         self, recommendations: List[AgentRecommendation]

wxo_agentic_evaluation/analyze_run.py

@@ -1,36 +1,37 @@
+import csv
 import json
 import os
-import csv
-from jsonargparse import CLI
 from pathlib import Path
-from typing import List, Dict, Set, Optional
+from typing import Dict, List, Optional, Set
 
-from rich.text import Text
-from rich.table import Table
-from rich.panel import Panel
+from jsonargparse import CLI
 from rich.console import Group
+from rich.panel import Panel
 from rich.style import Style
+from rich.table import Table
+from rich.text import Text
 
-from wxo_agentic_evaluation.type import ExtendedMessage, ContentType, ToolDefinition
-from wxo_agentic_evaluation.metrics.metrics import ToolCallAndRoutingMetrics
 from wxo_agentic_evaluation.arg_configs import AnalyzeConfig
 from wxo_agentic_evaluation.description_quality_checker import (
     DescriptionQualityInspector,
 )
+from wxo_agentic_evaluation.metrics.metrics import ToolCallAndRoutingMetrics
+from wxo_agentic_evaluation.type import (
+    ContentType,
+    ExtendedMessage,
+    ToolDefinition,
+)
 from wxo_agentic_evaluation.utils.rich_utils import (
-    pretty_print,
-    warn,
+    IncorrectParameterUtils,
     is_ok,
+    pretty_print,
     print_done,
-    IncorrectParameterUtils,
-)
-from wxo_agentic_evaluation.utils.utils import (
-    add_line_seperator,
+    warn,
 )
+from wxo_agentic_evaluation.utils.utils import add_line_seperator
 
 
 class Analyzer:
-
     def __init__(self):
         self.analysis_cache: Dict[str, List[Text]] = (
             {}
@@ -44,8 +45,10 @@ class Analyzer:
             blink=True,
             bold=True,
         )
-
-    def _split_cache(self, failing_tools: Set[str]) -> tuple[List[str], List[Text]]:
+
+    def _split_cache(
+        self, failing_tools: Set[str]
+    ) -> tuple[List[str], List[Text]]:
 
         tools_to_analyze: List[str] = []
         cached_lines: List[Text] = []
@@ -65,11 +68,7 @@ class Analyzer:
                 style="bold cyan",
             )
 
-        return (
-            tools_to_analyze,
-            cached_lines
-        )
-
+        return (tools_to_analyze, cached_lines)
 
     def analyze_failing_tool_description_quality(
         self,
@@ -98,9 +97,11 @@ class Analyzer:
         # Step 2: analyze cache misses
         if tools_to_analyze:
 
-            failing_tool_definitions: List[ToolDefinition] = inspector.extract_tool_desc_from_tool_source(
-                Path(tool_definition_path),
-                tools_to_analyze,
+            failing_tool_definitions: List[ToolDefinition] = (
+                inspector.extract_tool_desc_from_tool_source(
+                    Path(tool_definition_path),
+                    tools_to_analyze,
+                )
             )
 
             if not failing_tool_definitions:
@@ -110,7 +111,7 @@ class Analyzer:
                     )
                 )
                 return analysis_for_display
-
+
             missing_tools = self._get_tools_not_found_in_source(
                 tools_to_analyze, failing_tool_definitions
             )
@@ -134,7 +135,9 @@ class Analyzer:
 
         return analysis_for_display
 
-    def render(self, data: List[ExtendedMessage], tool_definition_path: Optional[str]) -> Group:
+    def render(
+        self, data: List[ExtendedMessage], tool_definition_path: Optional[str]
+    ) -> Group:
         """
         Render the conversation history and analysis results.
         :param data: List of ExtendedMessage objects containing the conversation history.
@@ -151,7 +154,10 @@ class Analyzer:
             content = msg.content
             reason = entry.reason
             tool_name = None
-            if msg.type == ContentType.tool_call or msg.type == ContentType.tool_response:
+            if (
+                msg.type == ContentType.tool_call
+                or msg.type == ContentType.tool_response
+            ):
                 tool_name = json.loads(msg.content)["name"]
 
             if role == "user":
@@ -159,7 +165,7 @@ class Analyzer:
             elif role == "assistant" and msg.type == ContentType.tool_call:
                 if reason:
                     label = "❌ Tool Call"
-
+
                     if reason.get("reason") == "incorrect parameter":
                         failing_tools.append(
                             tool_name
@@ -199,8 +205,8 @@ class Analyzer:
             border_style="blue",
         )
         reason_panel = Panel(
-            Text().join(reason_lines),
-            title="Analysis Results",
+            Text().join(reason_lines),
+            title="Analysis Results",
             border_style="red",
         )
 
@@ -218,7 +224,9 @@ class Analyzer:
         def get_summary(summary_file_name: str = "summary_metrics.csv"):
             summary = []
 
-            path_to_summary_file = os.path.join(config.data_path, summary_file_name)
+            path_to_summary_file = os.path.join(
+                config.data_path, summary_file_name
+            )
 
             with open(path_to_summary_file, "r") as f:
                 reader = csv.reader(f)
@@ -232,7 +240,9 @@ class Analyzer:
             test_messages = []
 
             test_case_path = os.path.join(
-                config.data_path, "messages", f"{test_case_name}.messages.analyze.json"
+                config.data_path,
+                "messages",
+                f"{test_case_name}.messages.analyze.json",
             )
 
             with open(test_case_path, "r", encoding="utf-8") as f:
@@ -265,7 +275,8 @@ class Analyzer:
             header_table.add_row("No Tool Call Error found!")
 
         panel = Panel(
-            header_table, title="[bold green]📋 Analysis Summary[/bold green]"
+            header_table,
+            title="[bold green]📋 Analysis Summary[/bold green]",
         )
 
         pretty_print(panel)
@@ -279,21 +290,23 @@ class Analyzer:
                 test_case_name=test_case_name
            )
 
-            header_panel = self._create_header_analysis_panel(test_case_name, metrics)
+            header_panel = self._create_header_analysis_panel(
+                test_case_name, metrics
+            )
             pretty_print(header_panel)
 
-            tool_definition_path = config.tool_definition_path \
-                if config.tool_definition_path else None
-
+            tool_definition_path = (
+                config.tool_definition_path
+                if config.tool_definition_path
+                else None
+            )
+
             rendered_content = self.render(
-                data=test_messages,
-                tool_definition_path=tool_definition_path
-            )
+                data=test_messages, tool_definition_path=tool_definition_path
+            )
             pretty_print(rendered_content)
 
-            add_line_seperator(
-                self._generate_style_config()
-            )
+            add_line_seperator(self._generate_style_config())
 
     def _create_header_analysis_panel(
         self, test_case_name: str, metrics: ToolCallAndRoutingMetrics
@@ -301,8 +314,12 @@ class Analyzer:
         header_table = Table(show_header=False, box=None)
 
         header_table.add_row(f"Test Case Name: {test_case_name}")
-        header_table.add_row(f"Expected Tool Calls: {metrics.expected_tool_calls}")
-        header_table.add_row(f"Correct Tool Calls: {metrics.correct_tool_calls}")
+        header_table.add_row(
+            f"Expected Tool Calls: {metrics.expected_tool_calls}"
+        )
+        header_table.add_row(
+            f"Correct Tool Calls: {metrics.correct_tool_calls}"
+        )
         header_table.add_row(f"Text Match: {metrics.text_match.value}")
         header_table.add_row(f"Journey Success: {metrics.is_success}")
 
@@ -359,7 +376,8 @@ class Analyzer:
         if tool_desc is None:
             tool_analysis.extend(
                 IncorrectParameterUtils.format_missing_description_message(
-                    tool_name=tool_name, tool_definition_path=tool_definition_path
+                    tool_name=tool_name,
+                    tool_definition_path=tool_definition_path,
                 )
             )
             return tool_analysis
@@ -375,10 +393,13 @@ class Analyzer:
 
         # good description
         tool_analysis.append(
-            is_ok(message=f"The description for the `{tool_name}` looks sufficient.")
+            is_ok(
+                message=f"The description for the `{tool_name}` looks sufficient."
+            )
        )
         return tool_analysis
 
+
 if __name__ == "__main__":
     dummy_analyzer = Analyzer()
     dummy_analyzer.analyze(CLI(AnalyzeConfig, as_positional=False))

wxo_agentic_evaluation/annotate.py

@@ -1,10 +1,12 @@
-from wxo_agentic_evaluation.type import Message, EvaluationData
-from wxo_agentic_evaluation.arg_configs import TestCaseGenerationConfig
-from wxo_agentic_evaluation.data_annotator import DataAnnotator
 import json
+import os
 from pprint import pprint
+
 from jsonargparse import CLI
-import os
+
+from wxo_agentic_evaluation.arg_configs import TestCaseGenerationConfig
+from wxo_agentic_evaluation.data_annotator import DataAnnotator
+from wxo_agentic_evaluation.type import EvaluationData, Message
 
 
 def main(config: TestCaseGenerationConfig):

wxo_agentic_evaluation/arg_configs.py

@@ -1,11 +1,16 @@
 import os
 from dataclasses import dataclass, field
 from typing import List, Optional, Union
+
 from wxo_agentic_evaluation import __file__
 
 root_dir = os.path.dirname(__file__)
-LLAMA_USER_PROMPT_PATH = os.path.join(root_dir, "prompt", "llama_user_prompt.jinja2")
-KEYWORDS_GENERATION_PROMPT_PATH = os.path.join(root_dir, "prompt", "keywords_generation_prompt.jinja2")
+LLAMA_USER_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "llama_user_prompt.jinja2"
+)
+KEYWORDS_GENERATION_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "keywords_generation_prompt.jinja2"
+)
 
 
 @dataclass
@@ -104,6 +109,7 @@ class ChatRecordingConfig:
 class QuickEvalConfig(TestConfig):
     tools_path: str = None
 
+
 @dataclass
 class BatchAnnotateConfig:
     allowed_tools: List[str]