ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,178 @@
+ import ast
+ import re
+ from pathlib import Path
+ from typing import Any, List, Mapping, Union
+
+
+ class PythonTypeToJsonType:
+     OPTIONAL_PARAM_EXTRACT = re.compile(r"[Oo]ptional\[(\w+)\]")
+
+     @staticmethod
+     def python_to_json_type(python_annotation: str) -> str:
+         if not python_annotation:
+             return "string"
+         python_annotation = python_annotation.lower().strip()
+         if "str" == python_annotation:
+             return "string"
+         if "int" == python_annotation:
+             return "integer"
+         if "float" == python_annotation:
+             return "number"
+         if "bool" == python_annotation:
+             return "boolean"
+         if python_annotation.startswith("list"):
+             return "array"
+         if python_annotation.startswith("dict"):
+             return "object"
+         if python_annotation.startswith("optional"):
+             # extract the type within Optional[T]; guard against nested
+             # annotations (e.g. Optional[List[str]]) that the regex cannot
+             # match, instead of raising AttributeError on a failed search
+             match = PythonTypeToJsonType.OPTIONAL_PARAM_EXTRACT.search(
+                 python_annotation
+             )
+             if match:
+                 return PythonTypeToJsonType.python_to_json_type(
+                     match.group(1)
+                 )
+
+         return "string"
+
+
+ class ToolExtractionOpenAIFormat:
+     @staticmethod
+     def get_default_arguments(node):
+         """Returns the names of the default arguments (if any).
+
+         The default values are stored in the args.defaults array.
+         Since, in Python, default arguments may only come after positional
+         arguments, we can index the argument array starting from the last
+         `n` arguments, where `n` is the length of the defaults array.
+
+         ex.
+         def add(a, b=5):
+             pass
+
+         Then we have,
+         args = [a, b]
+         defaults = [Constant(value=5)]
+
+         args[-len(defaults):] = [b]
+
+         FunctionDef(
+             name='add',
+             args=arguments(
+                 posonlyargs=[],
+                 args=[arg(arg='a'), arg(arg='b')],
+                 kwonlyargs=[],
+                 kw_defaults=[],
+                 defaults=[Constant(value=5)]),
+             body=[Return(value=BinOp(left=Name(id='a', ctx=Load()), op=Add(),
+                 right=Name(id='b', ctx=Load())))],
+             decorator_list=[], type_params=[])
+         """
+         default_arguments = set()
+         num_defaults = len(node.args.defaults)
+         if num_defaults > 0:
+             for arg in node.args.args[-num_defaults:]:
+                 # store the argument *name*, so membership checks against
+                 # the strings in parameters["required"] work
+                 default_arguments.add(arg.arg)
+
+         return default_arguments
+
+     @staticmethod
+     def from_file(tools_path: Union[str, Path]) -> List[Mapping[str, Any]]:
+         """Parses the top-level functions in a Python file and converts each
+         signature to the OpenAI function-calling format
+
+         ```
+         function_spec = {
+             "type": "function",
+             "function": {
+                 "name": func_name,
+                 "description": description,
+                 "parameters": parameters,
+             },
+         }
+         ```
+
+         """
+         tool_data = []
+         tools_path = Path(tools_path)
+
+         with tools_path.open("r", encoding="utf-8") as f:
+             code = f.read()
+
+         try:
+             parsed_code = ast.parse(code)
+             for node in parsed_code.body:
+                 if isinstance(node, ast.FunctionDef):
+                     parameters = {
+                         "type": "object",
+                         "properties": {},
+                         "required": [],
+                     }
+                     function_name = node.name
+                     for arg in node.args.args:
+                         type_annotation = None
+                         if arg.arg == "self":
+                             continue
+                         if arg.annotation:
+                             type_annotation = ast.unparse(arg.annotation)
+
+                         parameter_type = (
+                             PythonTypeToJsonType.python_to_json_type(
+                                 type_annotation
+                             )
+                         )
+                         parameters["properties"][arg.arg] = {
+                             "type": parameter_type,
+                             "description": "",  # todo
+                         }
+
+                         if (
+                             type_annotation
+                             and "Optional" not in type_annotation
+                         ):
+                             parameters["required"].append(arg.arg)
+
+                     default_arguments = (
+                         ToolExtractionOpenAIFormat.get_default_arguments(node)
+                     )
+                     # arguments with default values are not required;
+                     # iterate over a copy so removal does not skip entries
+                     for arg_name in list(parameters["required"]):
+                         if arg_name in default_arguments:
+                             parameters["required"].remove(arg_name)
+
+                     open_ai_format_fn = {
+                         "type": "function",
+                         "function": {
+                             "name": function_name,
+                             "parameters": parameters,
+                             "description": ast.get_docstring(
+                                 node
+                             ),  # fix (does not do :params)
+                         },
+                     }
+                     tool_data.append(open_ai_format_fn)
+
+         except Exception as e:
+             print(f"Warning: Failed to parse {tools_path}: {str(e)}")
+
+         return tool_data
+
+     @staticmethod
+     def from_path(tools_path: Union[str, Path]) -> List[Mapping[str, Any]]:
+         tools_path = Path(tools_path)
+         files_to_parse = []
+         all_tools = []
+
+         if tools_path.is_file():
+             files_to_parse.append(tools_path)
+         elif tools_path.is_dir():
+             files_to_parse.extend(tools_path.glob("**/*.py"))
+         else:
+             raise ValueError(
+                 f"Tools path {tools_path} is neither a file nor a directory"
+             )
+
+         for file_path in files_to_parse:
+             all_tools.extend(ToolExtractionOpenAIFormat.from_file(file_path))
+
+         return all_tools
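
A minimal usage sketch for the extractor above (the import path is taken from the file list; the sample file and function are invented for illustration). Calling `from_path` on a directory instead walks every `.py` file under it and concatenates the extracted specs:

from pathlib import Path
import json

from wxo_agentic_evaluation.utils.open_ai_tool_extractor import (
    ToolExtractionOpenAIFormat,
)

# hypothetical tool file, written out just for this demo
sample = Path("demo_tool.py")
sample.write_text(
    "def get_weather(city: str, units: str = 'metric'):\n"
    '    """Return the current weather for a city."""\n'
)

specs = ToolExtractionOpenAIFormat.from_path(sample)
print(json.dumps(specs, indent=2))
# one spec named "get_weather": "city" stays in "required", while
# "units" is dropped from "required" because it has a default value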
@@ -0,0 +1,71 @@
+ from typing import Any, List, Mapping, Optional
+
+ from wxo_agentic_evaluation.metrics.metrics import (
+     Annotation,
+     FailedSemanticTestCases,
+     FailedStaticTestCases,
+ )
+
+
+ class ReferencelessEvalParser:
+     @staticmethod
+     def static_parser(
+         static_metrics: Mapping[str, Mapping[str, Any]],
+     ) -> List[FailedStaticTestCases]:
+         """
+         Collects the static metrics that did not pass validation.
+         """
+
+         failed_test_cases = []
+
+         for metric, metric_data in static_metrics.items():
+             if not metric_data.get("valid", False):
+                 fail = FailedStaticTestCases(
+                     metric_name=metric,
+                     description=metric_data.get("description"),
+                     explanation=metric_data.get("explanation"),
+                 )
+
+                 failed_test_cases.append(fail)
+
+         return failed_test_cases
+
+     @staticmethod
+     def parse_annotations(
+         actionable_recommendations: List[Mapping[str, Any]],
+         filters: List[str],
+     ) -> Optional[List[Annotation]]:
+         annotations = [
+             Annotation(
+                 parameter_name=rec.get("parameter_name"),
+                 recommendation=rec.get("recommendation"),
+                 details=rec.get("details"),
+                 quote=rec.get("quote"),
+             )
+             # guard against a missing recommendation list
+             for rec in (actionable_recommendations or [])
+             if rec.get("recommendation") in filters
+         ]
+
+         return annotations if annotations else None
+
+     @staticmethod
+     def semantic_parser(
+         metric_name, data, annotation_filters: Optional[List[str]]
+     ):
+         semantic_metric = FailedSemanticTestCases(
+             metric_name=metric_name,
+             evidence=data.get("evidence"),
+             explanation=data.get("explanation"),
+             output=data.get("output"),
+             confidence=data.get("confidence"),
+         )
+
+         if annotation_filters and (
+             annotations := ReferencelessEvalParser.parse_annotations(
+                 data.get("actionable_recommendations"), annotation_filters
+             )
+         ):
+             semantic_metric.annotations = annotations
+
+         return semantic_metric
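
A rough illustration of the input shape `static_parser` expects (the metric names and payload below are invented, not taken from the package); only entries whose `valid` flag is falsy are turned into `FailedStaticTestCases`:

from wxo_agentic_evaluation.utils.parsers import ReferencelessEvalParser

# invented static-check results, keyed by metric name
static_metrics = {
    "schema_check": {"valid": True},
    "required_params_present": {
        "valid": False,
        "description": "All required parameters are supplied",
        "explanation": "Parameter 'city' is missing from the tool call",
    },
}

failures = ReferencelessEvalParser.static_parser(static_metrics)
for failure in failures:
    print(failure.metric_name, "-", failure.explanation)
# only "required_params_present" is reported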
@@ -0,0 +1,188 @@
+ from typing import Any, List, Optional
+
+ import rich
+ from rich.text import Text
+
+
+ def pretty_print(content: Any, style: Optional[str] = None):
+     """
+     Utility function for stylized prints.
+     Please refer to https://rich.readthedocs.io/en/stable/appendix/colors.html for valid `style` strings.
+     NOTE:
+     Rich allows for nested [style][/style] tags within a string.
+     This utility only applies an outermost style wrapper using the passed `style` (ONLY for a string `content`).
+
+     :param content: The content to be printed.
+     :param style: A valid `rich` colour.
+     """
+     if isinstance(content, str) and style:
+         rich.print(f"[{style}]{content}[/{style}]")
+     else:
+         rich.print(content)
+
+
+ def warn(
+     message: str,
+     style: Optional[str] = "bold yellow",
+     prompt: Optional[str] = "WARNING ⚠️ :",
+ ) -> Text:
+     """Utility function for formatting a warning message."""
+     return Text(f"{prompt}{message}\n\n", style=style)
+
+
+ def is_ok(
+     message: str,
+     style: Optional[str] = "bold green",
+     prompt: Optional[str] = "OK ✅ :",
+ ) -> Text:
+     """Utility function for formatting an OK message."""
+     return Text(f"{prompt}{message}\n\n", style=style)
+
+
+ def print_done(
+     prompt: Optional[str] = "Done ✅", style: Optional[str] = "bold cyan"
+ ):
+     """
+     Prints a prompt indicating completion of a process/routine.
+     :param prompt: default is `"Done ✅"`.
+     :param style: The style for the text (default is bold cyan).
+     """
+     pretty_print(content=prompt, style=style)
+
+
+ def print_success(
+     message: str,
+     style: Optional[str] = "bold green",
+     prompt: Optional[str] = "✅ PASSED",
+ ):
+     """
+     Prints a success message.
+     :param message: A statement that is printed alongside a PASSED outcome.
+     :param style: The style for the text (default is bold green).
+     :param prompt: The prompt to display before the message (default is "✅ PASSED").
+     """
+     pretty_print(content=f"{prompt} - {message}", style=style)
+
+
+ def print_failure(
+     message: str,
+     style: Optional[str] = "bold red",
+     prompt: Optional[str] = "❌ FAILED",
+ ):
+     """
+     Prints a failure message.
+     :param message: A statement that is printed alongside a FAILED outcome.
+     :param style: The style for the text (default is bold red).
+     :param prompt: The prompt to display before the message (default is "❌ FAILED").
+     """
+     pretty_print(content=f"{prompt} - {message}", style=style)
+
+
+ class IncorrectParameterUtils:
+     """
+     Utility functions for handling warning and suggestion messages related to bad parameters in tool descriptions.
+     These are primarily used for providing feedback on incorrect parameter usage by the assistant in `analyze_run`.
+     """
+
+     @staticmethod
+     def suggest(message: str, style: Optional[str] = "green") -> Text:
+         """
+         Formats a suggestion message for improving agent behaviour related to bad parameter usage.
+         :param message: The suggestion message to display.
+         :param style: The style for the text (default is green).
+         :return: A rich Text object styled as a suggestion.
+         """
+         return Text(
+             f"💡 {message}\n✅ A good description conveys the tool's purpose and clarifies parameter usage to the assistant.\n\n",
+             style=style,
+         )
+
+     @staticmethod
+     def format_missing_description_message(
+         tool_definition_path: str, tool_name: str
+     ) -> List[Text]:
+         return [
+             warn(
+                 f"Tool description for '{tool_name}' not found in file: '{tool_definition_path}'"
+             ),
+             IncorrectParameterUtils.suggest(
+                 f"Please consider adding a description for '{tool_name}'."
+             ),
+         ]
+
+     @staticmethod
+     def format_bad_description_message(
+         tool_name: str, tool_desc: str
+     ) -> List[Text]:
+         return [
+             warn(
+                 f"Tool description for '{tool_name}' may be incomplete or unclear: '{tool_desc.strip()}'."
+             ),
+             IncorrectParameterUtils.suggest(
+                 f"Please consider making the description for '{tool_name}' more informative about parameter usage."
+             ),
+         ]
+
+
+ class TestingUtils:
+     """
+     Provides a collection of formatted messages that can be used in testing workflows.
+     """
+
+     @staticmethod
+     def print_test_header(
+         test_case_count: int,
+         test_description: str,
+         style: Optional[str] = "bold cyan",
+         prompt: Optional[str] = "\n⚙️ Testing",
+     ):
+         """
+         Prints a formatted test suite header.
+         :param test_case_count: The number of test cases.
+         :param test_description: A short statement explaining what is being examined.
+         For example, this can be read as: `"{\n⚙️ Testing} {20} {good tool descriptions}"`.
+         """
+         pretty_print(
+             content=f"{prompt} {test_case_count} {test_description}",
+             style=style,
+         )
+
+     @staticmethod
+     def print_error_details(
+         expected: List[str],
+         detected: List[str],
+         style: Optional[str] = "bold red",
+     ):
+         """
+         Prints detailed error information.
+         An error in this context can be an assertion mismatch;
+         use this function to display the delta.
+         :param expected: The expected outcome.
+         :param detected: The actual/observed outcome.
+         """
+         pretty_print(content=f" Expected: {expected}", style=style)
+         pretty_print(content=f" Detected: {detected}", style=style)
+
+     @staticmethod
+     def print_failure_summary(
+         failed_cases: List[str],
+         prompt: Optional[str] = "Failed cases",
+         style: Optional[str] = "bold red",
+     ):
+         """
+         Prints a summary of all failures, listing the specific cases that failed the test.
+         :param failed_cases: List of failed case names; this list is iterated over to print all failures.
+         :param style: The style for the text (default is bold red).
+         """
+         if failed_cases:
+             pretty_print(
+                 content=f"{prompt} ({len(failed_cases)}):", style=style
+             )
+             for case in failed_cases:
+                 pretty_print(content=f" - {case}", style=style)
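
A brief usage sketch for the helpers above (the import path is taken from the file list; the messages are invented). Note that `warn` and `is_ok` return `rich.text.Text` objects, so they must be passed to a printer rather than printed implicitly:

import rich

from wxo_agentic_evaluation.utils.rich_utils import (
    TestingUtils,
    print_failure,
    print_success,
    warn,
)

TestingUtils.print_test_header(2, "good tool descriptions")
print_success("description for 'get_weather' is informative")
print_failure("description for 'book_flight' is missing")
rich.print(warn("tool 'book_flight' has no description"))
TestingUtils.print_failure_summary(["book_flight"])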
@@ -0,0 +1,23 @@
+ def lcs_length(x, y):
+     """Compute the length of the Longest Common Subsequence (LCS)."""
+     m, n = len(x), len(y)
+     dp = [[0] * (n + 1) for _ in range(m + 1)]
+     for i in range(m):
+         for j in range(n):
+             if x[i] == y[j]:
+                 dp[i + 1][j + 1] = dp[i][j] + 1
+             else:
+                 dp[i + 1][j + 1] = max(dp[i][j + 1], dp[i + 1][j])
+     return dp[m][n]
+
+
+ def rouge_l_recall(prediction, reference):
+     """Compute ROUGE-L recall (LCS length / reference length). No stemming."""
+     pred_tokens = prediction.split()
+     ref_tokens = reference.split()
+
+     if len(ref_tokens) == 0:
+         return 0.0
+
+     lcs = lcs_length(pred_tokens, ref_tokens)
+     # recall divides by the reference length; dividing by the prediction
+     # length would give ROUGE-L precision instead
+     return lcs / len(ref_tokens)
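
A quick worked example (the sentences are invented): the prediction's three tokens all appear, in order, in the six-token reference, so the LCS length is 3 and ROUGE-L recall is 3 / 6 = 0.5:

from wxo_agentic_evaluation.utils.rouge_score import rouge_l_recall

prediction = "the cat sat"
reference = "the cat sat on the mat"

# LCS = ["the", "cat", "sat"] -> length 3; reference length = 6
print(rouge_l_recall(prediction, reference))  # 0.5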