ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Note: this release of ibm-watsonx-orchestrate-evaluation-framework has been flagged as potentially problematic.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
- wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
- wxo_agentic_evaluation/analytics/tools/main.py +18 -7
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +69 -48
- wxo_agentic_evaluation/annotate.py +6 -4
- wxo_agentic_evaluation/arg_configs.py +8 -2
- wxo_agentic_evaluation/batch_annotate.py +78 -25
- wxo_agentic_evaluation/data_annotator.py +18 -13
- wxo_agentic_evaluation/description_quality_checker.py +20 -14
- wxo_agentic_evaluation/evaluation_package.py +114 -70
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
- wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
- wxo_agentic_evaluation/external_agent/types.py +12 -5
- wxo_agentic_evaluation/inference_backend.py +158 -73
- wxo_agentic_evaluation/llm_matching.py +4 -3
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_user.py +7 -3
- wxo_agentic_evaluation/main.py +175 -67
- wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
- wxo_agentic_evaluation/metrics/metrics.py +26 -12
- wxo_agentic_evaluation/prompt/template_render.py +32 -11
- wxo_agentic_evaluation/quick_eval.py +49 -23
- wxo_agentic_evaluation/record_chat.py +70 -33
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
- wxo_agentic_evaluation/resource_map.py +2 -1
- wxo_agentic_evaluation/service_instance.py +24 -11
- wxo_agentic_evaluation/service_provider/__init__.py +33 -13
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
- wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
- wxo_agentic_evaluation/service_provider/provider.py +0 -1
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
- wxo_agentic_evaluation/tool_planner.py +128 -44
- wxo_agentic_evaluation/type.py +12 -9
- wxo_agentic_evaluation/utils/__init__.py +1 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
- wxo_agentic_evaluation/utils/rich_utils.py +23 -9
- wxo_agentic_evaluation/utils/utils.py +83 -52
- ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA +0 -385
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/tool_planner.py
CHANGED

@@ -1,26 +1,35 @@
-import json
 import ast
 import csv
-from pathlib import Path
 import importlib.util
-import
-from jsonargparse import CLI
+import json
 import os
+import re
 import sys
 import textwrap
-from dataclasses import
+from dataclasses import asdict, is_dataclass
+from pathlib import Path
+
+from jsonargparse import CLI
 
-from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
-from wxo_agentic_evaluation.prompt.template_render import ToolPlannerTemplateRenderer, ArgsExtractorTemplateRenderer
 from wxo_agentic_evaluation import __file__
+from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
+from wxo_agentic_evaluation.prompt.template_render import (
+    ArgsExtractorTemplateRenderer,
+    ToolPlannerTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
 
 root_dir = os.path.dirname(__file__)
-TOOL_PLANNER_PROMPT_PATH = os.path.join(
-
+TOOL_PLANNER_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "tool_planner.jinja2"
+)
+ARGS_EXTRACTOR_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "args_extractor_prompt.jinja2"
+)
 
 MISSING_DOCSTRING_PROMPT = "No description available"
 
+
 class UniversalEncoder(json.JSONEncoder):
     def default(self, obj):
         if is_dataclass(obj):
@@ -29,12 +38,15 @@ class UniversalEncoder(json.JSONEncoder):
             return obj.__dict__
         return super().default(obj)
 
+
 def extract_first_json_list(raw: str) -> list:
     matches = re.findall(r"\[\s*{.*?}\s*]", raw, re.DOTALL)
     for match in matches:
         try:
             parsed = json.loads(match)
-            if isinstance(parsed, list) and all(
+            if isinstance(parsed, list) and all(
+                "tool_name" in step for step in parsed
+            ):
                 return parsed
         except Exception:
             continue
@@ -42,6 +54,7 @@ def extract_first_json_list(raw: str) -> list:
     print(raw)
     return []
 
+
 def parse_json_string(input_string):
     json_char_count = 0
     json_objects = []
@@ -79,12 +92,16 @@ def load_tools_module(tools_path: Path) -> dict:
     elif tools_path.is_dir():
         files_to_parse.extend(tools_path.glob("**/*.py"))
     else:
-        raise ValueError(
+        raise ValueError(
+            f"Tools path {tools_path} is neither a file nor directory"
+        )
 
     for file_path in files_to_parse:
         try:
             module_name = file_path.stem
-            spec = importlib.util.spec_from_file_location(
+            spec = importlib.util.spec_from_file_location(
+                module_name, file_path
+            )
             module = importlib.util.module_from_spec(spec)
             parent_dir = str(file_path.parent)
             sys_path_modified = False
@@ -99,7 +116,7 @@ def load_tools_module(tools_path: Path) -> dict:
             # Add all module's non-private functions to tools_dict
             for attr_name in dir(module):
                 attr = getattr(module, attr_name)
-                if callable(attr) and not attr_name.startswith(
+                if callable(attr) and not attr_name.startswith("_"):
                     tools_dict[attr_name] = attr
         except Exception as e:
             print(f"Warning: Failed to load {file_path}: {str(e)}")
@@ -117,7 +134,9 @@ def extract_tool_signatures(tools_path: Path) -> list:
     elif tools_path.is_dir():
         files_to_parse.extend(tools_path.glob("**/*.py"))
     else:
-        raise ValueError(
+        raise ValueError(
+            f"Tools path {tools_path} is neither a file nor directory"
+        )
 
     for file_path in files_to_parse:
         try:
@@ -128,19 +147,24 @@ def extract_tool_signatures(tools_path: Path) -> list:
             for node in parsed_code.body:
                 if isinstance(node, ast.FunctionDef):
                     name = node.name
-                    args = [
+                    args = [
+                        arg.arg for arg in node.args.args if arg.arg != "self"
+                    ]
                     docstring = ast.get_docstring(node)
-                    tool_data.append(
-
-
-
-
+                    tool_data.append(
+                        {
+                            "Function Name": name,
+                            "Arguments": args,
+                            "Docstring": docstring or MISSING_DOCSTRING_PROMPT,
+                        }
+                    )
         except Exception as e:
             print(f"Warning: Failed to parse {file_path}: {str(e)}")
             continue
 
     return tool_data
 
+
 def extract_tool_signatures_for_prompt(tools_path: Path) -> dict[str, str]:
     functions = {}
     files_to_parse = []
@@ -151,7 +175,9 @@ def extract_tool_signatures_for_prompt(tools_path: Path) -> dict[str, str]:
     elif tools_path.is_dir():
         files_to_parse.extend(tools_path.glob("**/*.py"))
     else:
-        raise ValueError(
+        raise ValueError(
+            f"Tools path {tools_path} is neither a file nor directory"
+        )
 
     for file_path in files_to_parse:
         try:
@@ -168,23 +194,35 @@ def extract_tool_signatures_for_prompt(tools_path: Path) -> dict[str, str]:
                     for arg in node.args.args:
                         if arg.arg == "self":
                             continue
-                        annotation =
+                        annotation = (
+                            ast.unparse(arg.annotation)
+                            if arg.annotation
+                            else "Any"
+                        )
                         args.append((arg.arg, annotation))
 
                     # Get return type
-                    returns =
+                    returns = (
+                        ast.unparse(node.returns) if node.returns else "None"
+                    )
 
                     # Get docstring
                     docstring = ast.get_docstring(node)
-                    docstring =
+                    docstring = (
+                        textwrap.dedent(docstring).strip() if docstring else ""
+                    )
 
                     # Format parameter descriptions if available in docstring
                    doc_lines = docstring.splitlines()
                     doc_summary = doc_lines[0] if doc_lines else ""
-                    param_descriptions = "\n".join(
+                    param_descriptions = "\n".join(
+                        [line for line in doc_lines[1:] if ":param" in line]
+                    )
 
                     # Compose the final string
-                    args_str = ", ".join(
+                    args_str = ", ".join(
+                        f"{arg}: {type_}" for arg, type_ in args
+                    )
                     function_str = f"""def {name}({args_str}) -> {returns}:
     {doc_summary}"""
                     if param_descriptions:
@@ -197,9 +235,18 @@ def extract_tool_signatures_for_prompt(tools_path: Path) -> dict[str, str]:
 
     return functions
 
-
+
+def ensure_data_available(
+    step: dict,
+    inputs: dict,
+    snapshot: dict,
+    tools_module: dict,
+    tool_signatures_for_prompt,
+) -> dict:
     tool_name = step["tool_name"]
-    cache = snapshot.setdefault("input_output_examples", {}).setdefault(
+    cache = snapshot.setdefault("input_output_examples", {}).setdefault(
+        tool_name, []
+    )
     for entry in cache:
         if entry["inputs"] == inputs:
             return entry["output"]
@@ -212,7 +259,11 @@ def ensure_data_available(step: dict, inputs: dict, snapshot: dict, tools_module
     except:
         provider = get_provider(
             model_id="meta-llama/llama-3-405b-instruct",
-            params={
+            params={
+                "min_new_tokens": 0,
+                "decoding_method": "greedy",
+                "max_new_tokens": 500,
+            },
         )
         renderer = ArgsExtractorTemplateRenderer(ARGS_EXTRACTOR_PROMPT_PATH)
 
@@ -226,14 +277,19 @@ def ensure_data_available(step: dict, inputs: dict, snapshot: dict, tools_module
     try:
         output = tools_module[json_obj["tool_name"]](**json_obj["inputs"])
     except:
-        raise ValueError(
+        raise ValueError(
+            f"Failed to execute tool '{tool_name}' with inputs {inputs}"
+        )
 
     cache.append({"inputs": inputs, "output": output})
     if not isinstance(output, dict):
         print(f" Tool {tool_name} returned non-dict output: {output}")
     return output
 
-
+
+def plan_tool_calls_with_llm(
+    story: str, agent_name: str, tool_signatures_str: str, provider
+) -> list:
 
     renderer = ToolPlannerTemplateRenderer(TOOL_PLANNER_PROMPT_PATH)
 
@@ -250,7 +306,9 @@ def plan_tool_calls_with_llm(story: str, agent_name: str, tool_signatures_str: s
 
 
 # --- Tool Execution Logic ---
-def run_tool_chain(
+def run_tool_chain(
+    tool_plan: list, snapshot: dict, tools_module, tool_signatures_for_prompt
+) -> None:
     memory = {}
 
     for step in tool_plan:
@@ -280,7 +338,9 @@ def run_tool_chain(tool_plan: list, snapshot: dict, tools_module, tool_signature
 
         if list_keys:
             if len(list_keys) > 1:
-                raise ValueError(
+                raise ValueError(
+                    f"Tool '{name}' received multiple list inputs. Only one supported for now."
+                )
             list_key = list_keys[0]
             value_list = resolved_inputs[list_key]
 
@@ -289,20 +349,36 @@ def run_tool_chain(tool_plan: list, snapshot: dict, tools_module, tool_signature
                 item_inputs = resolved_inputs.copy()
                 item_inputs[list_key] = val
                 print(f" ⚙️ Running {name} with {list_key} = {val}")
-                output = ensure_data_available(
+                output = ensure_data_available(
+                    step,
+                    item_inputs,
+                    snapshot,
+                    tools_module,
+                    tool_signatures_for_prompt,
+                )
                 results.append(output)
                 memory[f"{name}_{idx}"] = output
 
             memory[name] = results
-            print(
+            print(
+                f"Stored {len(results)} outputs under '{name}' and indexed as '{name}_i'"
+            )
         else:
-            output = ensure_data_available(
+            output = ensure_data_available(
+                step,
+                resolved_inputs,
+                snapshot,
+                tools_module,
+                tool_signatures_for_prompt,
+            )
             memory[name] = output
             print(f"Stored output under tool name: {name} = {output}")
 
 
 # --- Main Snapshot Builder ---
-def build_snapshot(
+def build_snapshot(
+    agent_name: str, tools_path: Path, stories: list, output_path: Path
+):
     agent = {"name": agent_name}
     tools_module = load_tools_module(tools_path)
     tool_signatures = extract_tool_signatures(tools_path)
@@ -310,20 +386,28 @@ def build_snapshot(agent_name: str, tools_path: Path, stories: list, output_path
 
     provider = get_provider(
         model_id="meta-llama/llama-3-405b-instruct",
-        params={
+        params={
+            "min_new_tokens": 1,
+            "decoding_method": "greedy",
+            "max_new_tokens": 2048,
+        },
     )
 
     snapshot = {
         "agent": agent,
         "tools": tool_signatures,
-        "input_output_examples": {}
+        "input_output_examples": {},
     }
 
     for story in stories:
         print(f"\n📘 Planning tool calls for story: {story}")
-        tool_plan = plan_tool_calls_with_llm(
+        tool_plan = plan_tool_calls_with_llm(
+            story, agent["name"], tool_signatures, provider
+        )
         try:
-            run_tool_chain(
+            run_tool_chain(
+                tool_plan, snapshot, tools_module, tool_signatures_for_prompt
+            )
         except ValueError as e:
             print(f"❌ Error running tool chain for story '{story}': {e}")
             continue
@@ -340,7 +424,7 @@ if __name__ == "__main__":
 
     stories = []
     agent_name = None
-    with stories_path.open("r", encoding="utf-8", newline=
+    with stories_path.open("r", encoding="utf-8", newline="") as f:
        csv_reader = csv.DictReader(f)
         for row in csv_reader:
             stories.append(row["story"])
@@ -349,4 +433,4 @@ if __name__ == "__main__":
 
     snapshot_path = stories_path.parent / f"{agent_name}_snapshot_llm.json"
 
-    build_snapshot(agent_name, tools_path, stories, snapshot_path)
+    build_snapshot(agent_name, tools_path, stories, snapshot_path)
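Most of the tool_planner.py changes above are formatting reflow, but the planner's JSON-recovery path deserves a concrete illustration: extract_first_json_list regex-scans raw LLM output for the first bracketed list whose every object carries a "tool_name" key, skipping candidates that fail to parse. A minimal standalone sketch of that behavior (the raw_reply sample below is invented for illustration, not taken from the package):

    import json
    import re


    def extract_first_json_list(raw: str) -> list:
        # Non-greedy scan for any "[ { ... } ]" span, across newlines.
        matches = re.findall(r"\[\s*{.*?}\s*]", raw, re.DOTALL)
        for match in matches:
            try:
                parsed = json.loads(match)
                # Accept only a list of plan steps that each name a tool.
                if isinstance(parsed, list) and all(
                    "tool_name" in step for step in parsed
                ):
                    return parsed
            except Exception:
                continue
        return []


    # Invented sample of a chatty LLM reply that wraps a JSON plan.
    raw_reply = 'Plan: [{"tool_name": "get_weather", "inputs": {"city": "Austin"}}] Done.'
    print(extract_first_json_list(raw_reply))
    # [{'tool_name': 'get_weather', 'inputs': {'city': 'Austin'}}]

The non-greedy match matters: it lets the function pull the first well-formed plan out of a reply that mixes prose and JSON instead of failing on the surrounding text.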
wxo_agentic_evaluation/type.py
CHANGED

@@ -1,10 +1,7 @@
-from typing import Dict, List, Union, Any, Optional
-from pydantic import (
-    BaseModel,
-    ConfigDict,
-    Field
-)
 from enum import StrEnum
+from typing import Any, Dict, List, Optional, Union
+
+from pydantic import BaseModel, ConfigDict, Field
 from rich.text import Text
 
 
@@ -61,9 +58,13 @@ class ConversationalConfidenceThresholdScore(BaseModel):
     def table(self):
         return {
             "response_confidence": str(self.response_confidence),
-            "response_confidence_threshold": str(
+            "response_confidence_threshold": str(
+                self.response_confidence_threshold
+            ),
             "retrieval_confidence": str(self.retrieval_confidence),
-            "retrieval_confidence_threshold": str(
+            "retrieval_confidence_threshold": str(
+                self.retrieval_confidence_threshold
+            ),
         }
 
 
@@ -120,12 +121,14 @@ class GoalDetail(BaseModel):
     keywords: List = None
     knowledge_base: KnowledgeBaseGoalDetail = KnowledgeBaseGoalDetail()
 
+
 class AttackData(BaseModel):
     attack_category: AttackCategory
     attack_type: str
     attack_name: str
     attack_instructions: str
 
+
 class AttackData(BaseModel):
     agent: str
     agents_path: str
@@ -143,8 +146,8 @@ class EvaluationData(BaseModel):
     goal_details: List[GoalDetail]
     starting_sentence: str = None
 
+
 class ToolDefinition(BaseModel):
     tool_description: Optional[str]
     tool_name: str
     tool_params: List[str]
-
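Two things in the type.py hunks are worth noting. First, the context lines show two consecutive AttackData(BaseModel) definitions; in Python the second binding shadows the first at import time, so this looks like a pre-existing naming collision rather than something introduced in 1.1.2. Second, the only reflowed method, table(), still stringifies every confidence field. A minimal sketch of that pattern (the field declarations sit outside the hunk, so the float types and zero defaults below are assumed for illustration):

    from pydantic import BaseModel


    class ConversationalConfidenceThresholdScore(BaseModel):
        # Assumed field types/defaults; only the table() body comes from the diff.
        response_confidence: float = 0.0
        response_confidence_threshold: float = 0.0
        retrieval_confidence: float = 0.0
        retrieval_confidence_threshold: float = 0.0

        def table(self):
            # Stringify every field so a table renderer can treat them uniformly.
            return {
                "response_confidence": str(self.response_confidence),
                "response_confidence_threshold": str(
                    self.response_confidence_threshold
                ),
                "retrieval_confidence": str(self.retrieval_confidence),
                "retrieval_confidence_threshold": str(
                    self.retrieval_confidence_threshold
                ),
            }


    print(ConversationalConfidenceThresholdScore(response_confidence=0.9).table())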
wxo_agentic_evaluation/utils/open_ai_tool_extractor.py
CHANGED

@@ -1,11 +1,12 @@
 import ast
 import re
 from pathlib import Path
-from typing import
+from typing import Any, List, Mapping, Union
+
 
 class PythonTypeToJsonType:
     OPTIONAL_PARAM_EXTRACT = re.compile(r"[Oo]ptional\[(\w+)\]")
-
+
     @staticmethod
     def python_to_json_type(python_annotation: str):
         if not python_annotation:
@@ -25,30 +26,33 @@ class PythonTypeToJsonType:
             return "object"
         if python_annotation.startswith("optional"):
             # extract the type within Optional[T]
-            inner_type = PythonTypeToJsonType.OPTIONAL_PARAM_EXTRACT.search(
+            inner_type = PythonTypeToJsonType.OPTIONAL_PARAM_EXTRACT.search(
+                python_annotation
+            ).group(1)
             return PythonTypeToJsonType.python_to_json_type(inner_type)
 
         return "string"
 
+
 class ToolExtractionOpenAIFormat:
     @staticmethod
     def get_default_arguments(node):
-        """
+        """Returns the default arguments (if any)
 
         The default arguments are stored in args.default array.
         Since, in Python, the default arguments only come after positional arguments,
         we can index the argument array starting from the last `n` arguments, where n is
         the length of the default arguments.
 
-        ex.
+        ex.
         def add(a, b=5):
             pass
-
+
         Then we have,
         args = [a, b]
         defaults = [Constant(value=5)]
 
-        args[-len(defaults):] = [b]
+        args[-len(defaults):] = [b]
 
         (
         "FunctionDef(
@@ -70,12 +74,12 @@ class ToolExtractionOpenAIFormat:
         if num_defaults > 0:
             for arg in node.args.args[-num_defaults:]:
                 default_arguments.add(arg)
-
+
         return default_arguments
 
     @staticmethod
     def from_file(tools_path: Union[str, Path]) -> Mapping[str, Any]:
-        """
+        """Uses `extract_tool_signatures` function, but converts the response
         to open-ai format
 
         ```
@@ -100,7 +104,11 @@ class ToolExtractionOpenAIFormat:
         parsed_code = ast.parse(code)
         for node in parsed_code.body:
             if isinstance(node, ast.FunctionDef):
-                parameters = {
+                parameters = {
+                    "type": "object",
+                    "properties": {},
+                    "required": [],
+                }
                 function_name = node.name
                 for arg in node.args.args:
                     type_annotation = None
@@ -109,16 +117,25 @@ class ToolExtractionOpenAIFormat:
                    if arg.annotation:
                         type_annotation = ast.unparse(arg.annotation)
 
-                    parameter_type =
+                    parameter_type = (
+                        PythonTypeToJsonType.python_to_json_type(
+                            type_annotation
+                        )
+                    )
                     parameters["properties"][arg.arg] = {
                         "type": parameter_type,
-                        "description": "",
+                        "description": "",  # todo
                     }
 
-                    if
+                    if (
+                        type_annotation
+                        and "Optional" not in type_annotation
+                    ):
                         parameters["required"].append(arg.arg)
 
-                default_arguments =
+                default_arguments = (
+                    ToolExtractionOpenAIFormat.get_default_arguments(node)
+                )
                 for arg_name in parameters["required"]:
                     if arg_name in default_arguments:
                         parameters.remove(arg_name)
@@ -128,8 +145,10 @@ class ToolExtractionOpenAIFormat:
                 "function": {
                     "name": function_name,
                     "parameters": parameters,
-                    "description": ast.get_docstring(
-
+                    "description": ast.get_docstring(
+                        node
+                    ),  # fix (does not do :params)
+                },
             }
             tool_data.append(open_ai_format_fn)
 
@@ -149,9 +168,11 @@ class ToolExtractionOpenAIFormat:
         elif tools_path.is_dir():
             files_to_parse.extend(tools_path.glob("**/*.py"))
         else:
-            raise ValueError(
-
+            raise ValueError(
+                f"Tools path {tools_path} is neither a file nor directory"
+            )
+
         for file_path in files_to_parse:
             all_tools.extend(ToolExtractionOpenAIFormat.from_file(file_path))
-
-        return all_tools
+
+        return all_tools
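The conversion this module performs can be sketched end to end: parse a source file with ast, map each parameter annotation to a JSON-schema type, and emit an OpenAI-style function entry, treating any argument without a default as required. The sketch below is a simplified standalone version, not the package's full logic; the sample function and the trimmed type map are illustrative. It also shows the tail-indexing trick described in the get_default_arguments docstring, where defaults occupy the last len(node.args.defaults) slots of node.args.args:

    import ast

    # Trimmed stand-in for PythonTypeToJsonType.python_to_json_type.
    TYPE_MAP = {"str": "string", "int": "integer", "float": "number", "bool": "boolean"}

    source = '''
    def get_weather(city: str, units: str = "metric"):
        """Look up the current weather for a city."""
    '''

    tools = []
    for node in ast.parse(source).body:
        if isinstance(node, ast.FunctionDef):
            # Defaults occupy the tail of node.args.args, so everything
            # before that tail is a required argument.
            num_defaults = len(node.args.defaults)
            required = node.args.args[: len(node.args.args) - num_defaults]
            properties = {
                arg.arg: {"type": TYPE_MAP.get(ast.unparse(arg.annotation), "string")}
                for arg in node.args.args
                if arg.annotation
            }
            tools.append(
                {
                    "type": "function",
                    "function": {
                        "name": node.name,
                        "description": ast.get_docstring(node),
                        "parameters": {
                            "type": "object",
                            "properties": properties,
                            "required": [a.arg for a in required],
                        },
                    },
                }
            )

    print(tools[0]["function"]["parameters"]["required"])  # ['city']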
wxo_agentic_evaluation/utils/rich_utils.py
CHANGED

@@ -1,6 +1,7 @@
-from
-
+from typing import Any, List, Optional
+
 import rich
+from rich.text import Text
 
 
 def pretty_print(content: Any, style: Optional[str] = None):
@@ -33,13 +34,17 @@ def warn(
 
 
 def is_ok(
-    message: str,
+    message: str,
+    style: Optional[str] = "bold green",
+    prompt: Optional[str] = "OK ✅ :",
 ) -> Text:
     """Utility function for formatting an OK message."""
     return Text(f"{prompt}{message}\n\n", style=style)
 
 
-def print_done(
+def print_done(
+    prompt: Optional[str] = "Done ✅", style: Optional[str] = "bold cyan"
+):
     """
     Prints a prompt indicating completion of a process/routine.
     :param prompt: default is `"Done ✅"`
@@ -63,7 +68,9 @@ def print_success(
 
 
 def print_failure(
-    message: str,
+    message: str,
+    style: Optional[str] = "bold red",
+    prompt: Optional[str] = "❌ FAILED",
 ):
     """
     Prints a failure message.
@@ -108,7 +115,9 @@ class IncorrectParameterUtils:
     ]
 
     @staticmethod
-    def format_bad_description_message(
+    def format_bad_description_message(
+        tool_name: str, tool_desc: str
+    ) -> List[Text]:
 
         return [
             warn(
@@ -139,12 +148,15 @@ class TestingUtils:
         For example, this can be read as: `"{\n⚙️ Testing} {20} {good tool descriptions}"`.
         """
         pretty_print(
-            content=f"{prompt} {test_case_count} {test_description}",
+            content=f"{prompt} {test_case_count} {test_description}",
+            style=style,
         )
 
     @staticmethod
     def print_error_details(
-        expected: List[str],
+        expected: List[str],
+        detected: List[str],
+        style: Optional[str] = "bold red",
     ):
         """
         Print detailed error information.
@@ -169,6 +181,8 @@ class TestingUtils:
         :param style: The style for the text (default is bold red).
         """
         if failed_cases:
-            pretty_print(
+            pretty_print(
+                content=f"{prompt} ({len(failed_cases)}):", style=style
+            )
             for case in failed_cases:
                 pretty_print(content=f" - {case}", style=style)
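The rich_utils.py changes only spell the keyword defaults out across lines, but the call pattern those helpers support is easy to demonstrate. A hypothetical caller follows; the messages are invented, and the pretty_print body is not in the hunk, so the one-line version here is assumed:

    from typing import Any, Optional

    import rich
    from rich.text import Text


    def pretty_print(content: Any, style: Optional[str] = None):
        # Assumed body: funnel every helper through one rich call.
        rich.print(Text(str(content), style=style) if style else content)


    def is_ok(
        message: str,
        style: Optional[str] = "bold green",
        prompt: Optional[str] = "OK ✅ :",
    ) -> Text:
        """Utility function for formatting an OK message."""
        return Text(f"{prompt}{message}\n\n", style=style)


    rich.print(is_ok("all tool descriptions passed"))
    pretty_print("3 cases failed", style="bold red")

Because is_ok returns a Text rather than printing, callers can compose it into larger renderables; the print_* helpers are the fire-and-forget variants.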