ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA +35 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/RECORD +65 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +9 -3
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation.py +42 -0
  14. wxo_agentic_evaluation/evaluation_package.py +117 -70
  15. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  16. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  17. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  18. wxo_agentic_evaluation/external_agent/types.py +12 -5
  19. wxo_agentic_evaluation/inference_backend.py +183 -79
  20. wxo_agentic_evaluation/llm_matching.py +4 -3
  21. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  22. wxo_agentic_evaluation/llm_user.py +7 -3
  23. wxo_agentic_evaluation/main.py +175 -67
  24. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  25. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  26. wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
  27. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +176 -0
  28. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
  29. wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
  30. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  31. wxo_agentic_evaluation/quick_eval.py +49 -23
  32. wxo_agentic_evaluation/record_chat.py +70 -33
  33. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  34. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  35. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  38. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  39. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  40. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  41. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  42. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  43. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  44. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  45. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  46. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  47. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  48. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  49. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  50. wxo_agentic_evaluation/resource_map.py +2 -1
  51. wxo_agentic_evaluation/service_instance.py +103 -21
  52. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  53. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +216 -34
  54. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  55. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  56. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  57. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  58. wxo_agentic_evaluation/tool_planner.py +128 -44
  59. wxo_agentic_evaluation/type.py +12 -9
  60. wxo_agentic_evaluation/utils/__init__.py +1 -0
  61. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  62. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  63. wxo_agentic_evaluation/utils/utils.py +83 -52
  64. ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
  65. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/WHEEL +0 -0
  66. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,176 @@
1
+ from wxo_agentic_evaluation.otel_support.tasks_test import TASKS
2
+ from wxo_agentic_evaluation.type import EvaluationData, Message, EventTypes, ContentType
3
+ from typing import Any, Dict, List, Union
4
+ from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
5
+ import json
6
+ import glob
7
+
8
+
9
# Trace files to evaluate. glob returns matches in arbitrary, filesystem-dependent
# order, so sort them to keep the per-file numbering reproducible between runs.
file_paths = sorted(glob.glob("airline_traces/*.json"))
10
+
11
+
12
def convert_span_to_messages(span: Dict[str, Any]) -> "List[Message]":
    """Rebuild chat messages from the flattened attributes of a single span.

    The span's ``attributes`` list (``{"key": ..., "value": {...}}`` entries) is
    flattened into a ``{key: str}`` mapping. Keys of the form
    ``llm.input_messages.<i>.message.*`` and ``llm.output_messages.<i>.message.*``
    are then grouped by their numeric index ``<i>`` and turned into ``Message``
    objects.
    NOTE(review): the key layout looks like the OpenInference LLM span
    conventions -- confirm against the trace producer.

    Args:
        span: One span dictionary. A missing or ``None`` ``attributes`` entry is
            treated as "no attributes".

    Returns:
        Input messages in index order followed by output messages in index
        order. An input message that has ``tool_calls`` keys produces one
        ``ContentType.tool_call`` message per tool call (its text content is
        not emitted); every other message produces one ``ContentType.text``
        message. Returns an empty list when the span has no LLM message keys.
    """
    # Flatten the attribute list into a plain string-to-string mapping.
    attrs: Dict[str, str] = {}
    for attr in span.get("attributes") or []:
        k = attr.get("key")
        if not isinstance(k, str):
            # A key-less attribute can never be looked up and would break the
            # ``startswith`` scans below.
            continue
        v_obj = attr.get("value") or {}

        # Prefer the string payload; otherwise take whichever typed payload
        # (first entry of the value object) is present.
        v = v_obj.get("stringValue")
        if v is None and v_obj:
            v = next(iter(v_obj.values()))
        if isinstance(v, (str, int, float, bool)):
            attrs[k] = str(v)
        else:
            attrs[k] = json.dumps(v) if v is not None else ""

    def collect_message_indexes(prefix: str) -> List[int]:
        """Return the sorted numeric indexes that directly follow ``prefix``."""
        idxs = set()
        plen = len(prefix)
        for key in attrs:
            if key.startswith(prefix):
                first = key[plen:].split(".", 1)[0]
                if first.isdigit():
                    idxs.add(int(first))
        return sorted(idxs)

    messages: List[Message] = []

    in_prefix = "llm.input_messages."
    for i in collect_message_indexes(in_prefix):
        role = attrs.get(f"{in_prefix}{i}.message.role", "")
        tc_prefix = f"{in_prefix}{i}.message.tool_calls."

        if any(key.startswith(tc_prefix) for key in attrs):
            for ci in collect_message_indexes(tc_prefix):
                call_prefix = f"{tc_prefix}{ci}.tool_call."
                name = attrs.get(f"{call_prefix}function.name", "")
                args_raw = attrs.get(f"{call_prefix}function.arguments", "{}")
                tool_call_id = attrs.get(f"{call_prefix}id", "")

                try:
                    args = json.loads(args_raw)
                except ValueError:
                    # Arguments that are not valid JSON are kept verbatim.
                    args = {"raw": args_raw}

                messages.append(
                    Message(
                        role="assistant",
                        content=json.dumps(
                            {"args": args, "name": name, "tool_call_id": tool_call_id}
                        ),
                        type=ContentType.tool_call,
                    )
                )
        else:
            content = attrs.get(f"{in_prefix}{i}.message.content", "")
            messages.append(
                Message(
                    # Roles outside the supported set are mapped to "user".
                    role=role if role in {"user", "assistant", "tool"} else "user",
                    content=content,
                    type=ContentType.text,
                )
            )

    out_prefix = "llm.output_messages."
    for i in collect_message_indexes(out_prefix):
        role = attrs.get(f"{out_prefix}{i}.message.role", "assistant")
        content = attrs.get(f"{out_prefix}{i}.message.content", "")
        messages.append(
            Message(
                # Roles outside the supported set are mapped to "assistant".
                role=role if role in {"user", "assistant", "tool"} else "assistant",
                content=content,
                type=ContentType.text,
            )
        )

    return messages
97
+
98
# Evaluate every trace file against its reference task from TASKS and report
# the fraction of successful runs.
total = 0
success = 0
for file_index, file in enumerate(file_paths):
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    spans = data["resourceSpans"][0]["scopeSpans"][0]["spans"]

    # Use the span that yields the most messages as the conversation
    # transcript (the first such span wins on ties).
    messages = max(
        (convert_span_to_messages(span) for span in spans),
        key=len,
        default=[],
    )

    # The task index is read from the attributes of the last span.
    task_id = None
    for kv in spans[-1]["attributes"]:
        if kv["key"] == "task.index":
            task_id = int(kv["value"]["stringValue"])
    if task_id is None:
        raise ValueError(
            f"No 'task.index' attribute found in the last span of {file}"
        )

    task = TASKS[task_id].model_dump()

    # One goal per expected action; the position suffix keeps names unique
    # when the same tool is called more than once.
    goal_names = []
    goal_details = []
    for action_index, action in enumerate(task["actions"]):
        goal_name = f"{action['name']}_{action_index}"
        goal_names.append(goal_name)
        goal_details.append(
            {
                "type": "tool_call",
                "name": goal_name,
                "tool_name": action["name"],
                "args": dict(action["kwargs"]),
            }
        )

    if not goal_names:
        # Tasks without expected actions are not evaluated.
        continue

    # Chain the goals: each one points at its successor, the last at nothing.
    goals = {
        current: [following]
        for current, following in zip(goal_names, goal_names[1:])
    }
    goals[goal_names[-1]] = []

    gt_data = EvaluationData.model_validate(
        {
            "agent": "airline_agent",
            "goals": goals,
            "goal_details": goal_details,
            "story": task["instruction"],
            "starting_sentence": "",
        }
    )

    # Name the test case after the file's position in file_paths; this index
    # must not be reused by the inner loops above.
    tc_name = f"airline_test_{file_index}"
    evaluation_package = EvaluationPackage(
        test_case_name=tc_name,
        messages=messages,
        ground_truth=gt_data,
        conversational_search_data=None,
        resource_map=None,
    )

    (
        keyword_semantic_matches,
        knowledge_base_metrics,
        messages_with_reason,
        metrics,
    ) = evaluation_package.generate_summary()

    success += metrics.is_success
    total += 1

# NOTE(review): the summary is assumed to be printed once after the loop; the
# original indentation was lost in the diff rendering -- confirm.
# Guard against division by zero when no file was evaluated.
print(success / total if total else 0.0)
print(total)
@@ -0,0 +1,21 @@
1
+ from typing import Any, Dict, List, Union, Optional
2
+ from wxo_agentic_evaluation.type import Message, ContentType, EventTypes
3
+ import json
4
+
5
+ # with open("src/wxo_agentic_evaluation/otel_support/collie_example.json", "r") as f:
6
+ # data = json.load(f)
7
+ #
8
+ # otel_traces = data["calls"][-1]["messages"]
9
+
10
+
11
def convert_otel_to_message(otel_traces: List[Dict[str, Any]]) -> "List[Message]":
    """Convert raw message rows into ``Message`` objects.

    Args:
        otel_traces: Rows that each hold a ``"content"`` entry and, optionally,
            a ``"role"`` entry.

    Returns:
        One ``ContentType.text`` message per row, in input order, each tagged
        with ``EventTypes.message_created``. Rows without a ``"role"`` entry
        get the role ``"assistant"``.

    Raises:
        KeyError: If a row has no ``"content"`` entry.
    """
    return [
        Message(
            role=row.get("role", "assistant"),
            content=row["content"],
            type=ContentType.text,
            event=EventTypes.message_created,
        )
        for row in otel_traces
    ]