ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -1,176 +0,0 @@
1
- from wxo_agentic_evaluation.otel_support.tasks_test import TASKS
2
- from wxo_agentic_evaluation.type import EvaluationData, Message, EventTypes, ContentType
3
- from typing import Any, Dict, List, Union
4
- from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
5
- import json
6
- import glob
7
-
8
-
9
- file_paths = glob.glob("airline_traces/*.json")
10
-
11
-
12
- def convert_span_to_messages(span: Dict[str, Any]) -> List[Message]:
13
-
14
- attrs: Dict[str, str] = {}
15
- for attr in span.get("attributes", []):
16
- k = attr.get("key")
17
- v_obj = attr.get("value", {})
18
-
19
- v = v_obj.get("stringValue")
20
- if v is None and v_obj:
21
- v = next(iter(v_obj.values()))
22
- if isinstance(v, (str, int, float, bool)):
23
- attrs[k] = str(v)
24
- else:
25
- attrs[k] = json.dumps(v) if v is not None else ""
26
-
27
- def collect_message_indexes(prefix: str) -> List[int]:
28
- idxs = set()
29
- plen = len(prefix)
30
- for k in attrs:
31
- if k.startswith(prefix):
32
- rest = k[plen:]
33
- first = rest.split(".", 1)[0]
34
- if first.isdigit():
35
- idxs.add(int(first))
36
- return sorted(idxs)
37
-
38
- messages: List[Message] = []
39
-
40
- in_prefix = "llm.input_messages."
41
- for i in collect_message_indexes(in_prefix):
42
- role = attrs.get(f"{in_prefix}{i}.message.role", "")
43
- tc_prefix = f"{in_prefix}{i}.message.tool_calls."
44
- has_tool_calls = any(k.startswith(tc_prefix) for k in attrs.keys())
45
-
46
- if has_tool_calls:
47
- call_indexes = set()
48
- for k in attrs.keys():
49
- if k.startswith(tc_prefix):
50
- rest = k[len(tc_prefix):]
51
- first = rest.split(".", 1)[0]
52
- if first.isdigit():
53
- call_indexes.add(int(first))
54
-
55
- for ci in sorted(call_indexes):
56
- name = attrs.get(f"{tc_prefix}{ci}.tool_call.function.name", "")
57
- args_raw = attrs.get(f"{tc_prefix}{ci}.tool_call.function.arguments", "{}")
58
- tool_call_id = attrs.get(f"{tc_prefix}{ci}.tool_call.id", "")
59
-
60
- try:
61
- args = json.loads(args_raw)
62
- except Exception:
63
- args = {"raw": args_raw}
64
-
65
- messages.append(
66
- Message(
67
- role="assistant",
68
- content=json.dumps({"args": args, "name": name, "tool_call_id": tool_call_id}),
69
- type=ContentType.tool_call,
70
- )
71
- )
72
- else:
73
- content = attrs.get(f"{in_prefix}{i}.message.content", "")
74
- messages.append(
75
- Message(
76
- role=role if role in {"user", "assistant", "tool"} else "user",
77
- content=content,
78
- type=ContentType.text,
79
- )
80
- )
81
- if role == "tool":
82
- pass
83
-
84
- out_prefix = "llm.output_messages."
85
- for i in collect_message_indexes(out_prefix):
86
- role = attrs.get(f"{out_prefix}{i}.message.role", "assistant")
87
- content = attrs.get(f"{out_prefix}{i}.message.content", "")
88
- messages.append(
89
- Message(
90
- role=role if role in {"user", "assistant", "tool"} else "assistant",
91
- content=content,
92
- type=ContentType.text,
93
- )
94
- )
95
-
96
- return messages
97
-
98
- total = 0
99
- success = 0
100
- for i, file in enumerate(file_paths):
101
- # if i != 2:
102
- # continue
103
- with open(file, "r") as f:
104
- data = json.load(f)
105
-
106
- messages = []
107
- for span in data["resourceSpans"][0]["scopeSpans"][0]["spans"]:
108
- temp = convert_span_to_messages(span)
109
- if len(temp) > len(messages):
110
- messages = temp
111
- for msg in messages:
112
- #print(msg.role, msg.content)
113
- pass
114
- task_id = None
115
- for kv in data["resourceSpans"][0]["scopeSpans"][0]["spans"][-1]["attributes"]:
116
- if kv["key"] == "task.index":
117
- task_id = int(kv["value"]["stringValue"])
118
-
119
- task = TASKS[task_id].model_dump()
120
- goal_temp = []
121
-
122
- goals = {}
123
- goal_details = []
124
-
125
- i = 0
126
- for action in task["actions"]:
127
- goal_temp.append(action["name"] + f"_{i}")
128
- args = {}
129
- for k,v in action["kwargs"].items():
130
- args[k] = v
131
-
132
- goal_detail = {"type": "tool_call", "name": action["name"] + f"_{i}", "tool_name": action["name"], "args": args }
133
- goal_details.append(goal_detail)
134
- i += 1
135
-
136
- if not goal_temp:
137
- continue
138
- if len(goal_temp) == 1:
139
- goals[goal_temp[0]] = []
140
- else:
141
- for i in range(len(goal_temp)-1):
142
- goals.update({goal_temp[i]: [goal_temp[i+1]]})
143
- goals[goal_temp[-1]]= []
144
-
145
- gt_data = {
146
- "agent": "airline_agent",
147
- "goals": goals,
148
- "goal_details": goal_details,
149
- "story": task["instruction"],
150
- "starting_sentence": "",
151
- }
152
- gt_data = EvaluationData.model_validate(gt_data)
153
-
154
- tc_name = f"airline_test_{i}"
155
- try:
156
- evaluation_package = EvaluationPackage(
157
- test_case_name=tc_name,
158
- messages=messages,
159
- ground_truth=gt_data,
160
- conversational_search_data=None,
161
- resource_map=None
162
- )
163
-
164
- (
165
- keyword_semantic_matches,
166
- knowledge_base_metrics,
167
- messages_with_reason,
168
- metrics,
169
- ) = evaluation_package.generate_summary()
170
-
171
- success += metrics.is_success
172
- total += 1
173
- except Exception as e:
174
- raise e
175
- print(success/total)
176
- print(total)