ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
```diff
--- 1.0.3/wxo_agentic_evaluation/external_agent/__init__.py
+++ 1.1.8b0/wxo_agentic_evaluation/external_agent/__init__.py
@@ -1,21 +1,28 @@
 import importlib.resources
 import json
+
 import rich
 
-from wxo_agentic_evaluation.prompt.template_render import StoryGenerationTemplateRenderer
-from wxo_agentic_evaluation.service_provider import get_provider, ProviderConfig
 from wxo_agentic_evaluation import prompt
+from wxo_agentic_evaluation.prompt.template_render import (
+    StoryGenerationTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
 
 console = rich.console.Console()
 
+
 def starting_sentence_generation_prompt():
-    with importlib.resources.path(
+    with importlib.resources.path(
+        prompt, "starting_sentence_generation_prompt.jinja2"
+    ) as fp:
         # reuse the StoryGenerationTemplateRenderer class, even though we are generating a "starting_sentence" instead of a "story"
         # the starting sentence generation prompts uses the same input variable
         render = StoryGenerationTemplateRenderer(str(fp))
-
+
     return render
 
+
 def generate_starting_sentence(annotated_data: dict):
     renderer = starting_sentence_generation_prompt()
     llm_decode_parameter = {
@@ -23,7 +30,9 @@ def generate_starting_sentence(annotated_data: dict):
         "decoding_method": "greedy",
         "max_new_tokens": 4096,
     }
-    wai_client = get_provider(
+    wai_client = get_provider(
+        model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter
+    )
     prompt = renderer.render(input_data=json.dumps(annotated_data, indent=4))
     res = wai_client.query(prompt)
     res = res.strip()
@@ -33,5 +42,7 @@ def generate_starting_sentence(annotated_data: dict):
         res = json.loads(res)
         return res["starting_sentence"]
     except Exception:
-        console.log(
-
+        console.log(
+            f"The generated `starting_sentence` had incorrect format: '{res}'"
+        )
+        return res
```
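In 1.1.8b0, `generate_starting_sentence` passes `model_id` and `params` straight to `get_provider` (the `ProviderConfig` wrapper import is gone) and, when the model's JSON output does not parse, logs the problem and returns the raw response instead of raising. A caller-side sketch of that fallback; the `annotated_data` shape here is illustrative, not taken from the package:

```python
from wxo_agentic_evaluation.external_agent import generate_starting_sentence

# Hypothetical annotated dataset in roughly the shape the prompt template
# expects; the real structure is produced by the framework's data annotator.
annotated_data = {
    "agent": "hr_agent",
    "goals": {"summarize": []},
    "story": "An employee asks about their remaining time off.",
}

sentence = generate_starting_sentence(annotated_data)
if not isinstance(sentence, str) or sentence.lstrip().startswith("{"):
    # On malformed model output the function returns the raw response,
    # so callers should be prepared to retry or skip the sample.
    print(f"starting_sentence generation fell back to raw output: {sentence!r}")
```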
```diff
--- 1.0.3/wxo_agentic_evaluation/external_agent/external_validate.py
+++ 1.1.8b0/wxo_agentic_evaluation/external_agent/external_validate.py
@@ -1,15 +1,21 @@
+import json
 from typing import Generator
+
 import requests
-import json
 import rich
 
-from wxo_agentic_evaluation.external_agent.types import
-
+from wxo_agentic_evaluation.external_agent.types import (
+    SchemaValidationResults,
+    UniversalData,
+)
 
 MESSAGES = [
     {"role": "user", "content": "what's the holiday is June 13th in us?"},
-    {
-
+    {
+        "role": "assistant",
+        "content": 'tool_name: calendar_lookup, args {"location": "USA", "data": "06-13-2025"}}',
+    },
+    {"role": "assistant", "content": "it's National Sewing Machine Day"},
 ]
 
 
@@ -18,7 +24,7 @@ class ExternalAgentValidation:
         self.credential = credential
         self.auth_scheme = auth_scheme
         self.service_url = service_url
-
+
     @property
     def header(self):
         header = {"Content-Type": "application/json"}
@@ -32,23 +38,23 @@ class ExternalAgentValidation:
         return header
 
     def _parse_streaming_events(self, resp: Generator[bytes, None, None]):
-        data = b
+        data = b""
        for chunk in resp:
             for line in chunk.splitlines(True):
-                if line.startswith(b
-
-                if line.strip() == b'[DONE]':
-                    return
+                if line.startswith(b"event:"):
+                    continue
                 data += line
-                if data.endswith((b
+                if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                     # NOTE: edge case, "data" can be sent in two different chunks
-                    if data.startswith(b
-                        data = data.replace(b
+                    if data.startswith(b"data:"):
+                        data = data.replace(b"data:", b"")
+                    if data.strip() == b"[DONE]":
+                        return
                     yield data
-                    data = b
+                    data = b""
         if data:
             yield data
-
+
     def _validate_streaming_response(self, resp):
         success = True
         logged_events = []
@@ -61,52 +67,59 @@ class ExternalAgentValidation:
         except Exception as e:
             success = False
             break
-
+
         return success, logged_events
 
     def _validate_schema_compliance(self, messages):
         payload = {"stream": True}
         payload["messages"] = messages
-        resp = requests.post(
+        resp = requests.post(
+            url=self.service_url,
+            headers=self.header,
+            json=payload,
+        )
         success, logged_events = self._validate_streaming_response(resp)
 
         msg = ", ".join([msg["content"] for msg in payload["messages"]])
 
         if success:
-            rich.print(
+            rich.print(
+                f":white_check_mark: External Agent streaming response validation succeeded for '{msg}'."
+            )
         else:
-            rich.print(
+            rich.print(
+                f":heavy_exclamation_mark:Schema validation failed for messages: '{msg}':heavy_exclamation_mark:\n The last logged event was {logged_events[-1]}.\n"
+            )
 
         return success, logged_events
 
-    def call_validation(
+    def call_validation(
+        self, input_str: str, add_context: bool = False
+    ) -> SchemaValidationResults:
         if add_context:
             return self.block_validation(input_str)
 
-        msg = {
-
-            "content": input_str
-        }
-
+        msg = {"role": "user", "content": input_str}
+
         success, logged_events = self._validate_schema_compliance([msg])
-        results = SchemaValidationResults(
+        results = SchemaValidationResults(
+            success=success, logged_events=logged_events, messages=[msg]
+        )
 
         return results.model_dump()
-
+
     def block_validation(self, input_str: str) -> SchemaValidationResults:
-        """
-        """
+        """Tests a block of messages"""
         rich.print(
             f"[gold3]The following prebuilt messages, '{MESSAGES}' is prepended to the input message, '{input_str}'"
         )
 
-        msg = {
-            "role": "user",
-            "content": input_str
-        }
+        msg = {"role": "user", "content": input_str}
 
         messages = MESSAGES + [msg]
         success, logged_events = self._validate_schema_compliance(messages)
-        results = SchemaValidationResults(
+        results = SchemaValidationResults(
+            success=success, logged_events=logged_events, messages=messages
+        )
 
-        return results.model_dump()
+        return results.model_dump()
```
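The rewritten `_parse_streaming_events` buffers bytes until a blank-line event terminator, strips the `data:` prefix, and only then checks for the `[DONE]` sentinel, so the check still works when (as the NOTE in the code warns) a single `data` event arrives split across two transport chunks. A self-contained sketch of the same buffering logic run against a fake stream:

```python
# Standalone sketch of the buffering logic in _parse_streaming_events.
# The fake stream deliberately splits one SSE event across two chunks.
def parse_events(chunks):
    data = b""
    for chunk in chunks:
        for line in chunk.splitlines(True):
            if line.startswith(b"event:"):
                continue
            data += line
            # A blank line terminates an SSE event.
            if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                if data.startswith(b"data:"):
                    data = data.replace(b"data:", b"")
                if data.strip() == b"[DONE]":
                    return
                yield data
                data = b""
    if data:
        yield data

fake_stream = [
    b'data: {"id": "1", "obj',                 # event starts in one chunk...
    b'ect": "thread.message.delta"}\n\n',      # ...and finishes in the next
    b"data: [DONE]\n\n",
]
# Yields the reassembled event, then stops at the [DONE] sentinel.
print(list(parse_events(fake_stream)))
```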
```diff
--- 1.0.3/wxo_agentic_evaluation/external_agent/performance_test.py
+++ 1.1.8b0/wxo_agentic_evaluation/external_agent/performance_test.py
@@ -1,10 +1,15 @@
-from typing import List, Mapping
+from typing import Any, List, Mapping
+
 from rich.console import Console
 
-from wxo_agentic_evaluation.external_agent import generate_starting_sentence
 from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
-from wxo_agentic_evaluation.
-
+from wxo_agentic_evaluation.data_annotator import (
+    KeywordsGenerationLLM,
+    LlamaKeywordsGenerationTemplateRenderer,
+)
+from wxo_agentic_evaluation.external_agent import generate_starting_sentence
+from wxo_agentic_evaluation.service_provider import get_provider
+
 
 class ExternalAgentPerformanceTest:
     def __init__(self, agent_name: str, test_data: List[str]):
@@ -12,55 +17,61 @@ class ExternalAgentPerformanceTest:
         self.goal_template = {
             "agent": agent_name,
             "goals": {"summarize": []},
-            "goal_details": [
-            ],
+            "goal_details": [],
             "story": "<placeholder>",
         }
 
         kw_gen_config = KeywordsGenerationConfig()
 
-        provider_config = ProviderConfig(model_id=kw_gen_config.model_id)
         llm_decode_parameter = {
             "min_new_tokens": 0,
             "decoding_method": "greedy",
             "max_new_tokens": 256,
         }
-        wai_client = get_provider(
-
+        wai_client = get_provider(
+            model_id=kw_gen_config.model_id, params=llm_decode_parameter
+        )
+
         self.kw_gen = KeywordsGenerationLLM(
             provider=wai_client,
             template=LlamaKeywordsGenerationTemplateRenderer(
                 kw_gen_config.prompt_config
-
-
-
+            ),
+        )
+
     def generate_tests(self) -> List[Mapping[str, Any]]:
         console = Console()
         goal_templates = []
 
-        with console.status(
+        with console.status(
+            "[gold3]Creating starting sentence for user story from input file for performance testing"
+        ) as status:
             for sentence, response in self.test_data:
                 goal_temp = self.goal_template.copy()
                 goal_temp["story"] = sentence
 
                 keywords = self.kw_gen.genereate_keywords(response)
                 summarize_step = {
-
-
-
-
-
+                    "name": "summarize",
+                    "type": "text",
+                    "response": response,
+                    "keywords": keywords,
+                }
                 goal_temp["goal_details"] = [summarize_step]
-                goal_temp["starting_sentence"] = generate_starting_sentence(
+                goal_temp["starting_sentence"] = generate_starting_sentence(
+                    goal_temp
+                )
 
                 goal_templates.append(goal_temp)
-
+
             status.stop()
-            console.print(
+            console.print(
+                "[bold green]Done creating starting sentence from provided input data"
+            )
 
         return goal_templates
 
 
 if __name__ == "__main__":
     t = ExternalAgentPerformanceTest("test")
-    t.generate_tests()
+    t.generate_tests()
```
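`generate_tests` stamps one goal template per `(starting sentence, expected response)` pair in `test_data`, filling `keywords` via `KeywordsGenerationLLM` and `starting_sentence` via `generate_starting_sentence`. The output shape, with purely illustrative values, looks roughly like:

```python
# Shape of one generated test case; every value below is a placeholder.
example_goal = {
    "agent": "my_external_agent",
    "goals": {"summarize": []},
    "goal_details": [
        {
            "name": "summarize",
            "type": "text",
            "response": "Your next time off is on 2025-01-05.",
            # produced by KeywordsGenerationLLM from the expected response
            "keywords": ["time off", "2025-01-05"],
        }
    ],
    "story": "An employee asks about their upcoming time off.",
    "starting_sentence": "Hi, can you tell me when my next vacation day is?",
}
```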
```diff
--- 1.0.3/wxo_agentic_evaluation/external_agent/types.py
+++ 1.1.8b0/wxo_agentic_evaluation/external_agent/types.py
@@ -1,5 +1,6 @@
+from typing import Any, List, Literal, Mapping, Optional, Union
+
 from pydantic import BaseModel
-from typing import List, Union, Literal, Mapping, Any
 
 
 class ThinkingStepDetails(BaseModel):
@@ -25,7 +26,9 @@ class ToolResponseStepDetails(BaseModel):
     tool_call_id: str
 
 
-StepDetails = Union[
+StepDetails = Union[
+    ThinkingStepDetails, ToolCallsStepDetails, ToolResponseStepDetails
+]
 
 
 class DeltaMessageChoice(BaseModel):
@@ -43,7 +46,7 @@ class ThreadRunStepDeltaChoice(BaseModel):
 class BaseEventData(BaseModel):
     id: str
     object: str
-    thread_id: str
+    thread_id: Optional[str] = None
     model: str | None = None
     created: int | None = None
 
@@ -59,13 +62,11 @@ class ThreadRunStepDeltaData(BaseEventData):
 
 
 class UniversalData(BaseEventData):
-    object:
-        Literal["thread.run.step.created"], Literal["thread.run.step.completed"]]
-    choices: List[ThreadMessageDeltaChoice]
+    object: Optional[str]
     choices: List[Union[ThreadMessageDeltaChoice, dict]]
 
 
 class SchemaValidationResults(BaseModel):
     success: bool
     logged_events: List[str]
-    messages: List[Mapping[Any, Any]]
+    messages: List[Mapping[Any, Any]]
```
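Relaxing `thread_id` from `str` to `Optional[str] = None` (and `object` on `UniversalData` to `Optional[str]`) lets events that omit those fields validate instead of erroring. A minimal sketch of the difference using stand-in models, not the package's own:

```python
from typing import Optional

from pydantic import BaseModel, ValidationError

class LooseEventData(BaseModel):  # mirrors 1.1.8b0's BaseEventData
    id: str
    object: str
    thread_id: Optional[str] = None

class StrictEventData(BaseModel):  # mirrors 1.0.3's required thread_id
    id: str
    object: str
    thread_id: str

LooseEventData(id="evt-1", object="thread.message.delta")  # validates
try:
    StrictEventData(id="evt-1", object="thread.message.delta")
except ValidationError as err:
    print(err.errors()[0]["loc"])  # ('thread_id',): missing required field
```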
```diff
--- /dev/null
+++ 1.1.8b0/wxo_agentic_evaluation/extractors/extractor_base.py
@@ -0,0 +1,21 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+from wxo_agentic_evaluation.type import Message
+
+
+class Extractor(ABC):
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Unique name for the extractor."""
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def extract(
+        messages: list[Message],
+        **kwargs,
+    ) -> Any:
+        """Extract data from messages."""
+        raise NotImplementedError
```
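The contract: `name` is an abstract property and `extract` an abstract static method, so subclasses implement both. A hypothetical subclass to illustrate; the `Message` attributes touched here (`role`, `content`, `tool_calls`) are assumptions based on how `ExtractLabeledMessages` below consumes them:

```python
from typing import Any

from wxo_agentic_evaluation.extractors.extractor_base import Extractor
from wxo_agentic_evaluation.type import Message

class ExtractFinalAnswer(Extractor):  # illustrative, not part of the package
    @property
    def name(self) -> str:
        return "Final Answer"

    @staticmethod
    def extract(messages: list[Message], **kwargs) -> Any:
        # Walk backwards to the last assistant message that is not a tool call.
        for message in reversed(messages):
            if message.role == "assistant" and not getattr(message, "tool_calls", None):
                return {"final_answer": getattr(message, "content", None)}
        return {"final_answer": None}
```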
```diff
--- /dev/null
+++ 1.1.8b0/wxo_agentic_evaluation/extractors/labeled_messages.py
@@ -0,0 +1,47 @@
+import json
+from typing import Any, List, Mapping
+
+from wxo_agentic_evaluation.extractors.extractor_base import Extractor
+from wxo_agentic_evaluation.type import ContentType, GoalDetail, Message
+
+
+class ExtractLabeledMessages(Extractor):
+    def name(self):
+        return "Labelled Messages"
+
+    def extract(
+        messages: List[Message],
+        ground_truth,
+        **kwargs,
+    ) -> Any:
+
+        tool_dictionary = (
+            {
+                goal_detail.name: goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.tool_call
+            }
+            if ground_truth.goal_details
+            else {}
+        )
+        labeled_messages = {}
+        for idx, message in enumerate(messages):
+            # TODO: investigate this logic - `message` body might not be consistent across providers
+            if not (message.role == "assistant" and message.tool_calls):
+                continue
+            try:
+                msg_tool_call = message.tool_calls[0].function
+            except Exception:
+                # ignore malformed tool_call content
+                continue
+
+            matching_goal_details = [
+                gd
+                for gd in tool_dictionary.values()
+                if gd.tool_name == msg_tool_call.name
+            ]
+
+            if matching_goal_details:
+                labeled_messages[idx] = matching_goal_details
+
+        return {"labeled_messages": labeled_messages}
```
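`extract` maps each assistant message's index to the ground-truth goal details whose `tool_name` matches the message's first tool call. A sketch of the inputs and expected output using `SimpleNamespace` stand-ins for the package's `Message` and `GoalDetail` types (and assuming `ContentType.tool_call` compares equal to the goal detail's `type` value):

```python
from types import SimpleNamespace as NS

# Stand-in ground truth: one expected tool call named "lookup".
ground_truth = NS(
    goal_details=[NS(name="lookup", type="tool_call", tool_name="calendar_lookup")]
)
# Stand-in transcript: the assistant message at index 1 calls calendar_lookup.
messages = [
    NS(role="user", tool_calls=None),
    NS(role="assistant", tool_calls=[NS(function=NS(name="calendar_lookup"))]),
]
# The extractor would return {"labeled_messages": {1: [<the "lookup" goal detail>]}}.
```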
```diff
--- /dev/null
+++ 1.1.8b0/wxo_agentic_evaluation/hr_agent_langgraph.py
@@ -0,0 +1,68 @@
+from langchain.tools import tool
+from langchain.agents import create_agent
+from langchain_openai import ChatOpenAI
+
+import json
+from datetime import datetime
+
+@tool
+def get_assignment_id_hr_usecase(username: str) -> str:
+    """
+    get the assignment id from username
+    :param username: username of the employee
+    """
+    if username=="nwaters":
+        return "15778303"
+    if username=="johndoe":
+        return "15338303"
+    return "not found"
+
+def validate_datetime(date_text):
+    try:
+        format = "%Y-%m-%d"
+        datetime.strptime(date_text, format)
+        return True
+    except ValueError:
+        return False
+
+
+@tool
+def get_timeoff_schedule_hr_usecase(assignment_id: str, start_date: str, end_date: str) -> str:
+    """
+    get timeoff schedule for employee based on assignment id, start date and end date
+    :param assignment_id: assignment_id of the user
+    :param start_date: start date of the timeoff scheduel, in YYYY-MM-DD format
+    :param assignment_id: end date of the timeoff scheduel, in YYYY-MM-DD format
+    """
+
+    if not validate_datetime(start_date):
+        return f"Incorrect date format {start_date}, should be YYYY-MM-DD"
+    if not validate_datetime(end_date):
+        return f"Incorrect date format {end_date}, should be YYYY-MM-DD"
+    if assignment_id=="15338303":
+        return json.dumps(["20250411", "20250311", "20250101"])
+    if assignment_id=="15778303":
+        return json.dumps(["20250105"])
+    return []
+
+
+@tool
+def get_direct_reports_hr_usecase(username: str) -> str:
+    """
+    get direct reports for a given username
+    :param assignment_id: assignment_id of the user
+    """
+
+    return json.dumps(["nwaters", "johndoe"])
+
+
+llm = ChatOpenAI(model="gpt-4o-mini")
+tools = [get_assignment_id_hr_usecase, get_timeoff_schedule_hr_usecase, get_direct_reports_hr_usecase]
+system_prompt="""You are an HR Agent that can answer questions related to timeoff and holiday calendar. Use the tools provided to answer the user's question. If you do not have enough information to answer the question, say so. If you need more information, ask follow up questions."""
+
+agent = create_agent(
+    tools=tools,
+    model=llm,
+    system_prompt=system_prompt
+)
+
```
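The module builds `agent` at import time. A hypothetical invocation, assuming `create_agent` returns a LangGraph-style runnable whose state carries a `messages` list and that `OPENAI_API_KEY` is set in the environment:

```python
from wxo_agentic_evaluation.hr_agent_langgraph import agent

result = agent.invoke(
    {"messages": [{"role": "user", "content": "What timeoff does nwaters have booked?"}]}
)
# The final entry should be the assistant's answer after any tool calls.
print(result["messages"][-1].content)
```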
```diff
--- /dev/null
+++ 1.1.8b0/wxo_agentic_evaluation/langfuse_collection.py
@@ -0,0 +1,60 @@
+import json
+from typing import List, Mapping, Union
+
+import rich
+
+from wxo_agentic_evaluation.type import (
+    LangfuseCollectionModel,
+    LangfuseDatasetModel,
+)
+
+
+class LangfuseCollection:
+    def __init__(self, name, description="", metadata: Mapping[str, str] = {}):
+        self.name = name
+        self.description = description
+        self.metadata = metadata
+
+    def upload(self, paths: Union[str, List[str]]):
+        from langfuse import get_client
+
+        langfuse_client = get_client()
+
+        datasets = []
+        if isinstance(paths, str):
+            paths = [paths]
+
+        for path in paths:
+            with open(path, encoding="utf-8") as f:
+                dataset = json.load(f)
+                dataset = LangfuseDatasetModel(
+                    starting_sentence=dataset.get("starting_sentence", ""),
+                    story=dataset.get("story", ""),
+                    goals=dataset.get("goals"),
+                    goal_details=dataset.get("goal_details"),
+                    agent=dataset.get("agent")
+                )
+                datasets.append(dataset)
+
+        collection = LangfuseCollectionModel(
+            collection_name=self.name,
+            collection_description=self.description,
+            datasets=datasets,
+            metadata=self.metadata,
+        )
+
+        rich.print(
+            f"[g] Uploading {len(collection.datasets)} datasets to '{collection.collection_name}'"
+        )
+        langfuse_client.create_dataset(
+            name=collection.collection_name,
+            description=collection.collection_description,
+            metadata=collection.metadata,
+        )
+
+        for dataset in collection.datasets:
+            langfuse_client.create_dataset_item(
+                dataset_name=collection.collection_name,
+                input=dataset.langfuse_input,
+                expected_output=dataset.langfuse_output,
+            )
```