ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0

wxo_agentic_evaluation/external_agent/__init__.py
@@ -1,21 +1,28 @@
 import importlib.resources
 import json
+
 import rich
 
-from wxo_agentic_evaluation.prompt.template_render import StoryGenerationTemplateRenderer
-from wxo_agentic_evaluation.service_provider import get_provider, ProviderConfig
 from wxo_agentic_evaluation import prompt
+from wxo_agentic_evaluation.prompt.template_render import (
+    StoryGenerationTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
 
 console = rich.console.Console()
 
+
 def starting_sentence_generation_prompt():
-    with importlib.resources.path(prompt, "starting_sentence_generation_prompt.jinja2") as fp:
+    with importlib.resources.path(
+        prompt, "starting_sentence_generation_prompt.jinja2"
+    ) as fp:
         # reuse the StoryGenerationTemplateRenderer class, even though we are generating a "starting_sentence" instead of a "story"
         # the starting sentence generation prompts uses the same input variable
         render = StoryGenerationTemplateRenderer(str(fp))
-
+
     return render
 
+
 def generate_starting_sentence(annotated_data: dict):
     renderer = starting_sentence_generation_prompt()
     llm_decode_parameter = {
@@ -23,7 +30,9 @@ def generate_starting_sentence(annotated_data: dict):
         "decoding_method": "greedy",
         "max_new_tokens": 4096,
     }
-    wai_client = get_provider(config=ProviderConfig(), params=llm_decode_parameter)
+    wai_client = get_provider(
+        model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter
+    )
     prompt = renderer.render(input_data=json.dumps(annotated_data, indent=4))
     res = wai_client.query(prompt)
     res = res.strip()
@@ -33,5 +42,7 @@ def generate_starting_sentence(annotated_data: dict):
         res = json.loads(res)
         return res["starting_sentence"]
     except Exception:
-        console.log(f"The generated `starting_sentence` had incorrect format: '{res}'")
-        return res
+        console.log(
+            f"The generated `starting_sentence` had incorrect format: '{res}'"
+        )
+        return res
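
Note: `get_provider` now takes the model id directly instead of a `ProviderConfig`. A minimal sketch of the new calling convention, assuming the same `query` interface used by `generate_starting_sentence` above (the prompt string is illustrative):

from wxo_agentic_evaluation.service_provider import get_provider

llm_decode_parameter = {
    "decoding_method": "greedy",
    "max_new_tokens": 4096,
}
# model_id replaces the old ProviderConfig-based configuration
client = get_provider(
    model_id="meta-llama/llama-3-405b-instruct",
    params=llm_decode_parameter,
)
completion = client.query("Summarize this annotated test case in one sentence.")
print(completion.strip())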

wxo_agentic_evaluation/external_agent/external_validate.py
@@ -1,15 +1,21 @@
+import json
 from typing import Generator
+
 import requests
-import json
 import rich
 
-from wxo_agentic_evaluation.external_agent.types import UniversalData, SchemaValidationResults
-
+from wxo_agentic_evaluation.external_agent.types import (
+    SchemaValidationResults,
+    UniversalData,
+)
 
 MESSAGES = [
     {"role": "user", "content": "what's the holiday is June 13th in us?"},
-    {"role": "assistant", "content": "tool_name: calendar_lookup, args {\"location\": \"USA\", \"data\": \"06-13-2025\"}}"},
-    {"role": "assistant", "content":"it's National Sewing Machine Day"}
+    {
+        "role": "assistant",
+        "content": 'tool_name: calendar_lookup, args {"location": "USA", "data": "06-13-2025"}}',
+    },
+    {"role": "assistant", "content": "it's National Sewing Machine Day"},
 ]
@@ -18,7 +24,7 @@ class ExternalAgentValidation:
         self.credential = credential
         self.auth_scheme = auth_scheme
         self.service_url = service_url
-
+
     @property
     def header(self):
         header = {"Content-Type": "application/json"}
@@ -32,23 +38,23 @@ class ExternalAgentValidation:
         return header
 
     def _parse_streaming_events(self, resp: Generator[bytes, None, None]):
-        data = b''
+        data = b""
         for chunk in resp:
             for line in chunk.splitlines(True):
-                if line.startswith(b'data:'):
-                    line = line.replace(b'data:', b'')
-                    if line.strip() == b'[DONE]':
-                        return
+                if line.startswith(b"event:"):
+                    continue
                 data += line
-                if data.endswith((b'\r\r', b'\n\n', b'\r\n\r\n')):
+                if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                     # NOTE: edge case, "data" can be sent in two different chunks
-                    if data.startswith(b'data:'):
-                        data = data.replace(b'data:', b'')
+                    if data.startswith(b"data:"):
+                        data = data.replace(b"data:", b"")
+                    if data.strip() == b"[DONE]":
+                        return
                     yield data
-                    data = b''
+                    data = b""
         if data:
             yield data
-
+
@@ -61,52 +67,59 @@ class ExternalAgentValidation:
             except Exception as e:
                 success = False
                 break
-
+
         return success, logged_events
 
     def _validate_schema_compliance(self, messages):
         payload = {"stream": True}
         payload["messages"] = messages
-        resp = requests.post(url=self.service_url, headers=self.header, json=payload)
+        resp = requests.post(
+            url=self.service_url,
+            headers=self.header,
+            json=payload,
+        )
         success, logged_events = self._validate_streaming_response(resp)
 
         msg = ", ".join([msg["content"] for msg in payload["messages"]])
 
         if success:
-            rich.print(f":white_check_mark: External Agent streaming response validation succeeded for '{msg}'.")
+            rich.print(
+                f":white_check_mark: External Agent streaming response validation succeeded for '{msg}'."
+            )
         else:
-            rich.print(f":heavy_exclamation_mark:Schema validation failed for messages: '{msg}':heavy_exclamation_mark:\n The last logged event was {logged_events[-1]}.\n")
+            rich.print(
+                f":heavy_exclamation_mark:Schema validation failed for messages: '{msg}':heavy_exclamation_mark:\n The last logged event was {logged_events[-1]}.\n"
+            )
 
         return success, logged_events
 
-    def call_validation(self, input_str: str, add_context: bool = False) -> SchemaValidationResults:
+    def call_validation(
+        self, input_str: str, add_context: bool = False
+    ) -> SchemaValidationResults:
         if add_context:
             return self.block_validation(input_str)
 
-        msg = {
-            "role": "user",
-            "content": input_str
-        }
-
+        msg = {"role": "user", "content": input_str}
+
         success, logged_events = self._validate_schema_compliance([msg])
-        results = SchemaValidationResults(success=success, logged_events=logged_events, messages=[msg])
+        results = SchemaValidationResults(
+            success=success, logged_events=logged_events, messages=[msg]
+        )
 
         return results.model_dump()
-
+
     def block_validation(self, input_str: str) -> SchemaValidationResults:
-        """ Tests a block of messages
-        """
+        """Tests a block of messages"""
         rich.print(
             f"[gold3]The following prebuilt messages, '{MESSAGES}' is prepended to the input message, '{input_str}'"
         )
 
-        msg = {
-            "role": "user",
-            "content": input_str
-        }
+        msg = {"role": "user", "content": input_str}
 
         messages = MESSAGES + [msg]
         success, logged_events = self._validate_schema_compliance(messages)
-        results = SchemaValidationResults(success=success, logged_events=logged_events, messages=messages)
+        results = SchemaValidationResults(
+            success=success, logged_events=logged_events, messages=messages
+        )
 
-        return results.model_dump()
+        return results.model_dump()
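
Note: the rewritten `_parse_streaming_events` skips `event:` lines outright and only tests for the `[DONE]` sentinel once a complete SSE payload has been assembled, so a `data:` field split across two chunks is handled. A self-contained sketch of the same framing logic, with illustrative sample bytes:

def parse_sse(chunks):
    # mirrors the framing in _parse_streaming_events above
    data = b""
    for chunk in chunks:
        for line in chunk.splitlines(True):
            if line.startswith(b"event:"):  # event names are ignored
                continue
            data += line
            if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                if data.startswith(b"data:"):
                    data = data.replace(b"data:", b"")
                if data.strip() == b"[DONE]":  # sentinel checked on the full payload
                    return
                yield data
                data = b""
    if data:
        yield data

# one payload split across two chunks, then the [DONE] sentinel
chunks = [b'event: message\ndata: {"a"', b': 1}\n\n', b"data: [DONE]\n\n"]
print(list(parse_sse(chunks)))  # [b' {"a": 1}\n\n']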

wxo_agentic_evaluation/external_agent/performance_test.py
@@ -1,10 +1,15 @@
-from typing import List, Mapping, Any
+from typing import Any, List, Mapping
+
 from rich.console import Console
 
-from wxo_agentic_evaluation.external_agent import generate_starting_sentence
 from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
-from wxo_agentic_evaluation.service_provider import get_provider, ProviderConfig
-from wxo_agentic_evaluation.data_annotator import KeywordsGenerationLLM, LlamaKeywordsGenerationTemplateRenderer
+from wxo_agentic_evaluation.data_annotator import (
+    KeywordsGenerationLLM,
+    LlamaKeywordsGenerationTemplateRenderer,
+)
+from wxo_agentic_evaluation.external_agent import generate_starting_sentence
+from wxo_agentic_evaluation.service_provider import get_provider
+
 
 class ExternalAgentPerformanceTest:
     def __init__(self, agent_name: str, test_data: List[str]):
@@ -12,55 +17,61 @@ class ExternalAgentPerformanceTest:
         self.goal_template = {
             "agent": agent_name,
             "goals": {"summarize": []},
-            "goal_details": [
-            ],
+            "goal_details": [],
             "story": "<placeholder>",
         }
 
         kw_gen_config = KeywordsGenerationConfig()
 
-        provider_config = ProviderConfig(model_id=kw_gen_config.model_id)
         llm_decode_parameter = {
            "min_new_tokens": 0,
            "decoding_method": "greedy",
            "max_new_tokens": 256,
        }
-        wai_client = get_provider(config=provider_config, params=llm_decode_parameter)
-
+        wai_client = get_provider(
+            model_id=kw_gen_config.model_id, params=llm_decode_parameter
+        )
+
         self.kw_gen = KeywordsGenerationLLM(
             provider=wai_client,
             template=LlamaKeywordsGenerationTemplateRenderer(
                 kw_gen_config.prompt_config
-            ),
-        )
-
+            ),
+        )
+
     def generate_tests(self) -> List[Mapping[str, Any]]:
         console = Console()
         goal_templates = []
 
-        with console.status("[gold3]Creating starting sentence for user story from input file for performance testing") as status:
+        with console.status(
+            "[gold3]Creating starting sentence for user story from input file for performance testing"
+        ) as status:
             for sentence, response in self.test_data:
                 goal_temp = self.goal_template.copy()
                 goal_temp["story"] = sentence
 
                 keywords = self.kw_gen.genereate_keywords(response)
                 summarize_step = {
-                    "name": "summarize",
-                    "type": "text",
-                    "response": response,
-                    "keywords": keywords
-                }
+                    "name": "summarize",
+                    "type": "text",
+                    "response": response,
+                    "keywords": keywords,
+                }
                 goal_temp["goal_details"] = [summarize_step]
-                goal_temp["starting_sentence"] = generate_starting_sentence(goal_temp)
+                goal_temp["starting_sentence"] = generate_starting_sentence(
+                    goal_temp
+                )
 
                 goal_templates.append(goal_temp)
-
+
         status.stop()
-        console.print("[bold green]Done creating starting sentence from provided input data")
+        console.print(
+            "[bold green]Done creating starting sentence from provided input data"
+        )
 
         return goal_templates
 
 
 if __name__ == "__main__":
     t = ExternalAgentPerformanceTest("test")
-    t.generate_tests()
+    t.generate_tests()
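
Note: despite the `List[str]` annotation, `generate_tests` iterates `test_data` as (story, expected response) pairs. A hypothetical usage sketch, assuming provider credentials are configured in the environment (the agent name and test strings below are illustrative):

from wxo_agentic_evaluation.external_agent.performance_test import (
    ExternalAgentPerformanceTest,
)

test_data = [
    (
        "A manager asks which direct reports have upcoming timeoff.",
        "nwaters and johndoe have timeoff scheduled this quarter.",
    ),
]
tester = ExternalAgentPerformanceTest("hr_external_agent", test_data)
goal_templates = tester.generate_tests()  # one goal template per pair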

wxo_agentic_evaluation/external_agent/types.py
@@ -1,5 +1,6 @@
+from typing import Any, List, Literal, Mapping, Optional, Union
+
 from pydantic import BaseModel
-from typing import List, Union, Literal, Mapping, Any
 
 
 class ThinkingStepDetails(BaseModel):
@@ -25,7 +26,9 @@ class ToolResponseStepDetails(BaseModel):
     tool_call_id: str
 
 
-StepDetails = Union[ThinkingStepDetails, ToolCallsStepDetails, ToolResponseStepDetails]
+StepDetails = Union[
+    ThinkingStepDetails, ToolCallsStepDetails, ToolResponseStepDetails
+]
 
 
 class DeltaMessageChoice(BaseModel):
@@ -43,7 +46,7 @@ class ThreadRunStepDeltaChoice(BaseModel):
 class BaseEventData(BaseModel):
     id: str
     object: str
-    thread_id: str
+    thread_id: Optional[str] = None
     model: str | None = None
     created: int | None = None
 
@@ -59,13 +62,11 @@ class ThreadRunStepDeltaData(BaseEventData):
 
 
 class UniversalData(BaseEventData):
-    object: Union[Literal["thread.message.delta"], Literal["thread.run.step.delta"],
-                  Literal["thread.run.step.created"], Literal["thread.run.step.completed"]]
-    choices: List[ThreadMessageDeltaChoice]
+    object: Optional[str]
     choices: List[Union[ThreadMessageDeltaChoice, dict]]
 
 
 class SchemaValidationResults(BaseModel):
     success: bool
     logged_events: List[str]
-    messages: List[Mapping[Any, Any]]
+    messages: List[Mapping[Any, Any]]
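
Note: the event schema is deliberately loosened here: `thread_id` becomes optional, `object` accepts any string rather than a fixed set of literals, and `choices` admits plain dicts. For illustration, an event that 1.0.3 would have rejected now validates (the field values are made up):

from wxo_agentic_evaluation.external_agent.types import UniversalData

event = UniversalData(
    id="evt-1",
    object="thread.run.step.delta",  # any string is accepted now
    choices=[{"delta": {"content": "partial token"}}],  # raw dict is allowed
)
# no thread_id supplied: it defaults to None instead of failing validation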

wxo_agentic_evaluation/extractors/__init__.py (new file)
@@ -0,0 +1,3 @@
+from wxo_agentic_evaluation.extractors.labeled_messages import (
+    ExtractLabeledMessages,
+)

wxo_agentic_evaluation/extractors/extractor_base.py (new file)
@@ -0,0 +1,21 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+from wxo_agentic_evaluation.type import Message
+
+
+class Extractor(ABC):
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Unique name for the extractor."""
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def extract(
+        messages: list[Message],
+        **kwargs,
+    ) -> Any:
+        """Extract data from messages."""
+        raise NotImplementedError

wxo_agentic_evaluation/extractors/labeled_messages.py (new file)
@@ -0,0 +1,47 @@
+import json
+from typing import Any, List, Mapping
+
+from wxo_agentic_evaluation.extractors.extractor_base import Extractor
+from wxo_agentic_evaluation.type import ContentType, GoalDetail, Message
+
+
+class ExtractLabeledMessages(Extractor):
+    def name(self):
+        return "Labelled Messages"
+
+    def extract(
+        messages: List[Message],
+        ground_truth,
+        **kwargs,
+    ) -> Any:
+
+        tool_dictionary = (
+            {
+                goal_detail.name: goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.tool_call
+            }
+            if ground_truth.goal_details
+            else {}
+        )
+        labeled_messages = {}
+        for idx, message in enumerate(messages):
+            # TODO: investigate this logic - `message` body might not be consistent across providers
+            if not (message.role == "assistant" and message.tool_calls):
+                continue
+            try:
+                msg_tool_call = message.tool_calls[0].function
+            except Exception:
+                # ignore malformed tool_call content
+                continue
+
+            matching_goal_details = [
+                gd
+                for gd in tool_dictionary.values()
+                if gd.tool_name == msg_tool_call.name
+            ]
+
+            if matching_goal_details:
+                labeled_messages[idx] = matching_goal_details
+
+        return {"labeled_messages": labeled_messages}
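
Note: `extract` is defined without `self`, so it is invoked on the class, matching the `@staticmethod` contract declared in the base. A hedged, self-contained call using `SimpleNamespace` stand-ins for the package's `Message` and `GoalDetail` shapes (the stand-ins are assumptions, not the real types):

from types import SimpleNamespace as NS

from wxo_agentic_evaluation.extractors import ExtractLabeledMessages
from wxo_agentic_evaluation.type import ContentType  # real enum from the package

ground_truth = NS(
    goal_details=[
        NS(name="lookup", type=ContentType.tool_call, tool_name="calendar_lookup"),
    ]
)
messages = [
    NS(role="user", tool_calls=None),
    NS(role="assistant", tool_calls=[NS(function=NS(name="calendar_lookup"))]),
]
result = ExtractLabeledMessages.extract(messages, ground_truth)
print(result)  # maps message index 1 to the matching goal detail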

wxo_agentic_evaluation/hr_agent_langgraph.py (new file)
@@ -0,0 +1,68 @@
+from langchain.tools import tool
+from langchain.agents import create_agent
+from langchain_openai import ChatOpenAI
+
+import json
+from datetime import datetime
+
+@tool
+def get_assignment_id_hr_usecase(username: str) -> str:
+    """
+    get the assignment id from username
+    :param username: username of the employee
+    """
+    if username=="nwaters":
+        return "15778303"
+    if username=="johndoe":
+        return "15338303"
+    return "not found"
+
+def validate_datetime(date_text):
+    try:
+        format = "%Y-%m-%d"
+        datetime.strptime(date_text, format)
+        return True
+    except ValueError:
+        return False
+
+
+@tool
+def get_timeoff_schedule_hr_usecase(assignment_id: str, start_date: str, end_date: str) -> str:
+    """
+    get timeoff schedule for employee based on assignment id, start date and end date
+    :param assignment_id: assignment_id of the user
+    :param start_date: start date of the timeoff scheduel, in YYYY-MM-DD format
+    :param assignment_id: end date of the timeoff scheduel, in YYYY-MM-DD format
+    """
+
+    if not validate_datetime(start_date):
+        return f"Incorrect date format {start_date}, should be YYYY-MM-DD"
+    if not validate_datetime(end_date):
+        return f"Incorrect date format {end_date}, should be YYYY-MM-DD"
+    if assignment_id=="15338303":
+        return json.dumps(["20250411", "20250311", "20250101"])
+    if assignment_id=="15778303":
+        return json.dumps(["20250105"])
+    return []
+
+
+@tool
+def get_direct_reports_hr_usecase(username: str) -> str:
+    """
+    get direct reports for a given username
+    :param assignment_id: assignment_id of the user
+    """
+
+    return json.dumps(["nwaters", "johndoe"])
+
+
+llm = ChatOpenAI(model="gpt-4o-mini")
+tools = [get_assignment_id_hr_usecase, get_timeoff_schedule_hr_usecase, get_direct_reports_hr_usecase]
+system_prompt="""You are an HR Agent that can answer questions related to timeoff and holiday calendar. Use the tools provided to answer the user's question. If you do not have enough information to answer the question, say so. If you need more information, ask follow up questions."""
+
+agent = create_agent(
+    tools=tools,
+    model=llm,
+    system_prompt=system_prompt
+)
+
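
Note: `create_agent` returns a runnable agent graph. A hypothetical invocation, assuming the LangChain v1 `create_agent` API used above and an `OPENAI_API_KEY` in the environment (the question text is illustrative):

from wxo_agentic_evaluation.hr_agent_langgraph import agent

result = agent.invoke(
    {"messages": [{"role": "user", "content": "What timeoff does nwaters have in 2025?"}]}
)
print(result["messages"][-1].content)  # final assistant reply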

wxo_agentic_evaluation/langfuse_collection.py (new file)
@@ -0,0 +1,60 @@
+import json
+from typing import List, Mapping, Union
+
+import rich
+
+from wxo_agentic_evaluation.type import (
+    LangfuseCollectionModel,
+    LangfuseDatasetModel,
+)
+
+
+class LangfuseCollection:
+    def __init__(self, name, description="", metadata: Mapping[str, str] = {}):
+        self.name = name
+        self.description = description
+        self.metadata = metadata
+
+    def upload(self, paths: Union[str, List[str]]):
+        from langfuse import get_client
+
+        langfuse_client = get_client()
+
+        datasets = []
+        if isinstance(paths, str):
+            paths = [paths]
+
+        for path in paths:
+            with open(path, encoding="utf-8") as f:
+                dataset = json.load(f)
+                dataset = LangfuseDatasetModel(
+                    starting_sentence=dataset.get("starting_sentence", ""),
+                    story=dataset.get("story", ""),
+                    goals=dataset.get("goals"),
+                    goal_details=dataset.get("goal_details"),
+                    agent=dataset.get("agent")
+                )
+                datasets.append(dataset)
+
+        collection = LangfuseCollectionModel(
+            collection_name=self.name,
+            collection_description=self.description,
+            datasets=datasets,
+            metadata=self.metadata,
+        )
+
+        rich.print(
+            f"[g] Uploading {len(collection.datasets)} datasets to '{collection.collection_name}'"
+        )
+        langfuse_client.create_dataset(
+            name=collection.collection_name,
+            description=collection.collection_description,
+            metadata=collection.metadata,
+        )
+
+        for dataset in collection.datasets:
+            langfuse_client.create_dataset_item(
+                dataset_name=collection.collection_name,
+                input=dataset.langfuse_input,
+                expected_output=dataset.langfuse_output,
+            )
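
Note: `upload` reads each JSON test file into a `LangfuseDatasetModel`, creates one Langfuse dataset named after the collection, and adds one item per file. A hypothetical usage sketch, assuming the standard LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY / LANGFUSE_HOST variables are set so `get_client()` can authenticate (the collection name and path are illustrative):

from wxo_agentic_evaluation.langfuse_collection import LangfuseCollection

collection = LangfuseCollection(
    name="hr-agent-smoke",
    description="Simulated HR journeys",
    metadata={"suite": "smoke"},
)
collection.upload(["tests/hr_timeoff.json"])  # a single str path also works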