ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff shows the changes between publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py
@@ -0,0 +1,193 @@
+ import json
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from pydantic import BaseModel
+
+ from wxo_agentic_evaluation.referenceless_eval.metrics.field import NumericField
+ from wxo_agentic_evaluation.referenceless_eval.metrics.metric import Metric
+ from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import (
+     MetricPrompt,
+ )
+ from wxo_agentic_evaluation.referenceless_eval.prompt.runner import (
+     AsyncGen,
+     Prompt,
+     PromptAndSchema,
+     PromptResult,
+     PromptRunner,
+     SyncGen,
+ )
+
+
+ class MetricRunResult(BaseModel):
+     """
+     Structured result for a single metric invocation.
+     """
+
+     metric_name: str
+     jsonschema: Dict[str, Any]
+     prompt: Prompt
+     raw_response: Any
+     numeric_thresholds_checks: Dict[str, bool]
+     error: Optional[str]
+     is_important: bool
+     importance_reason: Optional[str]
+     is_correct: bool
+     correctness_reason: Optional[str]
+     is_issue: bool
+
+
+ class MetricRunner:
+     """
+     Orchestrates running multiple metrics via LLM calls.
+     """
+
+     def __init__(
+         self,
+         entries: Optional[List[Tuple[MetricPrompt, Dict[str, Any]]]] = None,
+     ) -> None:
+         """
+         Args:
+             entries: Optional list of (MetricPrompt, user_kwargs) pairs.
+         """
+         self.entries: List[Dict[str, Any]] = []
+         if entries:
+             for mp, kw in entries:
+                 self.add(mp, kw)
+
+     def add(
+         self, metric_prompt: MetricPrompt, user_kwargs: Dict[str, Any]
+     ) -> None:
+         """
+         Add a metric to run.
+
+         Args:
+             metric_prompt: MetricPrompt instance.
+             user_kwargs: Dict of variables to render the user template.
+         """
+         messages = metric_prompt.build_messages(user_kwargs)
+         self.entries.append(
+             {
+                 "metric_prompt": metric_prompt,
+                 "user_kwargs": user_kwargs,
+                 "messages": messages,
+                 "schema": metric_prompt.metric.to_jsonschema(),
+             }
+         )
+
+     def remove(self, index: int) -> None:
+         """Remove the entry at the given index."""
+         self.entries.pop(index)
+
+     def clear(self) -> None:
+         """Remove all entries."""
+         self.entries.clear()
+
+     def _assemble_prompts(self) -> List[PromptAndSchema]:
+         return [(e["messages"], e["schema"]) for e in self.entries]
+
+     def _process_results(
+         self, prompt_results: List[PromptResult]
+     ) -> List[MetricRunResult]:
+         """
+         Combine PromptResult with metric parsing, threshold checks,
+         importance and correctness determinations.
+         """
+         results: List[MetricRunResult] = []
+
+         for entry, pr in zip(self.entries, prompt_results):
+             mp: MetricPrompt = entry["metric_prompt"]
+             metric: Metric = mp.metric
+
+             # default values
+             numeric_thresholds_checks: Dict[str, bool] = {}
+             err = pr.error
+             is_imp = False
+             imp_reason = None
+             is_corr = False
+             corr_reason = None
+             data = None
+
+             if pr.error is None:
+                 try:
+                     # parse raw response into JSON-compatible dict
+                     raw = pr.response
+                     if isinstance(raw, str):
+                         data = json.loads(raw)
+                     else:
+                         data = raw
+
+                     # numeric threshold checks
+                     for field in metric.fields:
+                         if isinstance(field, NumericField):
+                             val = data.get(field.name)
+                             ok = False
+                             if isinstance(val, (int, float)):
+                                 ok = field.is_within_threshold(val)
+                             numeric_thresholds_checks[field.name] = ok
+
+                     # importance and correctness
+                     is_imp, imp_reason = metric.is_important(data)
+                     is_corr, corr_reason = metric.is_correct(data)
+
+                 except Exception as e:
+                     err = str(e)
+
+             # Build the result model
+             result = MetricRunResult(
+                 metric_name=metric.name,
+                 jsonschema=entry["schema"],
+                 prompt=pr.prompt,
+                 raw_response=data,
+                 numeric_thresholds_checks=numeric_thresholds_checks,
+                 error=err,
+                 is_important=is_imp,
+                 importance_reason=imp_reason,
+                 is_correct=is_corr,
+                 correctness_reason=corr_reason,
+                 is_issue=is_imp and not is_corr,
+             )
+             results.append(result)
+
+         return results
+
+     def run_all(
+         self,
+         gen_fn: SyncGen,
+         prompt_param_name: str = "prompt",
+         schema_param_name: Optional[str] = None,
+         **kwargs: Any,
+     ) -> List[MetricRunResult]:
+         """
+         Run all metrics using a synchronous single-prompt generator.
+         """
+         prompts = self._assemble_prompts()
+         runner = PromptRunner(prompts)
+         pr_results = runner.run_all(
+             gen_fn,
+             prompt_param_name=prompt_param_name,
+             schema_param_name=schema_param_name,
+             **kwargs,
+         )
+         return self._process_results(pr_results)
+
+     async def run_async(
+         self,
+         async_fn: AsyncGen,
+         max_parallel: int = 10,
+         prompt_param_name: str = "prompt",
+         schema_param_name: Optional[str] = None,
+         **kwargs: Any,
+     ) -> List[MetricRunResult]:
+         """
+         Run all metrics using asynchronous single-prompt generation.
+         """
+         prompts = self._assemble_prompts()
+         runner = PromptRunner(prompts)
+         pr_results = await runner.run_async(
+             async_fn,
+             max_parallel=max_parallel,
+             prompt_param_name=prompt_param_name,
+             schema_param_name=schema_param_name,
+             **kwargs,
+         )
+         return self._process_results(pr_results)
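
A minimal usage sketch for the MetricRunner added in this hunk. The generator callable call_llm is hypothetical, and it is assumed (from the prompt_param_name default in run_all) that PromptRunner invokes the callable with the built chat messages under the "prompt" keyword; import paths follow the file list above.

    import json

    from wxo_agentic_evaluation.referenceless_eval.metrics.metrics_runner import MetricRunner
    from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import RelevancePrompt


    def call_llm(prompt, **kwargs):
        # Placeholder generator: a real implementation would send `prompt`
        # (the chat messages built by MetricPrompt) to a model and return its
        # raw output. Here a canned JSON string matching the RelevanceMetric
        # schema keeps the sketch self-contained.
        return json.dumps(
            {
                "explanation": "The response answers the question about sky color.",
                "evidence": "Because of Rayleigh scattering.",
                "output": 1.0,
                "confidence": 0.9,
                "correction": {},
            }
        )


    runner = MetricRunner()
    runner.add(
        RelevancePrompt(),
        {"context": "Why is the sky blue?", "response": "Because of Rayleigh scattering."},
    )
    for result in runner.run_all(call_llm, prompt_param_name="prompt"):
        print(result.metric_name, result.is_issue, result.numeric_thresholds_checks)
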
wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py
@@ -0,0 +1,413 @@
+ import json
+ from typing import Any, Dict, List, Optional, Tuple, Type
+
+ import jsonschema
+ from jinja2 import BaseLoader, Environment, Template
+ from pydantic import BaseModel, ValidationError, create_model
+
+ from wxo_agentic_evaluation.referenceless_eval.metrics.field import (
+     CorrectionField,
+     EvidenceField,
+     ExplanationField,
+     NumericField,
+ )
+ from wxo_agentic_evaluation.referenceless_eval.metrics.metric import Metric
+ from wxo_agentic_evaluation.referenceless_eval.metrics.utils import (
+     remove_threshold_fields,
+     validate_template_context,
+ )
+
+ # Jinja2 environment for string templates
+ _jinja_env = Environment(loader=BaseLoader(), autoescape=False)
+
+
+ class MetricPrompt:
+     """
+     Combines a Metric with system and user prompt templates, plus optional few-shot examples.
+
+     Attributes:
+         metric: Metric instance describing the schema to validate outputs.
+         system_template: Jinja2 Template for the system message.
+         user_template: Jinja2 Template for the user message.
+         examples: List of (user_kwargs, output_dict) pairs.
+     """
+
+     def __init__(
+         self,
+         metric: Metric,
+         system_template: str,
+         user_template: str,
+         *,
+         system_kwargs_defaults: Optional[Dict[str, Any]] = None,
+     ) -> None:
+         """
+         Args:
+             metric: Metric whose JSONSchema will be used for validation.
+             system_template: Template string for the system message; may reference keys.
+             user_template: Template string for the user message; may reference user_kwargs keys.
+             system_kwargs_defaults: Optional default context for system template.
+         """
+         if not isinstance(system_template, str):
+             raise TypeError("system_template must be a string")
+         if not isinstance(user_template, str):
+             raise TypeError("user_template must be a string")
+         if not system_kwargs_defaults:
+             system_kwargs_defaults = {}  # Default to empty dict if None
+         if not isinstance(system_kwargs_defaults, dict):
+             raise TypeError("system_kwargs_defaults must be a dict")
+         if not isinstance(metric, Metric):
+             raise TypeError("metric must be an instance of Metric")
+
+         self._system_template_str: str = system_template
+         self._user_template_str: str = user_template
+
+         # Compile Jinja2 templates
+         self._system_tmpl: Template = _jinja_env.from_string(system_template)
+         self._user_tmpl: Template = _jinja_env.from_string(user_template)
+
+         # Store defaults for system context
+         # This allows overriding system context without modifying the template
+         # during prompt building
+         self.system_kwargs_defaults: Dict[str, Any] = (
+             system_kwargs_defaults.copy()
+         )
+
+         # Initialize examples list
+         # This will hold (user_kwargs, output) pairs for few-shot prompting
+         self.examples: List[Tuple[Dict[str, Any], Dict[str, Any]]] = []
+
+         # Store the metric for validation
+         # This allows the prompt to validate example outputs against the metric's schema
+         self.metric = metric
+
+     # --- Getters and Setters ---
+
+     def get_system_template(self) -> str:
+         return self._system_tmpl.source
+
+     def set_system_template(self, template_str: str) -> None:
+         self._system_tmpl = _jinja_env.from_string(template_str)
+
+     def get_user_template(self) -> str:
+         return self._user_tmpl.source
+
+     def set_user_template(self, template_str: str) -> None:
+         """
+         Setting a new user template clears existing examples.
+         """
+         self._user_tmpl = _jinja_env.from_string(template_str)
+         self.examples.clear()
+
+     def get_system_kwargs_defaults(self) -> Dict[str, Any]:
+         return dict(self.system_kwargs_defaults)
+
+     def set_system_kwargs_defaults(self, defaults: Dict[str, Any]) -> None:
+         self.system_kwargs_defaults = defaults
+
+     # --- Example Management ---
+
+     def add_example(
+         self, user_kwargs: Dict[str, Any], output: Dict[str, Any]
+     ) -> None:
+         """
+         Add a few-shot example.
+
+         Validates that `output` adheres to this.metric's JSONSchema.
+
+         Args:
+             user_kwargs: Variables for rendering the user_template.
+             output: Dict matching the metric's schema.
+
+         Raises:
+             ValidationError if output invalid.
+         """
+         schema = self.metric.to_jsonschema()
+         # 1) JSONSchema structural validation
+         jsonschema.validate(instance=output, schema=schema)
+         # 2) Pydantic type/enum validation
+         Model: Type[BaseModel] = self._build_response_model()
+         try:
+             Model.model_validate(output)
+         except ValidationError as e:
+             raise ValueError(f"Example output failed validation: {e}")
+         self.examples.append((user_kwargs, output))
+
+     # --- Prompt Building ---
+
+     def build_messages(
+         self,
+         user_kwargs: Dict[str, Any],
+         system_kwargs: Optional[Dict[str, Any]] = None,
+     ) -> List[Dict[str, str]]:
+         """
+         Build the full chat messages sequence:
+
+         1. System message rendered with:
+            - metric_jsonschema
+            - plus any system_kwargs (overrides defaults)
+         2. For each example:
+            - User message from user_template with example user_kwargs
+            - Assistant message: JSON dump of example output
+         3. Final user message with provided user_kwargs
+
+         Args:
+             user_kwargs: Variables for the final user prompt.
+             system_kwargs: Optional overrides for system template context.
+
+         Returns:
+             List of {"role": "...", "content": "..."} dicts.
+         """
+         msgs: List[Dict[str, str]] = []
+         # Prepare system context
+         ctx = self.system_kwargs_defaults
+         ctx["metric_jsonschema"] = json.dumps(
+             remove_threshold_fields(self.metric.to_jsonschema())
+         )
+
+         if system_kwargs:
+             ctx.update(system_kwargs)
+
+         # 1) System message
+         sys_text = self._system_tmpl.render(**ctx)
+         msgs.append({"role": "system", "content": sys_text})
+
+         try:
+             # 2) Few-shot examples
+             for ex_user_kwargs, ex_output in self.examples:
+                 ex_user_kwargs_parsed = {
+                     k: json.dumps(d) for k, d in ex_user_kwargs.items()
+                 }
+                 user_text = self._user_tmpl.render(**ex_user_kwargs_parsed)
+                 msgs.append({"role": "user", "content": user_text})
+                 assistant_text = json.dumps(ex_output, indent=None)
+                 msgs.append({"role": "assistant", "content": assistant_text})
+
+             # 3) Final user message
+             final_user_kwargs_parsed = {}
+             for key, obj in user_kwargs.items():
+                 final_user_kwargs_parsed[key] = json.dumps(obj)
+             final_user = self._user_tmpl.render(**final_user_kwargs_parsed)
+         except Exception as e:
+             raise e
+
+         msgs.append({"role": "user", "content": final_user})
+
+         return msgs
+
+     def build_messages(
+         self,
+         user_kwargs: Dict[str, Any],
+         system_kwargs: Optional[Dict[str, Any]] = None,
+     ) -> List[Dict[str, str]]:
+         """
+         Build the full chat messages sequence:
+
+         1. System message rendered with:
+            - metric_jsonschema
+            - plus any system_kwargs (overrides defaults)
+         2. For each example:
+            - User message from user_template with example user_kwargs
+            - Assistant message: JSON dump of example output
+         3. Final user message with provided user_kwargs
+
+         Args:
+             user_kwargs: Variables for the final user prompt.
+             system_kwargs: Optional overrides for system template context.
+
+         Returns:
+             List of {"role": "...", "content": "..."} dicts.
+         """
+         msgs: List[Dict[str, str]] = []
+
+         # Prepare system context
+         ctx = self.system_kwargs_defaults.copy()
+         ctx["metric_jsonschema"] = json.dumps(
+             remove_threshold_fields(self.metric.to_jsonschema())
+         )
+         if system_kwargs:
+             ctx.update(system_kwargs)
+
+         # Validate and render system message
+         validate_template_context(
+             _jinja_env, self._system_template_str, ctx, "system_template"
+         )
+
+         sys_text = self._system_tmpl.render(**ctx)
+         msgs.append({"role": "system", "content": sys_text})
+
+         try:
+             # Few-shot examples
+             for ex_user_kwargs, ex_output in self.examples:
+                 ex_user_kwargs_parsed = {
+                     k: json.dumps(d) for k, d in ex_user_kwargs.items()
+                 }
+                 validate_template_context(
+                     _jinja_env,
+                     self._user_template_str,
+                     ex_user_kwargs_parsed,
+                     "user_template (example)",
+                 )
+                 user_text = self._user_tmpl.render(**ex_user_kwargs_parsed)
+                 msgs.append({"role": "user", "content": user_text})
+
+                 assistant_text = json.dumps(ex_output, indent=None)
+                 msgs.append({"role": "assistant", "content": assistant_text})
+
+             # Final user message
+             final_user_kwargs_parsed = {
+                 k: json.dumps(obj) for k, obj in user_kwargs.items()
+             }
+             validate_template_context(
+                 _jinja_env,
+                 self._user_template_str,
+                 final_user_kwargs_parsed,
+                 "user_template (final)",
+             )
+             # Render final user message
+             final_user = self._user_tmpl.render(**final_user_kwargs_parsed)
+
+         except Exception as e:
+             raise e
+
+         msgs.append({"role": "user", "content": final_user})
+         return msgs
+
+     def _build_response_model(self) -> Type[BaseModel]:
+         """
+         Dynamically construct a Pydantic model matching metric.to_jsonschema().
+         Used to enforce types beyond JSONSchema.
+         """
+         schema = self.metric.to_jsonschema()
+         props = schema.get("properties", {})
+         fields: Dict[str, Tuple[Any, Any]] = {}
+         for name, subs in props.items():
+             jtype = subs.get("type")
+             # map JSONSchema types -> Python types
+             if name in schema.get("required", []):
+                 secondary_type = ...
+             else:
+                 secondary_type = None
+
+             if jtype == "integer":
+                 py = (int, secondary_type)
+             elif jtype == "number":
+                 py = (float, secondary_type)
+             elif jtype == "string":
+                 py = (str, secondary_type)
+             elif jtype == "boolean":
+                 py = (bool, secondary_type)
+             elif jtype == "object":
+                 py = (dict, secondary_type)
+             else:
+                 py = (Any, secondary_type)
+             # handle enums
+             if "enum" in subs:
+                 from typing import Literal
+
+                 enum_vals = subs["enum"]
+                 py = (Literal[tuple(enum_vals)], secondary_type)
+
+             # handle additional properties
+             if subs.get("additionalProperties", False):
+                 # If additionalProperties is true, we allow any type
+                 py = (Dict[str, Any], secondary_type)
+             fields[name] = py
+
+         Model = create_model(schema.get("title", "ResponseModel"), **fields)
+         return Model
+
+
+ # --- Example Subclass: RelevancePrompt ---
+
+
+ class RelevanceMetric(Metric):
+     """
+     Metric for assessing relevance of a response to its context.
+     """
+
+     def __init__(self) -> None:
+         desc = "Rate how relevant the response is to the given context on a 0-1 scale."
+         super().__init__(
+             name="Relevance",
+             description=desc,
+             fields=[
+                 ExplanationField(
+                     name="explanation",
+                     json_type="string",
+                     description="Why the response is or is not relevant, step by step.",
+                 ),
+                 EvidenceField(
+                     name="evidence",
+                     json_type="string",
+                     description="Portion of context or response that supports your relevance rating.",
+                 ),
+                 NumericField(
+                     name="output",
+                     json_type="number",
+                     description="Relevance score from 0.0 (not relevant) to 1.0 (fully relevant).",
+                     jsonschema_extra={"minimum": 0.0, "maximum": 1.0},
+                     extra_params={"threshold_low": 0.0, "threshold_high": 1.0},
+                 ),
+                 NumericField(
+                     name="confidence",
+                     json_type="number",
+                     description="Confidence in your relevance judgment (0.0-1.0).",
+                     jsonschema_extra={"minimum": 0.0, "maximum": 1.0},
+                     extra_params={"threshold_low": 0.0, "threshold_high": 1.0},
+                 ),
+                 CorrectionField(
+                     name="correction",
+                     json_type="object",
+                     description="If relevance is low, suggest how to improve relevance.",
+                 ),
+             ],
+         )
+
+
+ class RelevancePrompt(MetricPrompt):
+     """
+     Prompt builder specialized for the RelevanceMetric.
+     Provides default templates and example usage.
+     """
+
+     def __init__(self) -> None:
+         metric = RelevanceMetric()
+         system_tmpl = (
+             "You are an expert judge that assesses response relevance. "
+             "Here is the JSONSchema for your response:\n"
+             "{{ metric_jsonschema }}"
+         )
+         user_tmpl = (
+             "Context: {{ context }}\n"
+             "Response: {{ response }}\n"
+             "Provide your evaluation as JSON as specified in the system prompt."
+         )
+         super().__init__(metric, system_tmpl, user_tmpl)
+
+         # Initialize default few-shot examples
+         self.add_example(
+             {
+                 "context": "The sky is blue.",
+                 "response": "The sky appears azure due to Rayleigh scattering.",
+             },
+             {
+                 "evidence": "The sky appears azure due to Rayleigh scattering.",
+                 "explanation": "The response directly addresses sky color by naming scattering physics.",
+                 "output": 1.0,
+                 "confidence": 0.9,
+                 "correction": {},
+             },
+         )
+         self.add_example(
+             {
+                 "context": "What is the capital of France?",
+                 "response": "The moon orbits Earth every 27 days.",
+             },
+             {
+                 "evidence": "The moon orbits Earth every 27 days.",
+                 "explanation": "The response is about lunar orbit, unrelated to capitals.",
+                 "output": 0.0,
+                 "confidence": 0.8,
+                 "correction": {"suggestion": "The capital of France is Paris."},
+             },
+         )
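
A short sketch of the chat sequence the RelevancePrompt above assembles; the context and response values are illustrative, and only the build_messages behavior and the two default few-shot examples defined in this hunk are assumed.

    from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import RelevancePrompt

    prompt = RelevancePrompt()
    messages = prompt.build_messages(
        {"context": "What is 2 + 2?", "response": "2 + 2 equals 4."}
    )
    # One system message carrying the metric's JSON schema, then a user/assistant
    # pair for each of the two default few-shot examples, then the final user turn.
    assert [m["role"] for m in messages] == [
        "system", "user", "assistant", "user", "assistant", "user",
    ]
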
wxo_agentic_evaluation/referenceless_eval/metrics/utils.py
@@ -0,0 +1,46 @@
+ from typing import Any, Dict
+
+ from jinja2 import Environment, meta
+
+
+ def remove_threshold_fields(schema: dict) -> dict:
+     """
+     Recursively removes 'threshold_low' and 'threshold_high' fields from a JSON schema.
+     """
+     if isinstance(schema, dict):
+         # Remove the threshold fields if present
+         schema.pop("threshold_low", None)
+         schema.pop("threshold_high", None)
+         # Recurse into nested dictionaries and lists
+         for key, value in schema.items():
+             if isinstance(value, dict):
+                 schema[key] = remove_threshold_fields(value)
+             elif isinstance(value, list):
+                 schema[key] = [
+                     (
+                         remove_threshold_fields(item)
+                         if isinstance(item, dict)
+                         else item
+                     )
+                     for item in value
+                 ]
+     return schema
+
+
+ def validate_template_context(
+     env: Environment,
+     template_str: str,
+     context: Dict[str, Any],
+     template_name: str = "",
+ ):
+     parsed = env.parse(template_str)
+     required_vars = meta.find_undeclared_variables(parsed)
+     missing_or_empty = [
+         var
+         for var in required_vars
+         if var not in context or context[var] in (None, "", [], {}, ())
+     ]
+     if missing_or_empty:
+         raise ValueError(
+             f"Missing or empty variables in template '{template_name or 'unnamed'}': {missing_or_empty}"
+         )
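
A brief illustration of the two helpers above; the template strings and schema fragment are made up for the example.

    from jinja2 import BaseLoader, Environment

    from wxo_agentic_evaluation.referenceless_eval.metrics.utils import (
        remove_threshold_fields,
        validate_template_context,
    )

    env = Environment(loader=BaseLoader(), autoescape=False)

    # Passes: every variable referenced by the template is present and non-empty.
    validate_template_context(env, "Context: {{ context }}", {"context": "The sky is blue."})

    # Raises ValueError: 'response' is referenced by the template but missing.
    try:
        validate_template_context(env, "{{ context }} / {{ response }}", {"context": "x"}, "user_template")
    except ValueError as err:
        print(err)

    # Threshold hints are stripped before the schema is shown to the model.
    schema = {"properties": {"output": {"type": "number", "threshold_low": 0.5}}}
    print(remove_threshold_fields(schema))
    # -> {'properties': {'output': {'type': 'number'}}}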