ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA +35 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/RECORD +65 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +9 -3
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation.py +42 -0
  14. wxo_agentic_evaluation/evaluation_package.py +117 -70
  15. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  16. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  17. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  18. wxo_agentic_evaluation/external_agent/types.py +12 -5
  19. wxo_agentic_evaluation/inference_backend.py +183 -79
  20. wxo_agentic_evaluation/llm_matching.py +4 -3
  21. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  22. wxo_agentic_evaluation/llm_user.py +7 -3
  23. wxo_agentic_evaluation/main.py +175 -67
  24. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  25. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  26. wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
  27. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +176 -0
  28. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
  29. wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
  30. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  31. wxo_agentic_evaluation/quick_eval.py +49 -23
  32. wxo_agentic_evaluation/record_chat.py +70 -33
  33. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  34. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  35. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  38. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  39. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  40. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  41. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  42. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  43. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  44. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  45. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  46. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  47. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  48. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  49. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  50. wxo_agentic_evaluation/resource_map.py +2 -1
  51. wxo_agentic_evaluation/service_instance.py +103 -21
  52. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  53. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +216 -34
  54. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  55. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  56. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  57. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  58. wxo_agentic_evaluation/tool_planner.py +128 -44
  59. wxo_agentic_evaluation/type.py +12 -9
  60. wxo_agentic_evaluation/utils/__init__.py +1 -0
  61. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  62. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  63. wxo_agentic_evaluation/utils/utils.py +83 -52
  64. ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
  65. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/WHEEL +0 -0
  66. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py

@@ -1,8 +1,6 @@
 from typing import Dict, List

-from jsonschema import (
-    Draft7Validator,
-)
+from jsonschema import Draft7Validator

 from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
     StaticMetricResult,
@@ -27,7 +25,9 @@ _STATIC_CHECKS: Dict[str, str] = {
 }


-def evaluate_static(apis_specs: List[ToolSpec], api_call: ToolCall) -> StaticResult:
+def evaluate_static(
+    apis_specs: List[ToolSpec], api_call: ToolCall
+) -> StaticResult:
     """
     Perform static validation on a single tool call.

@@ -97,7 +97,9 @@ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
     errors: Dict[str, str] = {}

     # 1) Function existence
-    spec = next((s for s in specs if s.function.name == call.function.name), None)
+    spec = next(
+        (s for s in specs if s.function.name == call.function.name), None
+    )
     if not spec:
         errors["non_existent_function"] = (
             f"Function '{call.function.name}' does not exist in the provided API specifications:"
@@ -110,7 +112,9 @@ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
     parsed_arguments = call.function.parsed_arguments

     # 2) Parameter existence check
-    if non_existent_params := set(parsed_arguments.keys()) - set(properties.keys()):
+    if non_existent_params := set(parsed_arguments.keys()) - set(
+        properties.keys()
+    ):
         errors["non_existent_parameter"] = (
             f"Parameters not defined in function '{call.function.name}': "
             f"{', '.join(sorted(non_existent_params))}. "
@@ -126,7 +130,9 @@ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
     other_errors = []

     for error in validator.iter_errors(parsed_arguments):
-        field = ".".join(str(x) for x in error.path) if error.path else "unknown"
+        field = (
+            ".".join(str(x) for x in error.path) if error.path else "unknown"
+        )
         if error.validator == "required":
             missing_required.append(error.message)
         elif error.validator == "type":
@@ -145,12 +151,12 @@ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
             "Incorrect parameter type(s): " + "; ".join(incorrect_types)
         )
     if invalid_enum:
-        errors["allowed_values_violation"] = "Invalid parameter value(s): " + "; ".join(
-            invalid_enum
+        errors["allowed_values_violation"] = (
+            "Invalid parameter value(s): " + "; ".join(invalid_enum)
         )
     if other_errors:
-        errors["json_schema_validation"] = "Other validation error(s): " + "; ".join(
-            other_errors
+        errors["json_schema_validation"] = (
+            "Other validation error(s): " + "; ".join(other_errors)
         )

     return errors
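
The static_checker.py hunks above are formatting-only, but they make the module's validation pattern easy to see: resolve the tool spec by function name, flag arguments that are not in the spec's properties, then run the parsed arguments through jsonschema's Draft7Validator and bucket each error by its validator keyword. Below is a minimal standalone sketch of that bucketing step; the enum branch and the error keys other than allowed_values_violation and json_schema_validation are assumptions for illustration, not lines taken from the package.

# Minimal sketch, not the package's code: categorize jsonschema validation
# errors the way the static checker above does. Keys marked "assumed" are
# illustrative only.
from typing import Dict, List

from jsonschema import Draft7Validator


def categorize_argument_errors(
    parameters_schema: dict, parsed_arguments: dict
) -> Dict[str, str]:
    validator = Draft7Validator(parameters_schema)
    missing_required: List[str] = []
    incorrect_types: List[str] = []
    invalid_enum: List[str] = []
    other_errors: List[str] = []

    for error in validator.iter_errors(parsed_arguments):
        field = ".".join(str(x) for x in error.path) if error.path else "unknown"
        if error.validator == "required":
            missing_required.append(error.message)
        elif error.validator == "type":
            incorrect_types.append(f"{field}: {error.message}")
        elif error.validator == "enum":  # assumed branch; not shown in the hunks
            invalid_enum.append(f"{field}: {error.message}")
        else:
            other_errors.append(f"{field}: {error.message}")

    errors: Dict[str, str] = {}
    if missing_required:  # key name assumed
        errors["missing_required_parameter"] = "; ".join(missing_required)
    if incorrect_types:  # key name assumed
        errors["incorrect_parameter_type"] = (
            "Incorrect parameter type(s): " + "; ".join(incorrect_types)
        )
    if invalid_enum:
        errors["allowed_values_violation"] = (
            "Invalid parameter value(s): " + "; ".join(invalid_enum)
        )
    if other_errors:
        errors["json_schema_validation"] = (
            "Other validation error(s): " + "; ".join(other_errors)
        )
    return errors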
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py

@@ -2,17 +2,10 @@ from __future__ import annotations

 import json
 from types import NoneType
-from typing import (
-    Any,
-    Dict,
-    List,
-    Literal,
-    Optional,
-    Union,
-)
-from typing_extensions import Self
+from typing import Any, Dict, List, Literal, Optional, Union

 from pydantic import BaseModel, Field, ValidationError, model_validator
+from typing_extensions import Self

 from wxo_agentic_evaluation.referenceless_eval.metrics import MetricRunResult

@@ -32,12 +25,14 @@ class FunctionCallMetric(BaseModel):
     jsonschema: Dict[str, Any] = Field(
         ..., description="JSON Schema dict for this metric's output."
     )
-    examples: Optional[List[Dict[Literal["user_kwargs", "output"], Any]]] = Field(
-        None,
-        description=(
-            "List of example inputs and outputs for this metric; "
-            "each example is a dict with 'user_kwargs' and 'output' keys."
-        ),
+    examples: Optional[List[Dict[Literal["user_kwargs", "output"], Any]]] = (
+        Field(
+            None,
+            description=(
+                "List of example inputs and outputs for this metric; "
+                "each example is a dict with 'user_kwargs' and 'output' keys."
+            ),
+        )
     )


@@ -52,7 +47,8 @@ class StaticMetricResult(BaseModel):
     """

     description: str = Field(
-        ..., description="Human-readable description of this static validation check."
+        ...,
+        description="Human-readable description of this static validation check.",
     )
     valid: bool = Field(
         ..., description="True if this static check passed; False otherwise."
@@ -73,7 +69,9 @@ class StaticResult(BaseModel):

     metrics: Dict[str, StaticMetricResult] = Field(
         ...,
-        description=("Mapping from each static-check name to its StaticMetricResult."),
+        description=(
+            "Mapping from each static-check name to its StaticMetricResult."
+        ),
     )
     final_decision: bool = Field(
         ...,
@@ -133,7 +131,8 @@ class SemanticMetricResult(BaseModel):
     error: Optional[str] = Field(
         None,
         description=(
-            "Error message if prompt generation or parsing failed; " "otherwise None."
+            "Error message if prompt generation or parsing failed; "
+            "otherwise None."
         ),
     )
     is_correct: bool = Field(
@@ -157,11 +156,11 @@ class SemanticMetricResult(BaseModel):
         ),
     )

-    @model_validator(mode='after')
+    @model_validator(mode="after")
     def raw_response_json(self) -> Self:
         if isinstance(self.raw_response, str):
             self.raw_response = json.loads(self.raw_response)
-
+
         return self

     @classmethod
@@ -211,7 +210,9 @@ class SemanticCategoryResult(BaseModel):

     metrics: Optional[Dict[str, SemanticMetricResult]] = Field(
         None,
-        description=("Mapping metric_name -> SemanticMetricResult for this category."),
+        description=(
+            "Mapping metric_name -> SemanticMetricResult for this category."
+        ),
     )
     avg_score: Optional[float] = Field(
         None,
@@ -222,7 +223,9 @@
     )

     @classmethod
-    def from_results(cls, results: List[MetricRunResult]) -> "SemanticCategoryResult":
+    def from_results(
+        cls, results: List[MetricRunResult]
+    ) -> "SemanticCategoryResult":
         """
         Build a category result from a list of MetricRunResult objects.
         """
@@ -249,11 +252,15 @@ class SemanticResult(BaseModel):

     general: Optional[SemanticCategoryResult] = Field(
         None,
-        description=("Results of general tool-call metrics, if any; otherwise None."),
+        description=(
+            "Results of general tool-call metrics, if any; otherwise None."
+        ),
     )
     function_selection: Optional[SemanticCategoryResult] = Field(
         None,
-        description=("Results of function-selection metrics, if any; otherwise None."),
+        description=(
+            "Results of function-selection metrics, if any; otherwise None."
+        ),
     )
     parameter: Optional[Dict[str, SemanticCategoryResult]] = Field(
         None,
@@ -302,7 +309,8 @@ class TransformResult(BaseModel):
         ),
     )
     execution_output: Any = Field(
-        None, description="The actual output of executing the transformation code."
+        None,
+        description="The actual output of executing the transformation code.",
     )
     correction: Optional[str] = Field(
         None,
@@ -311,7 +319,8 @@
     error: Optional[str] = Field(
         None,
         description=(
-            "Error message if code generation or execution failed; " "otherwise None."
+            "Error message if code generation or execution failed; "
+            "otherwise None."
        ),
     )

@@ -356,7 +365,9 @@ class PipelineResult(BaseModel):
     Final output of the function-calling pipeline for one tool call.
     """

-    inputs: FunctionCallInput = Field(..., description="Echo of the pipeline inputs.")
+    inputs: FunctionCallInput = Field(
+        ..., description="Echo of the pipeline inputs."
+    )
     static: Optional[StaticResult] = Field(
         None, description="Static schema-validation results, if enabled."
     )
@@ -430,7 +441,9 @@ class PipelineResult(BaseModel):
         if param_avgs:
             cat_avgs.append(sum(param_avgs) / len(param_avgs))

-        values.overall_avg_score = sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
+        values.overall_avg_score = (
+            sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
+        )
         values.overall_valid = ok
         return values

@@ -506,7 +519,9 @@ class ToolFunctionCall(BaseModel):
     Parsed representation of an LLM's function call response.
     """

-    name: str = Field(..., description="Name of the function the LLM chose to call.")
+    name: str = Field(
+        ..., description="Name of the function the LLM chose to call."
+    )
     arguments: str = Field(
         ..., description="JSON-encoded string of the call's arguments."
     )
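
One behavioral detail worth noting in the types.py hunks: SemanticMetricResult uses a pydantic @model_validator(mode="after") hook (raw_response_json) to parse a JSON-encoded raw_response string into a dict after the model is constructed. A minimal sketch of that pydantic v2 pattern follows, using a toy model rather than the package's class.

# Minimal sketch of the @model_validator(mode="after") pattern used by
# SemanticMetricResult above; this toy model is not the package's class.
import json
from typing import Union

from pydantic import BaseModel, model_validator
from typing_extensions import Self


class ToyMetricResult(BaseModel):
    raw_response: Union[str, dict, None] = None

    @model_validator(mode="after")
    def parse_raw_response(self) -> Self:
        # After validation, coerce a JSON-encoded string into a dict.
        if isinstance(self.raw_response, str):
            self.raw_response = json.loads(self.raw_response)
        return self


result = ToyMetricResult(raw_response='{"is_correct": true, "confidence": 0.9}')
assert isinstance(result.raw_response, dict)

Running validation in "after" mode means the hook sees the fully constructed model, so it can normalize one field based on another without re-triggering field validation.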
wxo_agentic_evaluation/referenceless_eval/metrics/field.py

@@ -60,7 +60,9 @@ class BaseField(BaseModel, ABC):
             if field_cls.can_handle(name, schema):
                 desc = schema.get("description", "")
                 extra = {
-                    k: v for k, v in schema.items() if k not in ("type", "description")
+                    k: v
+                    for k, v in schema.items()
+                    if k not in ("type", "description")
                 }
                 return field_cls(
                     name=name,
@@ -74,7 +76,9 @@ class BaseField(BaseModel, ABC):
             json_type=schema.get("type", "string"),
             description=schema.get("description", ""),
             jsonschema_extra={
-                k: v for k, v in schema.items() if k not in ("type", "description")
+                k: v
+                for k, v in schema.items()
+                if k not in ("type", "description")
             },
             extra_params={},
         )
@@ -122,10 +126,12 @@ class NumericField(BaseField):
     """

     threshold_low: Optional[float] = PydanticField(
-        None, description="Lower bound for correctness checks (not in JSONSchema)."
+        None,
+        description="Lower bound for correctness checks (not in JSONSchema).",
     )
     threshold_high: Optional[float] = PydanticField(
-        None, description="Upper bound for correctness checks (not in JSONSchema)."
+        None,
+        description="Upper bound for correctness checks (not in JSONSchema).",
     )

     __abstract__ = False
@@ -153,7 +159,9 @@ class NumericField(BaseField):
             json_type=schema.get("type", "number"),
             description=schema.get("description", ""),
             jsonschema_extra={
-                k: v for k, v in schema.items() if k not in ("type", "description")
+                k: v
+                for k, v in schema.items()
+                if k not in ("type", "description")
             },
             extra_params={},
         )
wxo_agentic_evaluation/referenceless_eval/metrics/metric.py

@@ -131,7 +131,9 @@ class Metric:
             additional_properties=additional_props,
         )

-    def is_important(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+    def is_important(
+        self, result: Dict[str, Any]
+    ) -> Tuple[bool, Optional[str]]:
         """
         A result is 'important' if its confidence lies within the defined confidence thresholds.

@@ -146,7 +148,9 @@ class Metric:
         except (TypeError, ValueError):
             return False, "Invalid confidence value"
         # locate the confidence field
-        conf_field = next((f for f in self.fields if f.name == "confidence"), None)
+        conf_field = next(
+            (f for f in self.fields if f.name == "confidence"), None
+        )
         if isinstance(conf_field, NumericField):
             ok = conf_field.is_within_threshold(conf)
             reason = (
@@ -266,7 +270,10 @@ class StandardMetric(Metric):
             json_type="number",
             description=f"Confidence in the output value (range {min_conf} to {max_conf}).",
             jsonschema_extra={"minimum": min_conf, "maximum": max_conf},
-            extra_params={"threshold_low": min_conf, "threshold_high": max_conf},
+            extra_params={
+                "threshold_low": min_conf,
+                "threshold_high": max_conf,
+            },
         )
         correction = CorrectionField(
             name="correction",
@@ -277,7 +284,9 @@ class StandardMetric(Metric):
         fields = [explanation, evidence, output, confidence, correction]
         super().__init__(name=name, description=description, fields=fields)

-    def is_important(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+    def is_important(
+        self, result: Dict[str, Any]
+    ) -> Tuple[bool, Optional[str]]:
         """
         A result is 'important' if its confidence lies within the defined confidence thresholds.

@@ -292,7 +301,9 @@ class StandardMetric(Metric):
         except (TypeError, ValueError):
             return False, "Invalid confidence value"
         # locate the confidence field
-        conf_field = next((f for f in self.fields if f.name == "confidence"), None)
+        conf_field = next(
+            (f for f in self.fields if f.name == "confidence"), None
+        )
         if isinstance(conf_field, NumericField):
             ok = conf_field.is_within_threshold(conf)
             reason = (
wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py

@@ -5,7 +5,9 @@ from pydantic import BaseModel

 from wxo_agentic_evaluation.referenceless_eval.metrics.field import NumericField
 from wxo_agentic_evaluation.referenceless_eval.metrics.metric import Metric
-from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import MetricPrompt
+from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import (
+    MetricPrompt,
+)
 from wxo_agentic_evaluation.referenceless_eval.prompt.runner import (
     AsyncGen,
     Prompt,
@@ -40,7 +42,8 @@ class MetricRunner:
     """

     def __init__(
-        self, entries: Optional[List[Tuple[MetricPrompt, Dict[str, Any]]]] = None
+        self,
+        entries: Optional[List[Tuple[MetricPrompt, Dict[str, Any]]]] = None,
     ) -> None:
         """
         Args:
@@ -51,7 +54,9 @@ class MetricRunner:
             for mp, kw in entries:
                 self.add(mp, kw)

-    def add(self, metric_prompt: MetricPrompt, user_kwargs: Dict[str, Any]) -> None:
+    def add(
+        self, metric_prompt: MetricPrompt, user_kwargs: Dict[str, Any]
+    ) -> None:
         """
         Add a metric to run.

wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py

@@ -68,7 +68,9 @@ class MetricPrompt:
         # Store defaults for system context
         # This allows overriding system context without modifying the template
         # during prompt building
-        self.system_kwargs_defaults: Dict[str, Any] = system_kwargs_defaults.copy()
+        self.system_kwargs_defaults: Dict[str, Any] = (
+            system_kwargs_defaults.copy()
+        )

         # Initialize examples list
         # This will hold (user_kwargs, output) pairs for few-shot prompting
@@ -104,7 +106,9 @@ class MetricPrompt:

     # --- Example Management ---

-    def add_example(self, user_kwargs: Dict[str, Any], output: Dict[str, Any]) -> None:
+    def add_example(
+        self, user_kwargs: Dict[str, Any], output: Dict[str, Any]
+    ) -> None:
         """
         Add a few-shot example.

wxo_agentic_evaluation/referenceless_eval/metrics/utils.py

@@ -17,7 +17,11 @@ def remove_threshold_fields(schema: dict) -> dict:
             schema[key] = remove_threshold_fields(value)
         elif isinstance(value, list):
             schema[key] = [
-                remove_threshold_fields(item) if isinstance(item, dict) else item
+                (
+                    remove_threshold_fields(item)
+                    if isinstance(item, dict)
+                    else item
+                )
                 for item in value
             ]
     return schema
wxo_agentic_evaluation/referenceless_eval/prompt/runner.py

@@ -1,10 +1,22 @@
 import asyncio
-from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple, TypeVar, Union
+from typing import (
+    Any,
+    Awaitable,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    TypeVar,
+    Union,
+)

 from pydantic import BaseModel

 Prompt = Union[str, List[Dict[str, Any]]]
-PromptAndSchema = Tuple[Union[str, List[Dict[str, Any]]], Optional[Dict[str, Any]]]
+PromptAndSchema = Tuple[
+    Union[str, List[Dict[str, Any]]], Optional[Dict[str, Any]]
+]
 SyncGen = Callable[[Prompt], Union[str, Any]]
 BatchGen = Callable[[List[Prompt]], List[Union[str, Any]]]
 AsyncGen = Callable[[Prompt], Awaitable[Union[str, Any]]]
@@ -137,7 +149,8 @@ class PromptRunner:
                 return index, PromptResult(prompt=prompt, error=str(e))

         tasks = [
-            asyncio.create_task(_run_one(i, p)) for i, p in enumerate(self.prompts)
+            asyncio.create_task(_run_one(i, p))
+            for i, p in enumerate(self.prompts)
         ]
         indexed_results = await asyncio.gather(*tasks)
         # Sort results to match original order
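
The PromptRunner hunk only rewraps a comprehension, but it shows the concurrency pattern the runner relies on: schedule each prompt with asyncio.create_task alongside its index, await the tasks with asyncio.gather, and re-sort by index so outputs line up with inputs. A self-contained sketch of that pattern, with a stand-in coroutine instead of the real generator call:

# Minimal sketch of the index-tagged create_task / gather / re-sort pattern
# shown in the PromptRunner hunk; run_one() is a stand-in for the real call.
import asyncio
from typing import List, Tuple


async def run_one(index: int, prompt: str) -> Tuple[int, str]:
    await asyncio.sleep(0.01 * (len(prompt) % 3))  # simulate variable latency
    return index, f"result for {prompt!r}"


async def run_all(prompts: List[str]) -> List[str]:
    tasks = [
        asyncio.create_task(run_one(i, p))
        for i, p in enumerate(prompts)
    ]
    indexed_results = await asyncio.gather(*tasks)
    # Sort by the original index so results line up with the input prompts.
    return [result for _, result in sorted(indexed_results)]


print(asyncio.run(run_all(["a", "bb", "ccc"])))

asyncio.gather already preserves argument order, so carrying the index is mostly a safeguard for flows that collect results out of order, for example via asyncio.as_completed.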
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py

@@ -14,30 +14,37 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
     ToolCall,
     ToolSpec,
 )
-from wxo_agentic_evaluation.type import Message
 from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.type import Message
+

 class ReferencelessEvaluation:
     """
-    Note: static.final_decison, if `True` -> then all static metrics were valid. If false, atleast one of the static metrics failed. Look at explanation for reasoning
-    Note: if static.final_decision == True, check semantic metrics. Semantic metrics **not** run if static.final_decision is False.
-    ---
-    Note: For semantic metrics, check agentic constraints. If agent-constraints == False, no point in checking others. If true, check others.
-    Note: METRIC_FUNCTION_SELECTION_APPROPRIATENESS == False, implies that the LLM should have called some other function/tool before *OR* it is a redundant call.
-    Note: When parsing the semantic metrics, check for `is_correct` field. if `false` there is some mistake that the LLMaJ found in that tool call.
+    Note: static.final_decison, if `True` -> then all static metrics were valid. If false, atleast one of the static metrics failed. Look at explanation for reasoning
+    Note: if static.final_decision == True, check semantic metrics. Semantic metrics **not** run if static.final_decision is False.
+    ---
+    Note: For semantic metrics, check agentic constraints. If agent-constraints == False, no point in checking others. If true, check others.
+    Note: METRIC_FUNCTION_SELECTION_APPROPRIATENESS == False, implies that the LLM should have called some other function/tool before *OR* it is a redundant call.
+    Note: When parsing the semantic metrics, check for `is_correct` field. if `false` there is some mistake that the LLMaJ found in that tool call.
     """
+
     def __init__(
         self,
         api_spec: List[Mapping[str, Any]],
         messages: List[Message],
         model_id: str,
         task_n: str,
-        dataset_name: str,):
-
+        dataset_name: str,
+    ):
+
         self.metrics_client = get_provider(
             model_id=model_id,
-            params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 4096},
-            referenceless_eval=True
+            params={
+                "min_new_tokens": 0,
+                "decoding_method": "greedy",
+                "max_new_tokens": 4096,
+            },
+            referenceless_eval=True,
         )

         self.pipeline = ReflectionPipeline(
@@ -72,7 +79,11 @@ class ReferencelessEvaluation:
         examples = []

         processed_data = [
-            {k: msg.model_dump().get(k) for k in ["role", "content", "type"] if k in msg.model_dump()}
+            {
+                k: msg.model_dump().get(k)
+                for k in ["role", "content", "type"]
+                if k in msg.model_dump()
+            }
             for msg in self.messages
         ]

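
The ReferencelessEvaluation docstring restored above describes the intended reading order: semantic metrics are only populated when static.final_decision is True, agentic-constraint checks come first among the semantic metrics, and each SemanticMetricResult carries an is_correct flag. Below is a rough consumer-side sketch of that reading order, using only field names visible in the types.py hunks; the helper itself is illustrative and not an API exposed by the framework.

# Hedged sketch: read referenceless results in the order the docstring above
# describes. Field names (final_decision, metrics, valid, description,
# general, function_selection, parameter, is_correct) come from the types.py
# hunks; this helper is illustrative, not part of the framework.
from typing import List


def summarize_referenceless_result(static, semantic) -> List[str]:
    findings: List[str] = []

    # 1) Static checks gate everything else: if final_decision is False,
    #    at least one static check failed and semantic metrics were not run.
    if static is not None and not static.final_decision:
        for name, check in static.metrics.items():
            if not check.valid:
                findings.append(f"static check failed: {name} ({check.description})")
        return findings

    # 2) Semantic metrics: flag anything the LLM-as-judge marked incorrect.
    if semantic is not None:
        categories = [semantic.general, semantic.function_selection]
        categories += list((semantic.parameter or {}).values())
        for category in categories:
            if category is None or category.metrics is None:
                continue
            for name, metric in category.metrics.items():
                if not metric.is_correct:
                    findings.append(f"semantic issue flagged: {name}")

    return findings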
wxo_agentic_evaluation/resource_map.py

@@ -1,4 +1,5 @@
 from collections import defaultdict
+
 from wxo_agentic_evaluation.inference_backend import WXOClient, is_saas_url


@@ -44,4 +45,4 @@ class ResourceMap:

         agent2tools = dict(agent2tools)
         tools2agents = dict(tools2agents)
-        return agent2tools, tools2agents
+        return agent2tools, tools2agents