ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
- wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
- wxo_agentic_evaluation/analytics/tools/main.py +18 -7
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +69 -48
- wxo_agentic_evaluation/annotate.py +6 -4
- wxo_agentic_evaluation/arg_configs.py +8 -2
- wxo_agentic_evaluation/batch_annotate.py +78 -25
- wxo_agentic_evaluation/data_annotator.py +18 -13
- wxo_agentic_evaluation/description_quality_checker.py +20 -14
- wxo_agentic_evaluation/evaluation_package.py +114 -70
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
- wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
- wxo_agentic_evaluation/external_agent/types.py +12 -5
- wxo_agentic_evaluation/inference_backend.py +158 -73
- wxo_agentic_evaluation/llm_matching.py +4 -3
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_user.py +7 -3
- wxo_agentic_evaluation/main.py +175 -67
- wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
- wxo_agentic_evaluation/metrics/metrics.py +26 -12
- wxo_agentic_evaluation/prompt/template_render.py +32 -11
- wxo_agentic_evaluation/quick_eval.py +49 -23
- wxo_agentic_evaluation/record_chat.py +70 -33
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
- wxo_agentic_evaluation/resource_map.py +2 -1
- wxo_agentic_evaluation/service_instance.py +24 -11
- wxo_agentic_evaluation/service_provider/__init__.py +33 -13
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
- wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
- wxo_agentic_evaluation/service_provider/provider.py +0 -1
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
- wxo_agentic_evaluation/tool_planner.py +128 -44
- wxo_agentic_evaluation/type.py +12 -9
- wxo_agentic_evaluation/utils/__init__.py +1 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
- wxo_agentic_evaluation/utils/rich_utils.py +23 -9
- wxo_agentic_evaluation/utils/utils.py +83 -52
- ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA +0 -385
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
@@ -2,17 +2,10 @@ from __future__ import annotations
 
 import json
 from types import NoneType
-from typing import (
-    Any,
-    Dict,
-    List,
-    Literal,
-    Optional,
-    Union,
-)
-from typing_extensions import Self
+from typing import Any, Dict, List, Literal, Optional, Union
 
 from pydantic import BaseModel, Field, ValidationError, model_validator
+from typing_extensions import Self
 
 from wxo_agentic_evaluation.referenceless_eval.metrics import MetricRunResult
 
@@ -32,12 +25,14 @@ class FunctionCallMetric(BaseModel):
     jsonschema: Dict[str, Any] = Field(
         ..., description="JSON Schema dict for this metric's output."
     )
-    examples: Optional[List[Dict[Literal["user_kwargs", "output"], Any]]] =
-
-
-
-
-
+    examples: Optional[List[Dict[Literal["user_kwargs", "output"], Any]]] = (
+        Field(
+            None,
+            description=(
+                "List of example inputs and outputs for this metric; "
+                "each example is a dict with 'user_kwargs' and 'output' keys."
+            ),
+        )
     )
 
 
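For orientation, the reshaped `examples` field above holds few-shot examples keyed by `user_kwargs` and `output`. A hypothetical value (the metric inputs and output keys below are invented for illustration, not taken from the package) could look like:

```python
# Hypothetical illustration of the `examples` shape described above; the
# tool name, arguments, and output fields are placeholders.
examples = [
    {
        "user_kwargs": {"tool_name": "get_weather", "arguments": {"city": "Paris"}},
        "output": {"is_correct": True, "confidence": 0.9},
    }
]
```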
@@ -52,7 +47,8 @@ class StaticMetricResult(BaseModel):
     """
 
     description: str = Field(
-        ...,
+        ...,
+        description="Human-readable description of this static validation check.",
     )
     valid: bool = Field(
         ..., description="True if this static check passed; False otherwise."
@@ -73,7 +69,9 @@ class StaticResult(BaseModel):
 
     metrics: Dict[str, StaticMetricResult] = Field(
         ...,
-        description=(
+        description=(
+            "Mapping from each static-check name to its StaticMetricResult."
+        ),
     )
     final_decision: bool = Field(
         ...,
@@ -133,7 +131,8 @@ class SemanticMetricResult(BaseModel):
     error: Optional[str] = Field(
         None,
         description=(
-            "Error message if prompt generation or parsing failed; "
+            "Error message if prompt generation or parsing failed; "
+            "otherwise None."
         ),
     )
     is_correct: bool = Field(
@@ -157,11 +156,11 @@ class SemanticMetricResult(BaseModel):
         ),
     )
 
-    @model_validator(mode=
+    @model_validator(mode="after")
     def raw_response_json(self) -> Self:
         if isinstance(self.raw_response, str):
             self.raw_response = json.loads(self.raw_response)
-
+
         return self
 
     @classmethod
@@ -211,7 +210,9 @@ class SemanticCategoryResult(BaseModel):
 
     metrics: Optional[Dict[str, SemanticMetricResult]] = Field(
         None,
-        description=(
+        description=(
+            "Mapping metric_name -> SemanticMetricResult for this category."
+        ),
     )
     avg_score: Optional[float] = Field(
         None,
@@ -222,7 +223,9 @@ class SemanticCategoryResult(BaseModel):
     )
 
     @classmethod
-    def from_results(
+    def from_results(
+        cls, results: List[MetricRunResult]
+    ) -> "SemanticCategoryResult":
         """
         Build a category result from a list of MetricRunResult objects.
         """
@@ -249,11 +252,15 @@ class SemanticResult(BaseModel):
 
     general: Optional[SemanticCategoryResult] = Field(
         None,
-        description=(
+        description=(
+            "Results of general tool-call metrics, if any; otherwise None."
+        ),
     )
     function_selection: Optional[SemanticCategoryResult] = Field(
         None,
-        description=(
+        description=(
+            "Results of function-selection metrics, if any; otherwise None."
+        ),
     )
     parameter: Optional[Dict[str, SemanticCategoryResult]] = Field(
         None,
@@ -302,7 +309,8 @@ class TransformResult(BaseModel):
         ),
     )
     execution_output: Any = Field(
-        None,
+        None,
+        description="The actual output of executing the transformation code.",
     )
     correction: Optional[str] = Field(
         None,
@@ -311,7 +319,8 @@ class TransformResult(BaseModel):
     error: Optional[str] = Field(
         None,
         description=(
-            "Error message if code generation or execution failed; "
+            "Error message if code generation or execution failed; "
+            "otherwise None."
        ),
    )
 
@@ -356,7 +365,9 @@ class PipelineResult(BaseModel):
     Final output of the function-calling pipeline for one tool call.
     """
 
-    inputs: FunctionCallInput = Field(
+    inputs: FunctionCallInput = Field(
+        ..., description="Echo of the pipeline inputs."
+    )
     static: Optional[StaticResult] = Field(
         None, description="Static schema-validation results, if enabled."
     )
@@ -430,7 +441,9 @@ class PipelineResult(BaseModel):
         if param_avgs:
             cat_avgs.append(sum(param_avgs) / len(param_avgs))
 
-        values.overall_avg_score =
+        values.overall_avg_score = (
+            sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
+        )
         values.overall_valid = ok
         return values
 
@@ -506,7 +519,9 @@ class ToolFunctionCall(BaseModel):
     Parsed representation of an LLM's function call response.
     """
 
-    name: str = Field(
+    name: str = Field(
+        ..., description="Name of the function the LLM chose to call."
+    )
     arguments: str = Field(
         ..., description="JSON-encoded string of the call's arguments."
     )
@@ -60,7 +60,9 @@ class BaseField(BaseModel, ABC):
             if field_cls.can_handle(name, schema):
                 desc = schema.get("description", "")
                 extra = {
-                    k: v
+                    k: v
+                    for k, v in schema.items()
+                    if k not in ("type", "description")
                 }
                 return field_cls(
                     name=name,
@@ -74,7 +76,9 @@ class BaseField(BaseModel, ABC):
             json_type=schema.get("type", "string"),
             description=schema.get("description", ""),
             jsonschema_extra={
-                k: v
+                k: v
+                for k, v in schema.items()
+                if k not in ("type", "description")
             },
             extra_params={},
         )
@@ -122,10 +126,12 @@ class NumericField(BaseField):
     """
 
     threshold_low: Optional[float] = PydanticField(
-        None,
+        None,
+        description="Lower bound for correctness checks (not in JSONSchema).",
     )
     threshold_high: Optional[float] = PydanticField(
-        None,
+        None,
+        description="Upper bound for correctness checks (not in JSONSchema).",
     )
 
     __abstract__ = False
@@ -153,7 +159,9 @@ class NumericField(BaseField):
             json_type=schema.get("type", "number"),
             description=schema.get("description", ""),
             jsonschema_extra={
-                k: v
+                k: v
+                for k, v in schema.items()
+                if k not in ("type", "description")
             },
             extra_params={},
         )
@@ -131,7 +131,9 @@ class Metric:
             additional_properties=additional_props,
         )
 
-    def is_important(
+    def is_important(
+        self, result: Dict[str, Any]
+    ) -> Tuple[bool, Optional[str]]:
         """
         A result is 'important' if its confidence lies within the defined confidence thresholds.
 
@@ -146,7 +148,9 @@ class Metric:
         except (TypeError, ValueError):
             return False, "Invalid confidence value"
         # locate the confidence field
-        conf_field = next(
+        conf_field = next(
+            (f for f in self.fields if f.name == "confidence"), None
+        )
         if isinstance(conf_field, NumericField):
             ok = conf_field.is_within_threshold(conf)
             reason = (
@@ -266,7 +270,10 @@ class StandardMetric(Metric):
             json_type="number",
             description=f"Confidence in the output value (range {min_conf} to {max_conf}).",
             jsonschema_extra={"minimum": min_conf, "maximum": max_conf},
-            extra_params={
+            extra_params={
+                "threshold_low": min_conf,
+                "threshold_high": max_conf,
+            },
         )
         correction = CorrectionField(
             name="correction",
@@ -277,7 +284,9 @@ class StandardMetric(Metric):
         fields = [explanation, evidence, output, confidence, correction]
         super().__init__(name=name, description=description, fields=fields)
 
-    def is_important(
+    def is_important(
+        self, result: Dict[str, Any]
+    ) -> Tuple[bool, Optional[str]]:
         """
         A result is 'important' if its confidence lies within the defined confidence thresholds.
 
@@ -292,7 +301,9 @@ class StandardMetric(Metric):
         except (TypeError, ValueError):
             return False, "Invalid confidence value"
         # locate the confidence field
-        conf_field = next(
+        conf_field = next(
+            (f for f in self.fields if f.name == "confidence"), None
+        )
         if isinstance(conf_field, NumericField):
             ok = conf_field.is_within_threshold(conf)
             reason = (
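The `is_important` hunks above only re-wrap the signatures, and the `extra_params` change now feeds the confidence bounds into the confidence `NumericField`. Per the docstring, a result counts as important when its confidence falls within the configured thresholds; a rough sketch of that check, with hypothetical helper names rather than the package's own API:

```python
# Rough sketch of the confidence-threshold idea described above, not the
# package's code; helper names and default bounds are illustrative only.
from typing import Optional, Tuple


def is_within_threshold(conf: float, low: Optional[float], high: Optional[float]) -> bool:
    # Missing bounds are treated as open-ended (assumption for illustration).
    if low is not None and conf < low:
        return False
    if high is not None and conf > high:
        return False
    return True


def is_important(conf: float, low: float = 0.0, high: float = 1.0) -> Tuple[bool, Optional[str]]:
    ok = is_within_threshold(conf, low, high)
    reason = None if ok else f"confidence {conf} outside [{low}, {high}]"
    return ok, reason
```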
@@ -5,7 +5,9 @@ from pydantic import BaseModel
 
 from wxo_agentic_evaluation.referenceless_eval.metrics.field import NumericField
 from wxo_agentic_evaluation.referenceless_eval.metrics.metric import Metric
-from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import
+from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import (
+    MetricPrompt,
+)
 from wxo_agentic_evaluation.referenceless_eval.prompt.runner import (
     AsyncGen,
     Prompt,
@@ -40,7 +42,8 @@ class MetricRunner:
     """
 
     def __init__(
-        self,
+        self,
+        entries: Optional[List[Tuple[MetricPrompt, Dict[str, Any]]]] = None,
     ) -> None:
         """
         Args:
@@ -51,7 +54,9 @@ class MetricRunner:
             for mp, kw in entries:
                 self.add(mp, kw)
 
-    def add(
+    def add(
+        self, metric_prompt: MetricPrompt, user_kwargs: Dict[str, Any]
+    ) -> None:
         """
         Add a metric to run.
 
@@ -68,7 +68,9 @@ class MetricPrompt:
         # Store defaults for system context
         # This allows overriding system context without modifying the template
        # during prompt building
-        self.system_kwargs_defaults: Dict[str, Any] =
+        self.system_kwargs_defaults: Dict[str, Any] = (
+            system_kwargs_defaults.copy()
+        )
 
         # Initialize examples list
         # This will hold (user_kwargs, output) pairs for few-shot prompting
@@ -104,7 +106,9 @@ class MetricPrompt:
 
     # --- Example Management ---
 
-    def add_example(
+    def add_example(
+        self, user_kwargs: Dict[str, Any], output: Dict[str, Any]
+    ) -> None:
         """
         Add a few-shot example.
 
@@ -17,7 +17,11 @@ def remove_threshold_fields(schema: dict) -> dict:
             schema[key] = remove_threshold_fields(value)
         elif isinstance(value, list):
             schema[key] = [
-
+                (
+                    remove_threshold_fields(item)
+                    if isinstance(item, dict)
+                    else item
+                )
                 for item in value
             ]
     return schema
@@ -1,10 +1,22 @@
 import asyncio
-from typing import
+from typing import (
+    Any,
+    Awaitable,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    TypeVar,
+    Union,
+)
 
 from pydantic import BaseModel
 
 Prompt = Union[str, List[Dict[str, Any]]]
-PromptAndSchema = Tuple[
+PromptAndSchema = Tuple[
+    Union[str, List[Dict[str, Any]]], Optional[Dict[str, Any]]
+]
 SyncGen = Callable[[Prompt], Union[str, Any]]
 BatchGen = Callable[[List[Prompt]], List[Union[str, Any]]]
 AsyncGen = Callable[[Prompt], Awaitable[Union[str, Any]]]
@@ -137,7 +149,8 @@ class PromptRunner:
                 return index, PromptResult(prompt=prompt, error=str(e))
 
         tasks = [
-            asyncio.create_task(_run_one(i, p))
+            asyncio.create_task(_run_one(i, p))
+            for i, p in enumerate(self.prompts)
         ]
         indexed_results = await asyncio.gather(*tasks)
         # Sort results to match original order
@@ -14,30 +14,37 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
     ToolCall,
     ToolSpec,
 )
-from wxo_agentic_evaluation.type import Message
 from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.type import Message
+
 
 class ReferencelessEvaluation:
     """
-
-
-
-
-
-
+    Note: static.final_decison, if `True` -> then all static metrics were valid. If false, atleast one of the static metrics failed. Look at explanation for reasoning
+    Note: if static.final_decision == True, check semantic metrics. Semantic metrics **not** run if static.final_decision is False.
+    ---
+    Note: For semantic metrics, check agentic constraints. If agent-constraints == False, no point in checking others. If true, check others.
+    Note: METRIC_FUNCTION_SELECTION_APPROPRIATENESS == False, implies that the LLM should have called some other function/tool before *OR* it is a redundant call.
+    Note: When parsing the semantic metrics, check for `is_correct` field. if `false` there is some mistake that the LLMaJ found in that tool call.
     """
+
     def __init__(
         self,
         api_spec: List[Mapping[str, Any]],
         messages: List[Message],
         model_id: str,
         task_n: str,
-        dataset_name: str,
-
+        dataset_name: str,
+    ):
+
         self.metrics_client = get_provider(
             model_id=model_id,
-            params={
-
+            params={
+                "min_new_tokens": 0,
+                "decoding_method": "greedy",
+                "max_new_tokens": 4096,
+            },
+            referenceless_eval=True,
         )
 
         self.pipeline = ReflectionPipeline(
@@ -72,7 +79,11 @@ class ReferencelessEvaluation:
         examples = []
 
         processed_data = [
-            {
+            {
+                k: msg.model_dump().get(k)
+                for k in ["role", "content", "type"]
+                if k in msg.model_dump()
+            }
             for msg in self.messages
         ]
 
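The docstring notes added above spell out how to read one pipeline result: check `static.final_decision` first (semantic metrics are only run when it is True), then inspect each semantic metric's `is_correct` flag. A rough sketch of that reading order, treating the result as a plain dict (an assumption made purely for illustration, not the package's API):

```python
# Rough sketch of the reading order described in the docstring notes above;
# the dict layout is assumed for illustration only.
def summarize(result: dict) -> str:
    static = result.get("static") or {}
    if not static.get("final_decision", False):
        # At least one static metric failed; its explanation holds the reasoning.
        return "failed static checks"
    semantic = result.get("semantic") or {}
    failed = [
        name
        for category in semantic.values()
        if isinstance(category, dict)
        for name, metric in (category.get("metrics") or {}).items()
        if isinstance(metric, dict) and metric.get("is_correct") is False
    ]
    return "ok" if not failed else "semantic issues: " + ", ".join(failed)
```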
@@ -1,4 +1,5 @@
 from collections import defaultdict
+
 from wxo_agentic_evaluation.inference_backend import WXOClient, is_saas_url
 
 
@@ -44,4 +45,4 @@ class ResourceMap:
 
         agent2tools = dict(agent2tools)
         tools2agents = dict(tools2agents)
-        return agent2tools, tools2agents
+        return agent2tools, tools2agents
@@ -1,8 +1,10 @@
 import logging
-import yaml
 import os
+
 import requests
-
+import yaml
+
+from wxo_agentic_evaluation.utils.utils import is_ibm_cloud_url, is_saas_url
 
 logger = logging.getLogger(__name__)
 
@@ -11,13 +13,15 @@ USER = {"username": "wxo.archer@ibm.com", "password": "watsonx"}
 
 class ServiceInstance:
     def __init__(
-        self,
+        self,
+        service_url,
+        tenant_name,
+        is_saas: bool = None,
+        is_ibm_cloud: bool = None,
     ) -> None:
         self.service_url = service_url
         self.tenant_name = tenant_name
-        STAGING_AUTH_ENDPOINT = (
-            "https://iam.platform.test.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
-        )
+        STAGING_AUTH_ENDPOINT = "https://iam.platform.test.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
         PROD_AUTH_ENDPOINT = (
             "https://iam.platform.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
         )
@@ -25,7 +29,9 @@ class ServiceInstance:
 
         self.is_saas = is_saas_url(service_url) if is_saas is None else is_saas
         self.is_ibm_cloud = (
-            is_ibm_cloud_url(service_url)
+            is_ibm_cloud_url(service_url)
+            if is_ibm_cloud is None
+            else is_ibm_cloud
         )
 
         if self.is_saas:
@@ -88,7 +94,8 @@ class ServiceInstance:
 
     def _get_tenant_token(self, tenant_id: str):
         resp = requests.post(
-            self.tenant_auth_endpoint.format(self.service_url, tenant_id),
+            self.tenant_auth_endpoint.format(self.service_url, tenant_id),
+            data=USER,
         )
         if resp.status_code == 200:
             return resp.json()["access_token"]
@@ -122,7 +129,9 @@ class ServiceInstance:
             "tags": ["test"],
         }
 
-        resp = requests.post(
+        resp = requests.post(
+            self.tenant_url, headers=headers, json=tenant_config
+        )
         if resp.status_code == 201:
             return True
         else:
@@ -159,8 +168,12 @@ def tenant_setup(service_url: str, tenant_name: str):
     # else:
     #     tenant_token = service_instance._get_tenant_token(tenant_id)
 
-    auth_config_path =
-
+    auth_config_path = (
+        f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
+    )
+    env_config_path = (
+        f"{os.path.expanduser('~')}/.config/orchestrate/config.yaml"
+    )
 
     # TO-DO: update SDK and use SDK to manage this
     with open(auth_config_path, "r") as f:
@@ -1,12 +1,24 @@
-
-
-from wxo_agentic_evaluation.service_provider.model_proxy_provider import ModelProxyProvider
-from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import ModelProxyProviderLLMKitWrapper, WatsonXLLMKitWrapper
+import os
+
 from wxo_agentic_evaluation.arg_configs import ProviderConfig
+from wxo_agentic_evaluation.service_provider.model_proxy_provider import (
+    ModelProxyProvider,
+)
+from wxo_agentic_evaluation.service_provider.ollama_provider import (
+    OllamaProvider,
+)
+from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import (
+    ModelProxyProviderLLMKitWrapper,
+    WatsonXLLMKitWrapper,
+)
+from wxo_agentic_evaluation.service_provider.watsonx_provider import (
+    WatsonXProvider,
+)
 
-import os
 
-def _instantiate_provider(
+def _instantiate_provider(
+    config: ProviderConfig, is_referenceless_eval: bool = False, **kwargs
+):
     if config.provider == "watsonx":
         if is_referenceless_eval:
             provider = WatsonXLLMKitWrapper
@@ -22,12 +34,17 @@ def _instantiate_provider(config: ProviderConfig, is_referenceless_eval: bool =
             provider = ModelProxyProvider
         return provider(model_id=config.model_id, **kwargs)
     else:
-        raise RuntimeError(
-
-
-
-        return _instantiate_provider(config, **kwargs)
+        raise RuntimeError(
+            f"target provider is not supported {config.provider}"
+        )
+
 
+def get_provider(
+    config: ProviderConfig = None,
+    model_id: str = None,
+    referenceless_eval: bool = False,
+    **kwargs,
+):
     if not model_id:
         raise ValueError("model_id must be provided if config is not supplied")
 
@@ -35,10 +52,13 @@ def get_provider(config: ProviderConfig = None, model_id: str = None, referencel
         config = ProviderConfig(provider="watsonx", model_id=model_id)
         return _instantiate_provider(config, referenceless_eval, **kwargs)
 
-    if "
+    if "WO_INSTANCE" in os.environ:
         config = ProviderConfig(provider="model_proxy", model_id=model_id)
         return _instantiate_provider(config, referenceless_eval, **kwargs)
+
+    if config:
+        return _instantiate_provider(config, **kwargs)
 
     raise RuntimeError(
         "No provider found. Please either provide a config or set the required environment variables."
-    )
+    )