ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (61)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +8 -2
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation_package.py +114 -70
  14. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  15. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  16. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  17. wxo_agentic_evaluation/external_agent/types.py +12 -5
  18. wxo_agentic_evaluation/inference_backend.py +158 -73
  19. wxo_agentic_evaluation/llm_matching.py +4 -3
  20. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  21. wxo_agentic_evaluation/llm_user.py +7 -3
  22. wxo_agentic_evaluation/main.py +175 -67
  23. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  24. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  25. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  26. wxo_agentic_evaluation/quick_eval.py +49 -23
  27. wxo_agentic_evaluation/record_chat.py +70 -33
  28. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  29. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  30. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  38. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  39. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  40. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  41. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  42. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  43. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  44. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  45. wxo_agentic_evaluation/resource_map.py +2 -1
  46. wxo_agentic_evaluation/service_instance.py +24 -11
  47. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  48. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
  49. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  50. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  51. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  52. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  53. wxo_agentic_evaluation/tool_planner.py +128 -44
  54. wxo_agentic_evaluation/type.py +12 -9
  55. wxo_agentic_evaluation/utils/__init__.py +1 -0
  56. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  57. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  58. wxo_agentic_evaluation/utils/utils.py +83 -52
  59. ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA +0 -385
  60. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
  61. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
@@ -2,17 +2,10 @@ from __future__ import annotations
 
  import json
  from types import NoneType
- from typing import (
- Any,
- Dict,
- List,
- Literal,
- Optional,
- Union,
- )
- from typing_extensions import Self
+ from typing import Any, Dict, List, Literal, Optional, Union
 
  from pydantic import BaseModel, Field, ValidationError, model_validator
+ from typing_extensions import Self
 
  from wxo_agentic_evaluation.referenceless_eval.metrics import MetricRunResult
 
@@ -32,12 +25,14 @@ class FunctionCallMetric(BaseModel):
  jsonschema: Dict[str, Any] = Field(
  ..., description="JSON Schema dict for this metric's output."
  )
- examples: Optional[List[Dict[Literal["user_kwargs", "output"], Any]]] = Field(
- None,
- description=(
- "List of example inputs and outputs for this metric; "
- "each example is a dict with 'user_kwargs' and 'output' keys."
- ),
+ examples: Optional[List[Dict[Literal["user_kwargs", "output"], Any]]] = (
+ Field(
+ None,
+ description=(
+ "List of example inputs and outputs for this metric; "
+ "each example is a dict with 'user_kwargs' and 'output' keys."
+ ),
+ )
  )
 
 
@@ -52,7 +47,8 @@ class StaticMetricResult(BaseModel):
  """
 
  description: str = Field(
- ..., description="Human-readable description of this static validation check."
+ ...,
+ description="Human-readable description of this static validation check.",
  )
  valid: bool = Field(
  ..., description="True if this static check passed; False otherwise."
@@ -73,7 +69,9 @@ class StaticResult(BaseModel):
 
  metrics: Dict[str, StaticMetricResult] = Field(
  ...,
- description=("Mapping from each static-check name to its StaticMetricResult."),
+ description=(
+ "Mapping from each static-check name to its StaticMetricResult."
+ ),
  )
  final_decision: bool = Field(
  ...,
@@ -133,7 +131,8 @@ class SemanticMetricResult(BaseModel):
  error: Optional[str] = Field(
  None,
  description=(
- "Error message if prompt generation or parsing failed; " "otherwise None."
+ "Error message if prompt generation or parsing failed; "
+ "otherwise None."
  ),
  )
  is_correct: bool = Field(
@@ -157,11 +156,11 @@ class SemanticMetricResult(BaseModel):
  ),
  )
 
- @model_validator(mode='after')
+ @model_validator(mode="after")
  def raw_response_json(self) -> Self:
  if isinstance(self.raw_response, str):
  self.raw_response = json.loads(self.raw_response)
-
+
  return self
 
  @classmethod
@@ -211,7 +210,9 @@ class SemanticCategoryResult(BaseModel):
 
  metrics: Optional[Dict[str, SemanticMetricResult]] = Field(
  None,
- description=("Mapping metric_name -> SemanticMetricResult for this category."),
+ description=(
+ "Mapping metric_name -> SemanticMetricResult for this category."
+ ),
  )
  avg_score: Optional[float] = Field(
  None,
@@ -222,7 +223,9 @@ class SemanticCategoryResult(BaseModel):
  )
 
  @classmethod
- def from_results(cls, results: List[MetricRunResult]) -> "SemanticCategoryResult":
+ def from_results(
+ cls, results: List[MetricRunResult]
+ ) -> "SemanticCategoryResult":
  """
  Build a category result from a list of MetricRunResult objects.
  """
@@ -249,11 +252,15 @@ class SemanticResult(BaseModel):
 
  general: Optional[SemanticCategoryResult] = Field(
  None,
- description=("Results of general tool-call metrics, if any; otherwise None."),
+ description=(
+ "Results of general tool-call metrics, if any; otherwise None."
+ ),
  )
  function_selection: Optional[SemanticCategoryResult] = Field(
  None,
- description=("Results of function-selection metrics, if any; otherwise None."),
+ description=(
+ "Results of function-selection metrics, if any; otherwise None."
+ ),
  )
  parameter: Optional[Dict[str, SemanticCategoryResult]] = Field(
  None,
@@ -302,7 +309,8 @@ class TransformResult(BaseModel):
  ),
  )
  execution_output: Any = Field(
- None, description="The actual output of executing the transformation code."
+ None,
+ description="The actual output of executing the transformation code.",
  )
  correction: Optional[str] = Field(
  None,
@@ -311,7 +319,8 @@ class TransformResult(BaseModel):
  error: Optional[str] = Field(
  None,
  description=(
- "Error message if code generation or execution failed; " "otherwise None."
+ "Error message if code generation or execution failed; "
+ "otherwise None."
  ),
  )
 
@@ -356,7 +365,9 @@ class PipelineResult(BaseModel):
  Final output of the function-calling pipeline for one tool call.
  """
 
- inputs: FunctionCallInput = Field(..., description="Echo of the pipeline inputs.")
+ inputs: FunctionCallInput = Field(
+ ..., description="Echo of the pipeline inputs."
+ )
  static: Optional[StaticResult] = Field(
  None, description="Static schema-validation results, if enabled."
  )
@@ -430,7 +441,9 @@ class PipelineResult(BaseModel):
  if param_avgs:
  cat_avgs.append(sum(param_avgs) / len(param_avgs))
 
- values.overall_avg_score = sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
+ values.overall_avg_score = (
+ sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
+ )
  values.overall_valid = ok
  return values
 
@@ -506,7 +519,9 @@ class ToolFunctionCall(BaseModel):
  Parsed representation of an LLM's function call response.
  """
 
- name: str = Field(..., description="Name of the function the LLM chose to call.")
+ name: str = Field(
+ ..., description="Name of the function the LLM chose to call."
+ )
  arguments: str = Field(
  ..., description="JSON-encoded string of the call's arguments."
  )
@@ -60,7 +60,9 @@ class BaseField(BaseModel, ABC):
  if field_cls.can_handle(name, schema):
  desc = schema.get("description", "")
  extra = {
- k: v for k, v in schema.items() if k not in ("type", "description")
+ k: v
+ for k, v in schema.items()
+ if k not in ("type", "description")
  }
  return field_cls(
  name=name,
@@ -74,7 +76,9 @@ class BaseField(BaseModel, ABC):
  json_type=schema.get("type", "string"),
  description=schema.get("description", ""),
  jsonschema_extra={
- k: v for k, v in schema.items() if k not in ("type", "description")
+ k: v
+ for k, v in schema.items()
+ if k not in ("type", "description")
  },
  extra_params={},
  )
@@ -122,10 +126,12 @@ class NumericField(BaseField):
  """
 
  threshold_low: Optional[float] = PydanticField(
- None, description="Lower bound for correctness checks (not in JSONSchema)."
+ None,
+ description="Lower bound for correctness checks (not in JSONSchema).",
  )
  threshold_high: Optional[float] = PydanticField(
- None, description="Upper bound for correctness checks (not in JSONSchema)."
+ None,
+ description="Upper bound for correctness checks (not in JSONSchema).",
  )
 
  __abstract__ = False
@@ -153,7 +159,9 @@ class NumericField(BaseField):
  json_type=schema.get("type", "number"),
  description=schema.get("description", ""),
  jsonschema_extra={
- k: v for k, v in schema.items() if k not in ("type", "description")
+ k: v
+ for k, v in schema.items()
+ if k not in ("type", "description")
  },
  extra_params={},
  )
@@ -131,7 +131,9 @@ class Metric:
  additional_properties=additional_props,
  )
 
- def is_important(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+ def is_important(
+ self, result: Dict[str, Any]
+ ) -> Tuple[bool, Optional[str]]:
  """
  A result is 'important' if its confidence lies within the defined confidence thresholds.
 
@@ -146,7 +148,9 @@ class Metric:
  except (TypeError, ValueError):
  return False, "Invalid confidence value"
  # locate the confidence field
- conf_field = next((f for f in self.fields if f.name == "confidence"), None)
+ conf_field = next(
+ (f for f in self.fields if f.name == "confidence"), None
+ )
  if isinstance(conf_field, NumericField):
  ok = conf_field.is_within_threshold(conf)
  reason = (
@@ -266,7 +270,10 @@ class StandardMetric(Metric):
  json_type="number",
  description=f"Confidence in the output value (range {min_conf} to {max_conf}).",
  jsonschema_extra={"minimum": min_conf, "maximum": max_conf},
- extra_params={"threshold_low": min_conf, "threshold_high": max_conf},
+ extra_params={
+ "threshold_low": min_conf,
+ "threshold_high": max_conf,
+ },
  )
  correction = CorrectionField(
  name="correction",
@@ -277,7 +284,9 @@ class StandardMetric(Metric):
  fields = [explanation, evidence, output, confidence, correction]
  super().__init__(name=name, description=description, fields=fields)
 
- def is_important(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+ def is_important(
+ self, result: Dict[str, Any]
+ ) -> Tuple[bool, Optional[str]]:
  """
  A result is 'important' if its confidence lies within the defined confidence thresholds.
 
@@ -292,7 +301,9 @@ class StandardMetric(Metric):
  except (TypeError, ValueError):
  return False, "Invalid confidence value"
  # locate the confidence field
- conf_field = next((f for f in self.fields if f.name == "confidence"), None)
+ conf_field = next(
+ (f for f in self.fields if f.name == "confidence"), None
+ )
  if isinstance(conf_field, NumericField):
  ok = conf_field.is_within_threshold(conf)
  reason = (
@@ -5,7 +5,9 @@ from pydantic import BaseModel
 
  from wxo_agentic_evaluation.referenceless_eval.metrics.field import NumericField
  from wxo_agentic_evaluation.referenceless_eval.metrics.metric import Metric
- from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import MetricPrompt
+ from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import (
+ MetricPrompt,
+ )
  from wxo_agentic_evaluation.referenceless_eval.prompt.runner import (
  AsyncGen,
  Prompt,
@@ -40,7 +42,8 @@ class MetricRunner:
  """
 
  def __init__(
- self, entries: Optional[List[Tuple[MetricPrompt, Dict[str, Any]]]] = None
+ self,
+ entries: Optional[List[Tuple[MetricPrompt, Dict[str, Any]]]] = None,
  ) -> None:
  """
  Args:
@@ -51,7 +54,9 @@ class MetricRunner:
  for mp, kw in entries:
  self.add(mp, kw)
 
- def add(self, metric_prompt: MetricPrompt, user_kwargs: Dict[str, Any]) -> None:
+ def add(
+ self, metric_prompt: MetricPrompt, user_kwargs: Dict[str, Any]
+ ) -> None:
  """
  Add a metric to run.
 
@@ -68,7 +68,9 @@ class MetricPrompt:
  # Store defaults for system context
  # This allows overriding system context without modifying the template
  # during prompt building
- self.system_kwargs_defaults: Dict[str, Any] = system_kwargs_defaults.copy()
+ self.system_kwargs_defaults: Dict[str, Any] = (
+ system_kwargs_defaults.copy()
+ )
 
  # Initialize examples list
  # This will hold (user_kwargs, output) pairs for few-shot prompting
@@ -104,7 +106,9 @@ class MetricPrompt:
 
  # --- Example Management ---
 
- def add_example(self, user_kwargs: Dict[str, Any], output: Dict[str, Any]) -> None:
+ def add_example(
+ self, user_kwargs: Dict[str, Any], output: Dict[str, Any]
+ ) -> None:
  """
  Add a few-shot example.
 
@@ -17,7 +17,11 @@ def remove_threshold_fields(schema: dict) -> dict:
  schema[key] = remove_threshold_fields(value)
  elif isinstance(value, list):
  schema[key] = [
- remove_threshold_fields(item) if isinstance(item, dict) else item
+ (
+ remove_threshold_fields(item)
+ if isinstance(item, dict)
+ else item
+ )
  for item in value
  ]
  return schema
@@ -1,10 +1,22 @@
  import asyncio
- from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple, TypeVar, Union
+ from typing import (
+ Any,
+ Awaitable,
+ Callable,
+ Dict,
+ List,
+ Optional,
+ Tuple,
+ TypeVar,
+ Union,
+ )
 
  from pydantic import BaseModel
 
  Prompt = Union[str, List[Dict[str, Any]]]
- PromptAndSchema = Tuple[Union[str, List[Dict[str, Any]]], Optional[Dict[str, Any]]]
+ PromptAndSchema = Tuple[
+ Union[str, List[Dict[str, Any]]], Optional[Dict[str, Any]]
+ ]
  SyncGen = Callable[[Prompt], Union[str, Any]]
  BatchGen = Callable[[List[Prompt]], List[Union[str, Any]]]
  AsyncGen = Callable[[Prompt], Awaitable[Union[str, Any]]]
@@ -137,7 +149,8 @@ class PromptRunner:
  return index, PromptResult(prompt=prompt, error=str(e))
 
  tasks = [
- asyncio.create_task(_run_one(i, p)) for i, p in enumerate(self.prompts)
+ asyncio.create_task(_run_one(i, p))
+ for i, p in enumerate(self.prompts)
  ]
  indexed_results = await asyncio.gather(*tasks)
  # Sort results to match original order
@@ -14,30 +14,37 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
  ToolCall,
  ToolSpec,
  )
- from wxo_agentic_evaluation.type import Message
  from wxo_agentic_evaluation.service_provider import get_provider
+ from wxo_agentic_evaluation.type import Message
+
 
  class ReferencelessEvaluation:
  """
- Note: static.final_decison, if `True` -> then all static metrics were valid. If false, atleast one of the static metrics failed. Look at explanation for reasoning
- Note: if static.final_decision == True, check semantic metrics. Semantic metrics **not** run if static.final_decision is False.
- ---
- Note: For semantic metrics, check agentic constraints. If agent-constraints == False, no point in checking others. If true, check others.
- Note: METRIC_FUNCTION_SELECTION_APPROPRIATENESS == False, implies that the LLM should have called some other function/tool before *OR* it is a redundant call.
- Note: When parsing the semantic metrics, check for `is_correct` field. if `false` there is some mistake that the LLMaJ found in that tool call.
+ Note: static.final_decison, if `True` -> then all static metrics were valid. If false, atleast one of the static metrics failed. Look at explanation for reasoning
+ Note: if static.final_decision == True, check semantic metrics. Semantic metrics **not** run if static.final_decision is False.
+ ---
+ Note: For semantic metrics, check agentic constraints. If agent-constraints == False, no point in checking others. If true, check others.
+ Note: METRIC_FUNCTION_SELECTION_APPROPRIATENESS == False, implies that the LLM should have called some other function/tool before *OR* it is a redundant call.
+ Note: When parsing the semantic metrics, check for `is_correct` field. if `false` there is some mistake that the LLMaJ found in that tool call.
  """
+
  def __init__(
  self,
  api_spec: List[Mapping[str, Any]],
  messages: List[Message],
  model_id: str,
  task_n: str,
- dataset_name: str,):
-
+ dataset_name: str,
+ ):
+
  self.metrics_client = get_provider(
  model_id=model_id,
- params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 4096},
- referenceless_eval=True
+ params={
+ "min_new_tokens": 0,
+ "decoding_method": "greedy",
+ "max_new_tokens": 4096,
+ },
+ referenceless_eval=True,
  )
 
  self.pipeline = ReflectionPipeline(
@@ -72,7 +79,11 @@ class ReferencelessEvaluation:
  examples = []
 
  processed_data = [
- {k: msg.model_dump().get(k) for k in ["role", "content", "type"] if k in msg.model_dump()}
+ {
+ k: msg.model_dump().get(k)
+ for k in ["role", "content", "type"]
+ if k in msg.model_dump()
+ }
  for msg in self.messages
  ]
 
@@ -1,4 +1,5 @@
  from collections import defaultdict
+
  from wxo_agentic_evaluation.inference_backend import WXOClient, is_saas_url
 
 
@@ -44,4 +45,4 @@ class ResourceMap:
 
  agent2tools = dict(agent2tools)
  tools2agents = dict(tools2agents)
- return agent2tools, tools2agents
+ return agent2tools, tools2agents
@@ -1,8 +1,10 @@
  import logging
- import yaml
  import os
+
  import requests
- from wxo_agentic_evaluation.utils.utils import is_saas_url, is_ibm_cloud_url
+ import yaml
+
+ from wxo_agentic_evaluation.utils.utils import is_ibm_cloud_url, is_saas_url
 
  logger = logging.getLogger(__name__)
 
@@ -11,13 +13,15 @@ USER = {"username": "wxo.archer@ibm.com", "password": "watsonx"}
 
  class ServiceInstance:
  def __init__(
- self, service_url, tenant_name, is_saas: bool = None, is_ibm_cloud: bool = None
+ self,
+ service_url,
+ tenant_name,
+ is_saas: bool = None,
+ is_ibm_cloud: bool = None,
  ) -> None:
  self.service_url = service_url
  self.tenant_name = tenant_name
- STAGING_AUTH_ENDPOINT = (
- "https://iam.platform.test.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
- )
+ STAGING_AUTH_ENDPOINT = "https://iam.platform.test.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
  PROD_AUTH_ENDPOINT = (
  "https://iam.platform.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
  )
@@ -25,7 +29,9 @@ class ServiceInstance:
 
  self.is_saas = is_saas_url(service_url) if is_saas is None else is_saas
  self.is_ibm_cloud = (
- is_ibm_cloud_url(service_url) if is_ibm_cloud is None else is_ibm_cloud
+ is_ibm_cloud_url(service_url)
+ if is_ibm_cloud is None
+ else is_ibm_cloud
  )
 
  if self.is_saas:
@@ -88,7 +94,8 @@ class ServiceInstance:
 
  def _get_tenant_token(self, tenant_id: str):
  resp = requests.post(
- self.tenant_auth_endpoint.format(self.service_url, tenant_id), data=USER
+ self.tenant_auth_endpoint.format(self.service_url, tenant_id),
+ data=USER,
  )
  if resp.status_code == 200:
  return resp.json()["access_token"]
@@ -122,7 +129,9 @@ class ServiceInstance:
  "tags": ["test"],
  }
 
- resp = requests.post(self.tenant_url, headers=headers, json=tenant_config)
+ resp = requests.post(
+ self.tenant_url, headers=headers, json=tenant_config
+ )
  if resp.status_code == 201:
  return True
  else:
@@ -159,8 +168,12 @@ def tenant_setup(service_url: str, tenant_name: str):
  # else:
  # tenant_token = service_instance._get_tenant_token(tenant_id)
 
- auth_config_path = f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
- env_config_path = f"{os.path.expanduser('~')}/.config/orchestrate/config.yaml"
+ auth_config_path = (
+ f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
+ )
+ env_config_path = (
+ f"{os.path.expanduser('~')}/.config/orchestrate/config.yaml"
+ )
 
  # TO-DO: update SDK and use SDK to manage this
  with open(auth_config_path, "r") as f:
@@ -1,12 +1,24 @@
- from wxo_agentic_evaluation.service_provider.ollama_provider import OllamaProvider
- from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
- from wxo_agentic_evaluation.service_provider.model_proxy_provider import ModelProxyProvider
- from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import ModelProxyProviderLLMKitWrapper, WatsonXLLMKitWrapper
+ import os
+
  from wxo_agentic_evaluation.arg_configs import ProviderConfig
+ from wxo_agentic_evaluation.service_provider.model_proxy_provider import (
+ ModelProxyProvider,
+ )
+ from wxo_agentic_evaluation.service_provider.ollama_provider import (
+ OllamaProvider,
+ )
+ from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import (
+ ModelProxyProviderLLMKitWrapper,
+ WatsonXLLMKitWrapper,
+ )
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import (
+ WatsonXProvider,
+ )
 
- import os
 
- def _instantiate_provider(config: ProviderConfig, is_referenceless_eval: bool = False, **kwargs):
+ def _instantiate_provider(
+ config: ProviderConfig, is_referenceless_eval: bool = False, **kwargs
+ ):
  if config.provider == "watsonx":
  if is_referenceless_eval:
  provider = WatsonXLLMKitWrapper
@@ -22,12 +34,17 @@ def _instantiate_provider(config: ProviderConfig, is_referenceless_eval: bool =
  provider = ModelProxyProvider
  return provider(model_id=config.model_id, **kwargs)
  else:
- raise RuntimeError(f"target provider is not supported {config.provider}")
-
- def get_provider(config: ProviderConfig = None, model_id: str = None, referenceless_eval: bool = False, **kwargs):
- if config:
- return _instantiate_provider(config, **kwargs)
+ raise RuntimeError(
+ f"target provider is not supported {config.provider}"
+ )
+
 
+ def get_provider(
+ config: ProviderConfig = None,
+ model_id: str = None,
+ referenceless_eval: bool = False,
+ **kwargs,
+ ):
  if not model_id:
  raise ValueError("model_id must be provided if config is not supplied")
 
@@ -35,10 +52,13 @@ def get_provider(config: ProviderConfig = None, model_id: str = None, referencel
  config = ProviderConfig(provider="watsonx", model_id=model_id)
  return _instantiate_provider(config, referenceless_eval, **kwargs)
 
- if "WO_API_KEY" in os.environ and "WO_INSTANCE" in os.environ:
+ if "WO_INSTANCE" in os.environ:
  config = ProviderConfig(provider="model_proxy", model_id=model_id)
  return _instantiate_provider(config, referenceless_eval, **kwargs)
+
+ if config:
+ return _instantiate_provider(config, **kwargs)
 
  raise RuntimeError(
  "No provider found. Please either provide a config or set the required environment variables."
- )
+ )