ibm-watsonx-orchestrate-evaluation-framework 1.1.4__py3-none-any.whl → 1.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (35)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/METADATA +1 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/RECORD +35 -31
  3. wxo_agentic_evaluation/analyze_run.py +805 -344
  4. wxo_agentic_evaluation/arg_configs.py +10 -1
  5. wxo_agentic_evaluation/description_quality_checker.py +11 -2
  6. wxo_agentic_evaluation/evaluation_package.py +8 -3
  7. wxo_agentic_evaluation/external_agent/external_validate.py +5 -5
  8. wxo_agentic_evaluation/external_agent/types.py +3 -9
  9. wxo_agentic_evaluation/inference_backend.py +46 -79
  10. wxo_agentic_evaluation/llm_matching.py +14 -2
  11. wxo_agentic_evaluation/main.py +1 -1
  12. wxo_agentic_evaluation/metrics/__init__.py +1 -0
  13. wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
  14. wxo_agentic_evaluation/metrics/metrics.py +43 -1
  15. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  16. wxo_agentic_evaluation/prompt/template_render.py +4 -2
  17. wxo_agentic_evaluation/quick_eval.py +7 -9
  18. wxo_agentic_evaluation/record_chat.py +22 -29
  19. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +139 -100
  20. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -34
  21. wxo_agentic_evaluation/red_teaming/attack_list.py +89 -18
  22. wxo_agentic_evaluation/red_teaming/attack_runner.py +51 -11
  23. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  24. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  25. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  26. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +77 -39
  27. wxo_agentic_evaluation/resource_map.py +3 -1
  28. wxo_agentic_evaluation/service_instance.py +7 -0
  29. wxo_agentic_evaluation/type.py +1 -1
  30. wxo_agentic_evaluation/utils/__init__.py +3 -0
  31. wxo_agentic_evaluation/utils/parsers.py +71 -0
  32. wxo_agentic_evaluation/utils/utils.py +131 -16
  33. wxo_agentic_evaluation/wxo_client.py +80 -0
  34. {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/WHEEL +0 -0
  35. {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/top_level.txt +0 -0
@@ -390,18 +390,18 @@ class PipelineResult(BaseModel):
     )

     @model_validator(mode="after")
-    def compute_overall(cls, values: PipelineResult) -> PipelineResult:
+    def compute_overall(self) -> Self:
         """
         After validation, compute overall_valid as AND of:
           • all semantic is_correct flags
           • if transform exists: all execution_success flags
         """
-        static: StaticResult = values.static
+        static: StaticResult = self.static
         if static:
             # static checks
             ok = static.final_decision

-        sem: SemanticResult = values.semantic
+        sem: SemanticResult = self.semantic
         if sem:
             # semantic checks
             if sem.general and sem.general.metrics:
@@ -441,11 +441,11 @@ class PipelineResult(BaseModel):
             if param_avgs:
                 cat_avgs.append(sum(param_avgs) / len(param_avgs))

-        values.overall_avg_score = (
+        self.overall_avg_score = (
             sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
         )
-        values.overall_valid = ok
-        return values
+        self.overall_valid = ok
+        return self


 # ----------------------------------------------------------------------
@@ -531,17 +531,17 @@ class ToolFunctionCall(BaseModel):
     )

     @model_validator(mode="after")
-    def _parse_arguments(cls, values: ToolFunctionCall) -> ToolFunctionCall:
+    def _parse_arguments(self) -> Self:
         """
         After model construction, parse the `arguments` JSON string
         into `parsed_arguments`, or raise a ValidationError.
         """
         try:
-            raw = values.arguments
-            values.parsed_arguments = json.loads(raw)
+            raw = self.arguments
+            self.parsed_arguments = json.loads(raw)
         except json.JSONDecodeError as e:
             raise ValidationError(f"Invalid JSON in arguments: {e}") from e
-        return values
+        return self


 class ToolCall(BaseModel):
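
The three hunks above move the `mode="after"` validators from the Pydantic v1-style classmethod signature (`cls, values`) to Pydantic v2's instance form, which receives the already-validated model as `self` and returns `Self`. A minimal standalone sketch of that pattern (illustrative only; the `Example` model and its fields are not from the package):

from typing import Self  # Python 3.11+; use typing_extensions.Self on older versions

from pydantic import BaseModel, model_validator


class Example(BaseModel):
    a: int
    b: int
    total: int | None = None

    @model_validator(mode="after")
    def compute_total(self) -> Self:
        # Runs after field validation; mutate the instance and return it.
        self.total = self.a + self.b
        return self


print(Example(a=1, b=2).total)  # 3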
@@ -17,6 +17,11 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
 from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.type import Message

+DEFAULT_GENERATION_PARAMS= {
+    "min_new_tokens": 0,
+    "decoding_method": "greedy",
+    "max_new_tokens": 4096,
+}

 class ReferencelessEvaluation:
     """
@@ -31,19 +36,16 @@ class ReferencelessEvaluation:
     def __init__(
         self,
         api_spec: List[Mapping[str, Any]],
-        messages: List[Message],
         model_id: str,
         task_n: str,
         dataset_name: str,
+        runtime_pipeline: bool = True,
+        generation_params = DEFAULT_GENERATION_PARAMS
     ):

         self.metrics_client = get_provider(
             model_id=model_id,
-            params={
-                "min_new_tokens": 0,
-                "decoding_method": "greedy",
-                "max_new_tokens": 4096,
-            },
+            params=generation_params,
             referenceless_eval=True,
         )

@@ -52,39 +54,45 @@ class ReferencelessEvaluation:
             general_metrics=[METRIC_GENERAL_HALLUCINATION_CHECK],
             function_metrics=[METRIC_FUNCTION_SELECTION_APPROPRIATENESS],
             parameter_metrics=None,
+            runtime_pipeline=runtime_pipeline,
         )

         self.task_n = task_n
         self.dataset_name = dataset_name

         self.apis_specs = [ToolSpec.model_validate(spec) for spec in api_spec]
-        self.messages = messages
-
-    def _run_pipeline(self, examples: List[Mapping[str, Any]]):
-        results = []
-        for example in examples:
-            result = self.pipeline.run_sync(
-                conversation=example["context"],
-                inventory=self.apis_specs,
-                call=example["call"],
-                continue_on_static=False,
-                retries=2,
-            )
-            result_dict = result.model_dump()
-            results.append(result_dict)

-        return results
+    @staticmethod
+    def fmt_tool_call(tool_id, tool_call_name, arguments, context):
+        call = {
+            "call": {
+                "id": tool_id,
+                "type": "function",
+                "function": {
+                    "name": tool_call_name,
+                    "arguments": arguments,
+                },
+            },
+            "context": context,
+        }
+
+        return call

-    def run(self):
+    @staticmethod
+    def fmt_msgs_referenceless(
+        messages: List[Message],
+    ) -> List[Mapping[str, Any]]:
+        """Assume that the last item in the `messages` array is the tool call, and preceding items
+        in the messages array is the context.
+        """
         examples = []
-
         processed_data = [
             {
                 k: msg.model_dump().get(k)
                 for k in ["role", "content", "type"]
                 if k in msg.model_dump()
             }
-            for msg in self.messages
+            for msg in messages
         ]

         for idx, message in enumerate(processed_data):
@@ -96,23 +104,48 @@ class ReferencelessEvaluation:
                 tool_call_msg = json.loads(content)
                 if tool_call_msg["name"].startswith("transfer_to"):
                     continue
-
-                call = {
-                    "call": {
-                        "id": tool_call_msg.get("id", "1"),
-                        "type": "function",
-                        "function": {
-                            "name": tool_call_msg["name"],
-                            "arguments": json.dumps(tool_call_msg["args"]),
-                        },
-                    },
-                    "context": context,
-                }
+
+                call = ReferencelessEvaluation.fmt_tool_call(
+                    tool_id=tool_call_msg.get("id", "1"),
+                    tool_call_name=tool_call_msg["name"],
+                    arguments=json.dumps(tool_call_msg["args"]),
+                    context=context
+                )
                 examples.append(call)

-        rich.print(
-            f"[yellow][b][Task-{self.task_n}] There are {len(examples)} examples to analyze"
-        )
+        return examples
+
+    def _run_pipeline(self, examples: List[Mapping[str, Any]]):
+        results = []
+        for example in examples:
+            result = self.pipeline.run_sync(
+                conversation=example["context"],
+                inventory=self.apis_specs,
+                call=example["call"],
+                continue_on_static=False,
+                retries=2,
+            )
+            result_dict = result.model_dump()
+            results.append(result_dict)
+
+        return results
+
+    def run(self, examples: List[Mapping[str, str]], verbose=False):
+        """`examples` should be an array where each element is formatted:
+
+        call = {
+            "call": {
+                "id": tool_call_msg.get("id", "1"),
+                "type": "function",
+                "function": {
+                    "name": tool_call_msg["name"],
+                    "arguments": json.dumps(tool_call_msg["args"]),
+                },
+            },
+            "context": context,
+        }
+        """
+
         examples = [
             {
                 "call": ToolCall.model_validate(ex["call"]),
@@ -120,6 +153,11 @@ class ReferencelessEvaluation:
             }
             for ex in examples
         ]
+
+        if verbose:
+            rich.print(
+                f"[yellow][b][Task-{self.task_n}] There are {len(examples)} examples to analyze"
+            )
         results = self._run_pipeline(examples)

         return results
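
Taken together, these hunks decouple message formatting from evaluation: the constructor no longer takes `messages`, formatting lives in the static `fmt_msgs_referenceless` / `fmt_tool_call` helpers, and `run()` now receives pre-built examples. A hedged usage sketch of the new flow (the import path is inferred from the file list; `api_spec`, `messages`, and the model id are placeholders):

from wxo_agentic_evaluation.referenceless_eval.referenceless_eval import (
    ReferencelessEvaluation,
)

# api_spec: list of tool specs; messages: List[Message] ending in a tool call.
evaluator = ReferencelessEvaluation(
    api_spec=api_spec,
    model_id="your-judge-model-id",   # placeholder
    task_n="1",
    dataset_name="demo",
    runtime_pipeline=True,            # new flag in this release
)

examples = ReferencelessEvaluation.fmt_msgs_referenceless(messages)
results = evaluator.run(examples, verbose=True)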
@@ -1,6 +1,7 @@
 from collections import defaultdict

-from wxo_agentic_evaluation.inference_backend import WXOClient, is_saas_url
+from wxo_agentic_evaluation.utils.utils import is_saas_url
+from wxo_agentic_evaluation.wxo_client import WXOClient


 class ResourceMap:
@@ -34,6 +35,7 @@ class ResourceMap:

         if resp.status_code == 200:
             agents = resp.json()
+            self.all_agent_objs = agents
             for agent in agents:
                 agent_name = agent["name"]
                 tools = [tool_map[id] for id in agent["tools"]]
@@ -247,6 +247,13 @@ def tenant_setup(

     context["active_environment"] = tenant_name

+    # Ensure parent directories exist so tests (which may run in clean envs)
+    # can write these files without raising FileNotFoundError.
+    auth_dir = os.path.dirname(auth_config_path)
+    env_dir = os.path.dirname(env_config_path)
+    os.makedirs(auth_dir, exist_ok=True)
+    os.makedirs(env_dir, exist_ok=True)
+
     with open(auth_config_path, "w") as f:
         yaml.dump(auth_config, f)
     with open(env_config_path, "w") as f:
@@ -131,7 +131,7 @@ class AttackData(BaseModel):

 class AttackData(BaseModel):
     agent: str
-    agents_path: str
+    agents_list_or_path: Union[List[str], str]
     attack_data: AttackData
     story: str
     starting_sentence: str
@@ -1,4 +1,7 @@
 import json
+from wxo_agentic_evaluation.utils.utils import TestCaseResources, add_line_seperator, list_run_files, load_run_metrics, N_A
+from wxo_agentic_evaluation.utils.open_ai_tool_extractor import ToolExtractionOpenAIFormat
+from wxo_agentic_evaluation.utils.parsers import ReferencelessEvalParser


 def json_dump(output_path, object):
@@ -0,0 +1,71 @@
+from typing import Any, List, Mapping, Optional
+
+from wxo_agentic_evaluation.metrics import (
+    Annotation,
+    FailedSemanticTestCases,
+    FailedStaticTestCases,
+)
+
+
+class ReferencelessEvalParser:
+    @staticmethod
+    def static_parser(
+        static_metrics: Mapping[str, Mapping[str, Any]],
+    ) -> List[FailedStaticTestCases]:
+        """
+        static.metrics
+        """
+
+        failed_test_cases = []
+
+        for metric, metric_data in static_metrics.items():
+            if not metric_data.get("valid", False):
+                fail = FailedStaticTestCases(
+                    metric_name=metric,
+                    description=metric_data.get("description"),
+                    explanation=metric_data.get("explanation"),
+                )
+
+                failed_test_cases.append(fail)
+
+        return failed_test_cases
+
+    @staticmethod
+    def parse_annotations(
+        actionable_reccomendations, filters: List[str]
+    ) -> Optional[List[Annotation]]:
+        annotations = [
+            Annotation(
+                parameter_name=recc.get("parameter_name"),
+                recommendation=recc.get("recommendation"),
+                details=recc.get("details"),
+                quote=recc.get("quote"),
+            )
+            for recc in actionable_reccomendations
+            if recc.get("recommendation") in filters
+        ]
+
+        annotations = annotations if annotations else None
+
+        return annotations
+
+    @staticmethod
+    def semantic_parser(
+        metric_name, data, annotation_filters: Optional[List[str]]
+    ):
+        semantic_metric = FailedSemanticTestCases(
+            metric_name=metric_name,
+            evidence=data.get("evidence"),
+            explanation=data.get("explanation"),
+            output=data.get("output"),
+            confidence=data.get("confidence"),
+        )
+
+        if annotation_filters and (
+            annotations := ReferencelessEvalParser.parse_annotations(
+                data.get("actionable_recommendations"), annotation_filters
+            )
+        ):
+            semantic_metric.annotations = annotations
+
+        return semantic_metric
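
For orientation, a small sketch of feeding a `static.metrics`-style mapping through `static_parser`; the input shape (per-metric `valid`, `description`, `explanation` keys) is inferred from the `.get(...)` calls above, and the values are made up:

from wxo_agentic_evaluation.utils.parsers import ReferencelessEvalParser

static_metrics = {
    "schema_check": {"valid": True},
    "required_params_present": {
        "valid": False,
        "description": "All required parameters must be supplied.",
        "explanation": "Parameter 'user_id' is missing from the call.",
    },
}

failed = ReferencelessEvalParser.static_parser(static_metrics)
# -> one FailedStaticTestCases entry, for "required_params_present"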
@@ -2,10 +2,14 @@ import glob
 import json
 import os
 import re
-from typing import List, Optional, Union
+import csv
+from collections import defaultdict
+from pathlib import Path
+from typing import List, Optional, Union, Mapping, Tuple, Any
 from urllib.parse import urlparse

 import yaml
+import rich
 from rich import box, print
 from rich.console import Console, Group
 from rich.panel import Panel
@@ -18,14 +22,20 @@ from wxo_agentic_evaluation.metrics.metrics import (
     KnowledgeBaseMetricSummary,
     ReferenceLessEvalMetrics,
     ToolCallAndRoutingMetrics,
+    EnhancedAnalyzeMetrics,
 )
 from wxo_agentic_evaluation.type import (
     ConversationalConfidenceThresholdScore,
     Message,
+    ExtendedMessage,
 )

 console = Console()

+RUN_FILE_RE = re.compile(
+    r"^(?P<base>.+)\.run(?P<run>\d+)\.(?P<kind>messages(?:\.analyze)?|metrics)\.json$"
+)
+N_A = "N/A"

 class AttackResultsTable:
     def __init__(self, attack_results: dict):
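
The now module-level `RUN_FILE_RE` captures the dataset base, run number, and artifact kind from per-run file names. A quick illustration against hypothetical file names:

import re

RUN_FILE_RE = re.compile(
    r"^(?P<base>.+)\.run(?P<run>\d+)\.(?P<kind>messages(?:\.analyze)?|metrics)\.json$"
)

m = RUN_FILE_RE.match("data3.run2.messages.analyze.json")
assert m and m.group("base") == "data3"
assert m.group("run") == "2" and m.group("kind") == "messages.analyze"

assert RUN_FILE_RE.match("data3.run1.metrics.json").group("kind") == "metrics"
assert RUN_FILE_RE.match("data3.summary.json") is None  # not a per-run artifact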
@@ -65,6 +75,94 @@ class AttackResultsTable:
         console.print(self.table)


+class TestCaseResources:
+    def __init__(self, output_dir: str):
+        """Todo flesh out for all resources that are saved"""
+        self.output_dir = Path(output_dir)
+
+    @property
+    def get_summary(self):
+        summary = []
+
+        with open(self.output_dir / "summary_metrics.csv", "r") as f:
+            reader = csv.reader(f)
+            header = next(reader)
+            for row in reader:
+                summary.append(dict(zip(header, row)))
+
+        return summary
+
+    def get_analyze_messages(
+        self, test_case_name=None, path=None
+    ) -> Tuple[List[ExtendedMessage], Mapping[str, Any]]:
+        test_messages = []
+
+        if test_case_name:
+            path = os.path.join(
+                self.output_dir,
+                "messages",
+                f"{test_case_name}.messages.analyze.json",
+            )
+
+        if not Path(str(path)).is_file():
+            rich.print(f"[r]No analyze file found at {path}")
+            raise Exception(f"No analyze file found at {path}")
+
+        with open(path, "r", encoding="utf-8") as f:
+            temp = json.load(f)
+            meta = None
+            if temp and isinstance(temp[-1], dict) and "meta" in temp[-1]:
+                meta = temp[-1]["meta"]
+                temp = temp[:-1]
+
+            for entry in temp:
+                msg = ExtendedMessage(**entry)
+                test_messages.append(msg)
+
+        return test_messages, meta
+
+    def get_messages(self, test_case_name=None, path=None) -> List[Message]:
+        test_messages = []
+
+        if test_case_name:
+            path = os.path.join(
+                self.output_dir,
+                "messages",
+                f"{test_case_name}.messages.json",
+            )
+
+        if not Path(str(path)).is_file():
+            rich.print(f"[r]No messages file found at {path}")
+            raise Exception(f"No messages file found at {path}")
+
+        with open(path, "r", encoding="utf-8") as f:
+            temp = json.load(f)
+            for entry in temp:
+                msg = Message(**entry)
+                test_messages.append(msg)
+
+        return test_messages
+
+    def get_test_metrics(
+        self, test_case_name=None, path=None
+    ) -> ToolCallAndRoutingMetrics:
+        if test_case_name:
+            path = os.path.join(
+                self.output_dir,
+                "messages",
+                f"{test_case_name}.metrics.json",
+            )
+
+        if not Path(str(path)).is_file():
+            rich.print(f"[r]No metrics file found at {path}")
+            raise Exception(f"No metrics file found at {path}")
+
+        with open(path, "r", encoding="utf-8") as f:
+            metrics = ToolCallAndRoutingMetrics(**json.load(f))
+
+        return metrics
+
+
 class AgentMetricsTable:
     def __init__(self, data):
         self.table = Table(
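
A short usage sketch for the new `TestCaseResources` helper; the expected layout (a `summary_metrics.csv` at the top level and per-test files under `messages/`) is read off the hard-coded paths above, and the directory and test-case names are placeholders:

from wxo_agentic_evaluation.utils.utils import TestCaseResources

resources = TestCaseResources(output_dir="results/run_2024_06_01")

summary_rows = resources.get_summary                      # a property, not a method
messages = resources.get_messages(test_case_name="data3.run1")
analyzed, meta = resources.get_analyze_messages(test_case_name="data3.run1")
metrics = resources.get_test_metrics(test_case_name="data3.run1")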
@@ -123,18 +221,27 @@ def is_ibm_cloud_url(service_url: str) -> bool:

 def add_line_seperator(
     style_config: Optional[Union[str, Style]] = None,
+    print=True,
 ):
+    """
+    Adds a lined seperator provided the style config.
+    `print` is a boolean to indicate if the lined seperator should go to stdout immeadiatly or returned as an object.
+    Set `print` to False, the lined seperator is printed later as part of the pager view for example.
+    """

     if not style_config:
         style = "grey42"
     else:
         style = style_config

-    console.print(
-        Rule(
-            style=style,
+    if print:
+        console.print(
+            Rule(
+                style=style,
+            )
         )
-    )
+    else:
+        return Rule(style=style, characters="==")


 class FaithfulnessTable:
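
With `print=False` the separator is returned as a `rich.rule.Rule` instead of being printed immediately, so it can be composed into a larger renderable such as a pager view. A minimal sketch, assuming the helper is imported via `wxo_agentic_evaluation.utils.utils`:

from rich.console import Console, Group

from wxo_agentic_evaluation.utils.utils import add_line_seperator

console = Console()

add_line_seperator()                    # prints a grey rule straight to stdout
rule = add_line_seperator(print=False)  # returns a Rule("==") for later rendering

with console.pager():
    console.print(Group("section one", rule, "section two"))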
@@ -347,6 +454,7 @@ class ReferencelessEvalPanel:

 # Function to load messages from JSON file
 def load_messages(file_path):
+    """TODO: replace in favor of TestCaseResources.get_messages(...)"""
     with open(file_path, "r") as f:
         try:
             message_data = json.load(f)
@@ -362,7 +470,7 @@ def load_messages(file_path):
             return None


-def load_agents(agents_path: str):
+def load_agents_from_disk(agents_path: str):
     agents_json = glob.glob(os.path.join(agents_path, "*.json"))
     agents_yaml = glob.glob(os.path.join(agents_path, "*.yaml"))

@@ -379,32 +487,39 @@ def load_agents(agents_path: str):
     return agents


-RUN_FILE_RE = re.compile(
-    r"^(?P<base>.+)\.run(?P<run>\d+)\.(?P<kind>messages(?:\.analyze)?|metrics)\.json$"
-)
-
-
-def list_run_files(messages_dir: str, dataset_base: str):
+def list_run_files(messages_dir: str, dataset_base: str, filter_run: int = -1):
     """
     Returns: dict[run_id] -> {"analyze": path|None, "metrics": path|None}
     (We only need analyze+metrics for this feature.)
+
+    `filter_run` only get gets the runs files for that run. If it is -1, then all run files are retrieved
+    For example, if there is `data3.run1.messages.json`, `data3.run2.messages.json`, and filter_run is 2, then,
+    the files related to only the second run are retrieved.
+
     """
-    runs = {}
+    runs = defaultdict(
+        lambda: {"analyze": None, "metrics": None, "messages": None}
+    )
     for fn in os.listdir(messages_dir):
         m = RUN_FILE_RE.match(fn)
         if not m or m.group("base") != dataset_base:
             continue
         run_id = int(m.group("run"))
+        if filter_run != -1 and run_id != filter_run:
+            continue
+
         kind = m.group("kind")
-        entry = runs.setdefault(run_id, {"analyze": None, "metrics": None})
         full = os.path.join(messages_dir, fn)
         if kind == "messages.analyze":
-            entry["analyze"] = full
+            runs[run_id]["analyze"] = full
         elif kind == "metrics":
-            entry["metrics"] = full
+            runs[run_id]["metrics"] = full
+        elif kind == "messages":
+            runs[run_id]["messages"] = full
     return runs


 def load_run_metrics(metrics_path: str) -> ToolCallAndRoutingMetrics:
+    """Todo remove in a later PR"""
     with open(metrics_path, "r", encoding="utf-8") as f:
         return ToolCallAndRoutingMetrics(**json.load(f))
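
Following the docstring's own example, a sketch of the new `filter_run` parameter (directory and file names are hypothetical):

from wxo_agentic_evaluation.utils.utils import list_run_files

# messages/ holds data3.run1.messages.json, data3.run1.metrics.json,
#                 data3.run2.messages.json, data3.run2.metrics.json, ...
all_runs = list_run_files("results/messages", "data3")                 # every run
second_run = list_run_files("results/messages", "data3", filter_run=2)
# -> {2: {"analyze": None,
#         "metrics": "results/messages/data3.run2.metrics.json",
#         "messages": "results/messages/data3.run2.messages.json"}}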
@@ -0,0 +1,80 @@
+import os
+import requests
+import urllib3
+from urllib3.exceptions import InsecureRequestWarning
+from typing import Dict, Any, Optional
+
+from wxo_agentic_evaluation.service_instance import tenant_setup
+
+
+class WXOClient:
+    def __init__(
+        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
+    ):
+        self.service_url = service_url
+        self.api_key = api_key
+
+        ov = os.getenv("WO_SSL_VERIFY")
+        if ov and ov.strip().lower() in ("true", "false"):
+            self._verify_ssl = ov.strip().lower() == "true"
+        else:
+            v, bs = (env.get("verify") if env else None), (
+                env.get("bypass_ssl") if env else None
+            )
+            self._verify_ssl = (
+                False
+                if (
+                    (bs is True)
+                    or (isinstance(bs, str) and bs.strip().lower() == "true")
+                    or (v is None)
+                    or (
+                        isinstance(v, str)
+                        and v.strip().lower() in {"none", "null"}
+                    )
+                )
+                else (v if isinstance(v, bool) else True)
+            )
+
+        if not self._verify_ssl:
+            urllib3.disable_warnings(InsecureRequestWarning)
+
+    def _get_headers(self) -> dict:
+        headers = {}
+        if self.api_key:
+            headers["Authorization"] = f"Bearer {self.api_key}"
+        return headers
+
+    def post(self, payload: dict, path: str, stream=False):
+        url = f"{self.service_url}/{path}"
+        return requests.post(
+            url=url,
+            headers=self._get_headers(),
+            json=payload,
+            stream=stream,
+            verify=self._verify_ssl,
+        )
+
+    def get(self, path: str, params: dict = None):
+        url = f"{self.service_url}/{path}"
+        return requests.get(
+            url,
+            params=params,
+            headers=self._get_headers(),
+            verify=self._verify_ssl,
+        )
+
+
+def get_wxo_client(
+    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
+) -> WXOClient:
+
+    token, resolved_url, env = tenant_setup(service_url, tenant_name)
+    service_url = service_url or resolved_url
+
+    if not (service_url and str(service_url).strip()):
+        raise ValueError(
+            f"service_url not provided and not found in config for tenant '{tenant_name}'"
+        )
+
+    wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
+    return wxo_client
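
A closing usage sketch for the new module: `get_wxo_client` resolves the token, service URL, and environment through `tenant_setup`, and the `WO_SSL_VERIFY` environment variable overrides any `verify`/`bypass_ssl` values from the tenant config. The tenant name and request path below are placeholders, not documented endpoints:

import os

from wxo_agentic_evaluation.wxo_client import get_wxo_client

os.environ["WO_SSL_VERIFY"] = "false"   # optional; wins over the env config's verify/bypass_ssl

client = get_wxo_client(service_url=None, tenant_name="dev-tenant")

resp = client.get("v1/orchestrate/agents", params={"limit": 100})  # illustrative path only
resp.raise_for_status()
agents = resp.json()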