ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,71 @@
1
+ from typing import Any, List, Mapping, Optional
2
+
3
+ from wxo_agentic_evaluation.metrics.metrics import (
4
+ Annotation,
5
+ FailedSemanticTestCases,
6
+ FailedStaticTestCases,
7
+ )
8
+
9
+
10
class ReferencelessEvalParser:
    """Turn raw referenceless-evaluation result mappings into typed records."""

    @staticmethod
    def static_parser(
        static_metrics: Mapping[str, Mapping[str, Any]],
    ) -> List[FailedStaticTestCases]:
        """
        Collect the static metrics that did not pass.

        Args:
            static_metrics: Mapping of metric name to that metric's result
                data. A metric is treated as failed when its "valid" entry
                is missing or falsy.

        Returns:
            One FailedStaticTestCases per failed metric, in iteration order.
        """
        return [
            FailedStaticTestCases(
                metric_name=metric,
                description=metric_data.get("description"),
                explanation=metric_data.get("explanation"),
            )
            for metric, metric_data in static_metrics.items()
            if not metric_data.get("valid", False)
        ]

    @staticmethod
    def parse_annotations(
        actionable_reccomendations, filters: List[str]
    ) -> Optional[List[Annotation]]:
        """
        Build annotations from the recommendations selected by `filters`.

        Args:
            actionable_reccomendations: Iterable of recommendation mappings,
                or None/empty when the result carries no recommendations.
            filters: Recommendation values to keep.

        Returns:
            The matching annotations, or None when nothing matches.
        """
        # semantic_parser passes data.get("actionable_recommendations"),
        # which is None when the key is absent; iterating None would raise.
        if not actionable_reccomendations:
            return None

        annotations = [
            Annotation(
                parameter_name=recc.get("parameter_name"),
                recommendation=recc.get("recommendation"),
                details=recc.get("details"),
                quote=recc.get("quote"),
            )
            for recc in actionable_reccomendations
            if recc.get("recommendation") in filters
        ]

        return annotations or None

    @staticmethod
    def semantic_parser(
        metric_name, data, annotation_filters: Optional[List[str]]
    ):
        """
        Build a FailedSemanticTestCases record from one metric's result data.

        Args:
            metric_name: Name stored on the resulting record.
            data: Mapping read for "evidence", "explanation", "output",
                "confidence" and "actionable_recommendations".
            annotation_filters: Recommendation values to keep as annotations;
                when None or empty no annotations are attached.

        Returns:
            The populated FailedSemanticTestCases instance.
        """
        semantic_metric = FailedSemanticTestCases(
            metric_name=metric_name,
            evidence=data.get("evidence"),
            explanation=data.get("explanation"),
            output=data.get("output"),
            confidence=data.get("confidence"),
        )

        # Only set the attribute when at least one annotation survived the
        # filter, so the record keeps its own default otherwise.
        if annotation_filters and (
            annotations := ReferencelessEvalParser.parse_annotations(
                data.get("actionable_recommendations"), annotation_filters
            )
        ):
            semantic_metric.annotations = annotations

        return semantic_metric
@@ -1,10 +1,15 @@
1
+ import csv
1
2
  import glob
2
3
  import json
4
+ import math
3
5
  import os
4
6
  import re
5
- from typing import List, Optional, Union
7
+ from collections import defaultdict
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
6
10
  from urllib.parse import urlparse
7
11
 
12
+ import rich
8
13
  import yaml
9
14
  from rich import box, print
10
15
  from rich.console import Console, Group
@@ -17,14 +22,25 @@ from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness
17
22
  from wxo_agentic_evaluation.metrics.metrics import (
18
23
  KnowledgeBaseMetricSummary,
19
24
  ReferenceLessEvalMetrics,
25
+ ToolCallAndRoutingMetrics,
20
26
  )
21
27
  from wxo_agentic_evaluation.type import (
22
28
  ConversationalConfidenceThresholdScore,
29
+ ExtendedMessage,
23
30
  Message,
24
31
  )
25
32
 
26
33
  console = Console()
27
34
 
35
+ RUN_FILE_RE = re.compile(
36
+ r"^(?P<base>.+)\.run(?P<run>\d+)\.(?P<kind>messages(?:\.analyze)?|metrics)\.json$"
37
+ )
38
+ N_A = "N/A"
39
+
40
+ # File name constants
41
+ REFERENCE_FILE_NAME = "reference"
42
+ EXPERIMENT_FILE_NAME = "experiment"
43
+
28
44
 
29
45
  class AttackResultsTable:
30
46
  def __init__(self, attack_results: dict):
@@ -64,10 +80,100 @@ class AttackResultsTable:
64
80
  console.print(self.table)
65
81
 
66
82
 
83
class TestCaseResources:
    """
    Reads the artifacts saved for test cases under one output directory.

    TODO: flesh out for all resources that are saved.
    """

    def __init__(self, output_dir: str):
        """
        Args:
            output_dir: Directory holding `summary_metrics.csv` and the
                `messages` sub-directory.
        """
        self.output_dir = Path(output_dir)

    @property
    def get_summary(self):
        """
        Rows of `summary_metrics.csv` as dicts keyed by the header row.

        Values are returned as the raw strings read from the file. A file
        with no header row yields an empty list.
        """
        # newline="" is what the csv module expects from its file object.
        with open(
            self.output_dir / "summary_metrics.csv", "r", newline=""
        ) as f:
            reader = csv.reader(f)
            header = next(reader, None)
            if header is None:
                return []
            return [dict(zip(header, row)) for row in reader]

    def _resolve_file(self, test_case_name, path, file_suffix, label):
        """Return the file to read, raising when it does not exist."""
        # A test case name takes precedence over an explicit path.
        if test_case_name:
            path = os.path.join(
                self.output_dir,
                "messages",
                f"{test_case_name}.{file_suffix}",
            )

        if not Path(str(path)).is_file():
            rich.print(f"[r]No {label} file found at {path}")
            raise Exception(f"No {label} file found at {path}")

        return path

    def get_analyze_messages(
        self, test_case_name=None, path=None
    ) -> Tuple[List[ExtendedMessage], Optional[Mapping[str, Any]]]:
        """
        Load `<test_case_name>.messages.analyze.json` (or `path`).

        Args:
            test_case_name: Test case whose file is looked up under
                `<output_dir>/messages`; overrides `path` when given.
            path: Explicit file location used when no name is given.

        Returns:
            The parsed messages and the trailing "meta" entry, or None for
            the latter when the file has no such entry.

        Raises:
            Exception: If the resolved file does not exist.
        """
        path = self._resolve_file(
            test_case_name, path, "messages.analyze.json", "analyze"
        )

        with open(path, "r", encoding="utf-8") as f:
            entries = json.load(f)

        meta = None
        # A final dict entry carrying "meta" is metadata, not a message.
        if entries and isinstance(entries[-1], dict) and "meta" in entries[-1]:
            meta = entries[-1]["meta"]
            entries = entries[:-1]

        return [ExtendedMessage(**entry) for entry in entries], meta

    def get_messages(self, test_case_name=None, path=None) -> List[Message]:
        """
        Load `<test_case_name>.messages.json` (or `path`) as Message objects.

        Raises:
            Exception: If the resolved file does not exist.
        """
        path = self._resolve_file(
            test_case_name, path, "messages.json", "messages"
        )

        with open(path, "r", encoding="utf-8") as f:
            entries = json.load(f)

        return [Message(**entry) for entry in entries]

    def get_test_metrics(
        self, test_case_name=None, path=None
    ) -> ToolCallAndRoutingMetrics:
        """
        Load `<test_case_name>.metrics.json` (or `path`).

        Raises:
            Exception: If the resolved file does not exist.
        """
        path = self._resolve_file(
            test_case_name, path, "metrics.json", "metrics"
        )

        with open(path, "r", encoding="utf-8") as f:
            return ToolCallAndRoutingMetrics(**json.load(f))
+
170
+
67
171
  class AgentMetricsTable:
68
- def __init__(self, data):
172
+ def __init__(self, data, title: Optional[str] = None):
173
+ if title is None:
174
+ title = "Agent Metrics"
69
175
  self.table = Table(
70
- title="Agent Metrics",
176
+ title=title,
71
177
  box=box.ROUNDED,
72
178
  show_lines=True,
73
179
  )
@@ -88,7 +194,9 @@ class AgentMetricsTable:
88
194
  console.print(self.table)
89
195
 
90
196
 
91
- def create_table(data: List[dict]) -> AgentMetricsTable:
197
+ def create_table(
198
+ data: List[dict], title: Optional[str] = None
199
+ ) -> AgentMetricsTable:
92
200
  """
93
201
  Generate a Rich table from a list of dictionaries.
94
202
  Returns the AgentMetricsTable instance.
@@ -100,7 +208,47 @@ def create_table(data: List[dict]) -> AgentMetricsTable:
100
208
  print("create_table() received an empty dataset. No table generated.")
101
209
  return None
102
210
 
103
- return AgentMetricsTable(data)
211
+ return AgentMetricsTable(data, title=title)
212
+
213
+
214
def mean(vals: List[float]) -> float:
    """
    Arithmetic mean of `vals`, rounded to two decimal places.

    Args:
        vals: Values to average

    Returns:
        The rounded mean, or 0.0 when `vals` is empty
    """
    if not vals:
        return 0.0

    total = sum(vals)
    return round(total / len(vals), 2)
225
+
226
+
227
+ def to_pct(value: float | None, decimals: int = 0) -> str:
228
+ """
229
+ Convert a value to a percentage string.
230
+
231
+ Args:
232
+ value: Value to convert
233
+ decimals: Number of decimal places
234
+
235
+ Returns:
236
+ Percentage string
237
+ """
238
+ if value is None:
239
+ return "NA"
240
+ try:
241
+ return f"{round(float(value) * 100, decimals)}%"
242
+ except Exception:
243
+ return "NA"
244
+
245
+
246
def average(array) -> float:
    """Return the unrounded mean of `array`, or NaN when it is empty."""
    count = len(array)
    return sum(array) / count if count != 0 else math.nan
104
252
 
105
253
 
106
254
  def safe_divide(nom, denom):
@@ -122,20 +270,114 @@ def is_ibm_cloud_url(service_url: str) -> bool:
122
270
 
123
271
  def add_line_seperator(
124
272
  style_config: Optional[Union[str, Style]] = None,
273
+ print=True,
125
274
  ):
275
+ """
276
+ Adds a lined seperator provided the style config.
277
+ `print` is a boolean to indicate if the lined seperator should go to stdout immeadiatly or returned as an object.
278
+ Set `print` to False, the lined seperator is printed later as part of the pager view for example.
279
+ """
126
280
 
127
281
  if not style_config:
128
282
  style = "grey42"
129
283
  else:
130
284
  style = style_config
131
285
 
132
- console.print(
133
- Rule(
134
- style=style,
286
+ if print:
287
+ console.print(
288
+ Rule(
289
+ style=style,
290
+ )
135
291
  )
292
+ else:
293
+ return Rule(style=style, characters="==")
294
+
295
+
296
def get_reference_column(base_name: str) -> str:
    """Return `base_name` followed by "_" and the reference file name."""
    return "{}_{}".format(base_name, REFERENCE_FILE_NAME)
299
+
300
+
301
def get_experiment_column(base_name: str) -> str:
    """Return `base_name` followed by "_" and the experiment file name."""
    return "{}_{}".format(base_name, EXPERIMENT_FILE_NAME)
304
+
305
+
306
def get_diff_column(base_name: str) -> str:
    """Return the name of the diff column derived from `base_name`."""
    return "{}_diff".format(base_name)
309
+
310
+
311
def get_column_value(
    row: Dict[str, Any], base_name: str, file_type: str
) -> Any:
    """Look up the reference or experiment variant of a column in a row.

    Args:
        row: The data row
        base_name: The base column name
        file_type: 'reference' or 'experiment' (case-insensitive)

    Returns:
        The value stored under the suffixed column, or None if it is absent

    Raises:
        ValueError: If `file_type` is neither of the two accepted values
    """
    kind = file_type.lower()
    if kind not in ("reference", "experiment"):
        raise ValueError(f"Invalid file_type: {file_type}")

    if kind == "reference":
        column = get_reference_column(base_name)
    else:
        column = get_experiment_column(base_name)

    return row.get(column)
332
+
333
+
334
def has_column_in_both(row: Dict[str, Any], base_name: str) -> bool:
    """Tell whether `row` holds both the reference and experiment columns."""
    if get_reference_column(base_name) not in row:
        return False
    return get_experiment_column(base_name) in row
137
340
 
138
341
 
342
def format_ratio(ratio: Optional[float]) -> str:
    """Render a ratio as a one-decimal percentage, or "N/A" for None."""
    return "N/A" if ratio is None else f"{ratio * 100:.1f}%"
347
+
348
+
349
def read_file(path: str, type: str = "csv") -> List[Dict[str, Any]]:
    """Read `path` according to `type`; only "csv" is currently supported.

    Raises:
        NotImplementedError: For `type == "json"`
        ValueError: For any other unsupported `type`
    """
    if type == "csv":
        return read_csv_file(path)

    if type == "json":
        # Add JSON reading logic if needed
        raise NotImplementedError("JSON reading not yet implemented")

    raise ValueError(f"Unsupported file type: {type}")
358
+
359
+
360
def read_csv_file(file_path: str) -> List[Dict[str, Any]]:
    """Read a CSV file and return one dictionary per data row.

    Cell conversion:
      * "dataset_name" and "text_match" are kept as read.
      * "is_success" becomes True when the text is "true" (any case),
        otherwise False.
      * every other cell becomes a float when `float()` accepts it and is
        kept as read otherwise.

    Cells that are not text (None for a row shorter than the header, a list
    for surplus cells) are passed through unchanged instead of raising.

    Args:
        file_path: Path of the CSV file; the first row is the header.

    Returns:
        The converted rows in file order.
    """
    text_columns = ("dataset_name", "text_match")
    data = []
    # newline="" is what the csv module expects from its file object.
    with open(file_path, "r", newline="") as f:
        for raw_row in csv.DictReader(f):
            row = {}
            for key, value in raw_row.items():
                if key in text_columns or not isinstance(value, str):
                    row[key] = value
                elif key == "is_success":
                    row[key] = value.lower() == "true"
                else:
                    try:
                        row[key] = float(value)
                    except ValueError:
                        # Non-numeric text stays as it was read.
                        row[key] = value
            data.append(row)
    return data
379
+
380
+
139
381
  class FaithfulnessTable:
140
382
  def __init__(
141
383
  self, faithfulness_metrics: List[Faithfulness], tool_call_ids: List[str]
@@ -346,6 +588,7 @@ class ReferencelessEvalPanel:
346
588
 
347
589
  # Function to load messages from JSON file
348
590
  def load_messages(file_path):
591
+ """TODO: replace in favor of TestCaseResources.get_messages(...)"""
349
592
  with open(file_path, "r") as f:
350
593
  try:
351
594
  message_data = json.load(f)
@@ -361,7 +604,7 @@ def load_messages(file_path):
361
604
  return None
362
605
 
363
606
 
364
- def load_agents(agents_path: str):
607
+ def load_agents_from_disk(agents_path: str):
365
608
  agents_json = glob.glob(os.path.join(agents_path, "*.json"))
366
609
  agents_yaml = glob.glob(os.path.join(agents_path, "*.yaml"))
367
610
 
@@ -376,3 +619,64 @@ def load_agents(agents_path: str):
376
619
  agents.append(yaml.safe_load(f))
377
620
 
378
621
  return agents
622
+
623
+
624
def list_run_files(messages_dir: str, dataset_base: str, filter_run: int = -1):
    """
    Group the per-run files of one dataset by run id.

    Returns: dict[run_id] -> {"analyze": path|None, "metrics": path|None,
    "messages": path|None}

    Only file names matching RUN_FILE_RE whose base equals `dataset_base`
    are considered.

    `filter_run` restricts the result to that run. If it is -1, the files of
    all runs are retrieved. For example, if there is
    `data3.run1.messages.json`, `data3.run2.messages.json`, and filter_run
    is 2, then only the files related to the second run are retrieved.
    """
    slot_for_kind = {
        "messages.analyze": "analyze",
        "metrics": "metrics",
        "messages": "messages",
    }
    runs = defaultdict(
        lambda: {"analyze": None, "metrics": None, "messages": None}
    )

    for file_name in os.listdir(messages_dir):
        match = RUN_FILE_RE.match(file_name)
        if match is None or match.group("base") != dataset_base:
            continue

        run_id = int(match.group("run"))
        if filter_run not in (-1, run_id):
            continue

        slot = slot_for_kind.get(match.group("kind"))
        if slot is not None:
            runs[run_id][slot] = os.path.join(messages_dir, file_name)

    return runs
654
+
655
+
656
def load_run_metrics(metrics_path: str) -> ToolCallAndRoutingMetrics:
    """
    Build a ToolCallAndRoutingMetrics from the JSON object at `metrics_path`.

    TODO: remove in a later PR.
    """
    with open(metrics_path, "r", encoding="utf-8") as f:
        payload = json.load(f)
    return ToolCallAndRoutingMetrics(**payload)
660
+
661
+
662
def csv_dump(file_path: Union[str, Path], rows: List[Dict[str, Any]]) -> None:
    """
    Write rows to a CSV file.

    The header is the union of the keys of all rows, in first-seen order, so
    rows may carry different key sets; cells a row does not provide are left
    empty. Nothing is written when `rows` is empty.

    Args:
        file_path: Path to the output CSV file; missing parent directories
            are created
        rows: List of dictionaries representing CSV rows
    """
    if not rows:
        return

    # Ensure the parent directory exists
    file_path = Path(file_path)
    file_path.parent.mkdir(parents=True, exist_ok=True)

    # Taking the header from the first row alone makes DictWriter raise
    # ValueError as soon as a later row has an additional key.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))

    # Write to CSV
    with open(file_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
@@ -0,0 +1,81 @@
1
+ import os
2
+ from typing import Any, Dict, Optional
3
+
4
+ import requests
5
+ import urllib3
6
+ from urllib3.exceptions import InsecureRequestWarning
7
+
8
+ from wxo_agentic_evaluation.service_instance import tenant_setup
9
+
10
+
11
class WXOClient:
    """Small HTTP client that sends bearer-authenticated requests to a service URL."""

    def __init__(
        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
    ):
        """
        Args:
            service_url: Base URL that request paths are appended to.
            api_key: Token sent as `Authorization: Bearer <api_key>`; no
                header is sent when it is falsy.
            env: Optional settings mapping; its "verify" and "bypass_ssl"
                entries are consulted for TLS verification.
        """
        self.service_url = service_url
        self.api_key = api_key
        self._verify_ssl = self._resolve_verify_ssl(env)

        if not self._verify_ssl:
            # Unverified requests would otherwise warn on every call.
            urllib3.disable_warnings(InsecureRequestWarning)

    @staticmethod
    def _resolve_verify_ssl(env: Optional[Dict[str, Any]]) -> bool:
        """Decide whether TLS certificates are verified.

        Order of precedence:
          1. WO_SSL_VERIFY environment variable, when it is "true"/"false".
          2. env["bypass_ssl"] being True or the text "true" -> no verification.
          3. env["verify"] missing, None, or the text "none"/"null"
             -> no verification.
          4. env["verify"] when it is a bool, otherwise verification is on.
        """
        override = os.getenv("WO_SSL_VERIFY")
        if override and override.strip().lower() in ("true", "false"):
            return override.strip().lower() == "true"

        verify = env.get("verify") if env else None
        bypass = env.get("bypass_ssl") if env else None

        if bypass is True:
            return False
        if isinstance(bypass, str) and bypass.strip().lower() == "true":
            return False
        if verify is None:
            return False
        if isinstance(verify, str) and verify.strip().lower() in {
            "none",
            "null",
        }:
            return False

        return verify if isinstance(verify, bool) else True

    def _get_headers(self) -> dict:
        """Return the request headers, with a bearer token when one is set."""
        headers = {}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        return headers

    def _build_url(self, path: str) -> str:
        """Join the service URL and `path` with exactly one slash."""
        # A trailing slash on the base or a leading slash on the path would
        # otherwise produce "//" in the request URL.
        base = str(self.service_url).rstrip("/")
        return f"{base}/{str(path).lstrip('/')}"

    def post(self, payload: dict, path: str, stream=False):
        """POST `payload` as JSON to `path` and return the response object."""
        return requests.post(
            url=self._build_url(path),
            headers=self._get_headers(),
            json=payload,
            stream=stream,
            verify=self._verify_ssl,
        )

    def get(self, path: str, params: Optional[dict] = None):
        """GET `path` with optional query `params` and return the response object."""
        return requests.get(
            self._build_url(path),
            params=params,
            headers=self._get_headers(),
            verify=self._verify_ssl,
        )
66
+
67
+
68
def get_wxo_client(
    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
) -> WXOClient:
    """
    Build a WXOClient for a tenant.

    Args:
        service_url: Service URL to use; when empty, the URL returned by
            `tenant_setup` is used instead.
        tenant_name: Tenant passed to `tenant_setup`.
        token: Token to authenticate with; when omitted, the token returned
            by `tenant_setup` is used.

    Returns:
        A client configured with the resolved URL, token and env settings.

    Raises:
        ValueError: If no service URL was given and none was resolved.
    """
    # tenant_setup is always called because its URL and env are needed even
    # when the caller supplies a token. Its token must not clobber an
    # explicitly passed one.
    resolved_token, resolved_url, env = tenant_setup(service_url, tenant_name)
    service_url = service_url or resolved_url
    api_key = token or resolved_token

    if not (service_url and str(service_url).strip()):
        raise ValueError(
            f"service_url not provided and not found in config for tenant '{tenant_name}'"
        )

    return WXOClient(service_url=service_url, api_key=api_key, env=env)
@@ -1,102 +0,0 @@
1
- wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- wxo_agentic_evaluation/analyze_run.py,sha256=Ji3aVrEJoF47nkFHdJWp_j3JSqzYAmnLJAg_H2Y-Qgs,13295
3
- wxo_agentic_evaluation/annotate.py,sha256=PwgRBAIVBW_yEoOLYNHg9-XVo78zzTXb68kzR2JbtCM,1230
4
- wxo_agentic_evaluation/arg_configs.py,sha256=KttX3LFPXjg4qRlbeQ-fQ4Qp5-9_Uz5tt4TCx93KRAY,3028
5
- wxo_agentic_evaluation/batch_annotate.py,sha256=cZWhDt4k1Tap-e7Mw48tOXKwlHk-JxUvDT6N6n_A7PA,6694
6
- wxo_agentic_evaluation/data_annotator.py,sha256=ovrymEn2Jlivffg9_mtW7sT73_iOBLU5mX9n5hQQWPo,8398
7
- wxo_agentic_evaluation/description_quality_checker.py,sha256=Skmt_X-z5rJ9-rBXu5acp0sxq_LyjL0sOOYQVcn25K4,6163
8
- wxo_agentic_evaluation/evaluation.py,sha256=ZeMmxSbJyA86CVjX8EUdMsVrn25MMqMYO91DZqbe7f0,1090
9
- wxo_agentic_evaluation/evaluation_package.py,sha256=Ud1h7HDr47Gs4XPUoPagm6oS54Iqb_UWGlcyKoCLnfE,24319
10
- wxo_agentic_evaluation/inference_backend.py,sha256=mG7Z-Hi63znfJ7vzwCCYNPMc6AHgu7Codnw4puoAM3U,33004
11
- wxo_agentic_evaluation/llm_matching.py,sha256=HY_4T_4-JXr08Z8o0XWcZfyrzxM0hBpCYGbwh7uSOkw,1479
12
- wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
13
- wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7MeqScoM,1519
14
- wxo_agentic_evaluation/main.py,sha256=5yfynZkzYl52by-7xNMuNdN2FKGEamM-6k-w6fkg6ew,13574
15
- wxo_agentic_evaluation/quick_eval.py,sha256=7JWBB4Q5psZi2O26wZkQgPudu3uNZZBFrZSYou2ivgw,12876
16
- wxo_agentic_evaluation/record_chat.py,sha256=uDnc0r5rZdy-KZv36ntBMZMzTiv2pcbHayakg_seZGg,8660
17
- wxo_agentic_evaluation/resource_map.py,sha256=fmkLcQEx4_tpc46rkSoEOsvmd5WMkxaJpIfgqaR31Ms,1646
18
- wxo_agentic_evaluation/service_instance.py,sha256=lAwfIRJD20vOZFsmtqBt7z4-AmIWE-Fu5VGjmVeyoso,8506
19
- wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
20
- wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
21
- wxo_agentic_evaluation/type.py,sha256=wAqE7sHEOuAD6s-GxLzdPdMyyjNqh-jOuV-KJR5zH5U,4047
22
- wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=mI2fyYzbLpSjSr2iwSwpjrOAenxvfA-6h9z2oky0uMs,18349
23
- wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24HY31K400M1a-O5xKfU,6250
24
- wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
25
- wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
26
- wxo_agentic_evaluation/external_agent/__init__.py,sha256=P1T0JYPIZeVyEYRqpEMKqGORQ1h_fVRvm9_lra9U0Q4,1570
27
- wxo_agentic_evaluation/external_agent/external_validate.py,sha256=gBnizwTIYRHjkVvomgY0hlS44N_n_7ld3YAQ5PFZdfU,4200
28
- wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
29
- wxo_agentic_evaluation/external_agent/types.py,sha256=Fu_hPBk-vhJ5kAAi6nwVRTrnr0oaxcoV7aXHsJwxYlg,1653
30
- wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
- wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=2GvvenWwWn-PV6HAwqL6-L-Wt6jCE8AthQTrtFAh8f4,1218
32
- wxo_agentic_evaluation/metrics/metrics.py,sha256=X2Bapjc7aGwU8--evEYOr2x-CNGsMMBTmMP1dXnURUo,6336
33
- wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=RGfaM0jx5WmF4y1EnVsE2FWpfDQEoL3_sIMNcMUbZuQ,2023
34
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py,sha256=gY5m5INv0IQrA4Xi2wigAUI1cnxzGPYtMLWCIo9pubQ,5602
35
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=6fU2nXtMZUiQuhp2w2ByYigeo7wlOmUSo1CEHcb1iqE,649
36
- wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=Z7NglyL-uI4vzb1Lum9aEdPZ7x_J2g1A-MKEABRX3nU,67543
37
- wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
- wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
39
- wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
40
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2,sha256=_Ty6QDcQcbde2ZP2HVvFtOCm_2mFu_1cUM6qj11MvcU,8085
41
- wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
42
- wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
43
- wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
44
- wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
45
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=GAHtEJvFNtgWBQma1I9KJdhXdhmqbEQf7JY66Z1JLMU,1113
46
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=yrLtXfmVIJ_C3XIaTvpqlQGlg9kKIibrVR3UzpzBEmo,1288
47
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=90KF7fXW5PPwY8IkPqA6ZflDMkr_KFDpO9H_mVGdGf8,2212
48
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
49
- wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
50
- wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
51
- wxo_agentic_evaluation/prompt/template_render.py,sha256=xVy7NOeGk5_XxzTT-YIY4HVAseQFU2SbRMSdvQGa-FE,4829
52
- wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
53
- wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
54
- wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
55
- wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
56
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=pfhMUjddv32pIRewea7o1vn_xrV_LuyC8vRlJ7qVyO8,5267
57
- wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=Sz9zB5O1ct7EoZCog8GNdwj8yWFZo7HJLPbA9HvelZc,11886
58
- wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=edphWARWqDtXFtcHTVbRXngvO0YfG5SgrfPtrBRXuFw,4734
59
- wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=XXNP43mEneuDBo_zGPdCVNRdUNy-KGd7kbIKYwKhKJQ,4477
60
- wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
61
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=G2b7rwN0VTLBVGwU1VXKUl4eqT8Ya8zCcOorwkZwrZA,4354
62
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
64
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
65
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=IEyo5H_TTrzMLPD9y2eFDCSTB80G5QetZRiUhRlCx-A,852
66
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=3JDWWjYuYfGwa2uYLXaxGETMuppGld5c901h_-YkFO4,7645
67
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
68
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py,sha256=P1ytcARCy696ZCLq9tcaQWgaolmu0ON_kySmmTeyBtc,1549
69
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json,sha256=bVSyudTk2Nim1DcgQZf8ilOTszY2Kgm4YU6beHWvEhQ,40475
70
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py,sha256=UwPMCn7T2BEut7Mbj6U5UJvb2AAZw03BiB9wEjSCheg,1017
72
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json,sha256=o4oRur1MiXO2RYzmzj07QOBzX75DyU7T7yd-gFsgFdo,30563
73
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=QP43RjUfozozXBtYEzPHv7EC3pdwIWLdNRsJ8xzvcjU,3701
75
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=f4GmTXNTBeH171GGRWaDCIRuFPRyuVMy62evWV8TEl8,9713
76
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=Fm0unqhpFBxeofTQjQaLl_SZFSFke7K7S56t46812-E,17589
77
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=0m4iHqb68psvLMNQasFaaxgQP5XmmGjBkuID8aw5Kv8,6069
78
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py,sha256=tW1wc87WIm8BZh2lhdj1RDP6VdRLqZBWSMmemSttbGs,22034
79
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=mm7eOx6a_2ExDgck29IkgAzjeQkICpMDXecuxa6ZULo,17182
80
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py,sha256=uJc7ZwK6pJpsMIuBSBXUtxdvd-aiRnOXGX3aeyvw2ik,151
81
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=ki6ZqLfg9f6il7Pk7FxqwZLeZDuZFKwON_hKPNH5jkg,8446
82
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=bDRYG-HObwFvi4-CS7am4F_9WPXqh6T4UzNIrxqynsY,12331
83
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=pt-XIVTzJn5c3_lM1H6r82ag5c_uxdA5PPCyCwBV1O8,6012
84
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oPqvweJd8cPYj2pgyJwS-2_HwvE2PP-s,15112
85
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
86
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
87
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
88
- wxo_agentic_evaluation/service_provider/__init__.py,sha256=Xu-Wdo7vZI6iNKFp4cNGo7rXv-OQ4BkgLaKeCfALCrk,2162
89
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=VN1DFF1woJcjijwj3lMA0JS-9pxJ6fXSYu91Ah7nTNE,9866
90
- wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=OCpnqd8E9WUqPGc7Q01L5HWVIZsZ5V5-XvjhcwvqRA4,1097
91
- wxo_agentic_evaluation/service_provider/provider.py,sha256=OkMjZ_xHPXy-YqkBbKXC4K67VWJrCQb1nSZxMRt-a4g,416
92
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=hM085FbKEBM_LC2O-rURtGx-RMBtulbm1FAZa73k1gg,5321
93
- wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=LYSpxOI2oMQSysasb8WT_nn5SdDy-dsLFyJDJHXFtn0,6876
94
- wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
95
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
96
- wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
97
- wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
98
- wxo_agentic_evaluation/utils/utils.py,sha256=8PUpmOoPrEG5xBDOWMsaKanYsnZV5-UZWQa7x8P-J2g,11634
99
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA,sha256=SRO-KH4zJYQhHMhyhDIqrkeoELwrDnTvYbwcIZT9i9w,1435
100
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
101
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
102
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD,,