ibm-watsonx-orchestrate-evaluation-framework 1.0.7__py3-none-any.whl → 1.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (63) hide show
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/METADATA +103 -109
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info/RECORD +96 -0
  3. wxo_agentic_evaluation/analytics/tools/main.py +1 -18
  4. wxo_agentic_evaluation/analyze_run.py +358 -97
  5. wxo_agentic_evaluation/arg_configs.py +28 -1
  6. wxo_agentic_evaluation/description_quality_checker.py +149 -0
  7. wxo_agentic_evaluation/evaluation_package.py +65 -20
  8. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  9. wxo_agentic_evaluation/external_agent/performance_test.py +2 -3
  10. wxo_agentic_evaluation/inference_backend.py +117 -14
  11. wxo_agentic_evaluation/llm_user.py +2 -1
  12. wxo_agentic_evaluation/main.py +5 -0
  13. wxo_agentic_evaluation/metrics/metrics.py +22 -1
  14. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  15. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +9 -1
  16. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  17. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  18. wxo_agentic_evaluation/prompt/template_render.py +34 -3
  19. wxo_agentic_evaluation/quick_eval.py +342 -0
  20. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +113 -0
  21. wxo_agentic_evaluation/red_teaming/attack_generator.py +286 -0
  22. wxo_agentic_evaluation/red_teaming/attack_list.py +96 -0
  23. wxo_agentic_evaluation/red_teaming/attack_runner.py +128 -0
  24. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  25. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  26. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  27. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  28. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +27 -0
  29. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  30. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +237 -0
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +101 -0
  38. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +263 -0
  39. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +455 -0
  40. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +156 -0
  41. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  42. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +547 -0
  43. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  44. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +258 -0
  45. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +333 -0
  46. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +188 -0
  47. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +409 -0
  48. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +42 -0
  49. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  50. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +145 -0
  51. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +116 -0
  52. wxo_agentic_evaluation/service_instance.py +2 -2
  53. wxo_agentic_evaluation/service_provider/watsonx_provider.py +118 -4
  54. wxo_agentic_evaluation/tool_planner.py +3 -1
  55. wxo_agentic_evaluation/type.py +33 -2
  56. wxo_agentic_evaluation/utils/__init__.py +0 -1
  57. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +157 -0
  58. wxo_agentic_evaluation/utils/rich_utils.py +174 -0
  59. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  60. wxo_agentic_evaluation/utils/utils.py +167 -5
  61. ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/RECORD +0 -56
  62. {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/WHEEL +0 -0
  63. {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,174 @@
1
+ from rich.text import Text
2
+ from typing import Optional, List, Any
3
+ import rich
4
+
5
+
6
def pretty_print(content: Any, style: Optional[str] = None):
    """
    Print `content` through `rich`, optionally wrapped in one style tag.

    See https://rich.readthedocs.io/en/stable/appendix/colors.html for valid `style` strings.
    NOTE:
    Rich supports nested [style][/style] markup inside a string.
    Only a single outermost wrapper built from `style` is added here, and only when
    `content` is a string; any other object is handed to `rich.print` untouched.

    :param content: The content to be printed
    :param style: a valid `rich` colour.
    """
    # The markup wrapper is applied only to strings with a non-empty style.
    wrap_in_markup = isinstance(content, str) and bool(style)
    rich.print(f"[{style}]{content}[/{style}]" if wrap_in_markup else content)
24
+
25
+
26
def warn(
    message: str,
    style: Optional[str] = "bold yellow",
    prompt: Optional[str] = "WARNING ⚠️ :",
) -> Text:
    """
    Build a styled warning as a rich `Text` object (nothing is printed).

    :param message: text placed directly after `prompt` (no separator is inserted).
    :param style: style applied to the whole text (default is bold yellow).
    :param prompt: prefix placed before the message.
    :return: `Text` holding prompt + message followed by two newlines.
    """
    body = f"{prompt}{message}\n\n"
    return Text(body, style=style)
33
+
34
+
35
def is_ok(
    message: str,
    style: Optional[str] = "bold green",
    prompt: Optional[str] = "OK ✅ :",
) -> Text:
    """
    Build a styled OK message as a rich `Text` object (nothing is printed).

    :param message: text placed directly after `prompt` (no separator is inserted).
    :param style: style applied to the whole text (default is bold green).
    :param prompt: prefix placed before the message.
    :return: `Text` holding prompt + message followed by two newlines.
    """
    body = f"{prompt}{message}\n\n"
    return Text(body, style=style)
40
+
41
+
42
def print_done(prompt: Optional[str] = "Done ✅", style: Optional[str] = "bold cyan"):
    """
    Announce that a process/routine has finished.

    :param prompt: text to print; default is `"Done ✅"`
    :param style: The style for the text (default is bold cyan).
    """
    pretty_print(prompt, style)
49
+
50
+
51
def print_success(
    message: str,
    style: Optional[str] = "bold green",
    prompt: Optional[str] = "✅ PASSED",
):
    """
    Print a success line formatted as `<prompt> - <message>`.

    :param message: a statement that is printed alongside a PASSED outcome.
    :param style: The style for the text (default is bold green).
    :param prompt: The prompt to display before the message (default is "✅ PASSED").
    """
    line = f"{prompt} - {message}"
    pretty_print(line, style)
63
+
64
+
65
def print_failure(
    message: str,
    style: Optional[str] = "bold red",
    prompt: Optional[str] = "❌ FAILED",
):
    """
    Print a failure line formatted as `<prompt> - <message>`.

    :param message: a statement that is printed alongside a FAILED outcome.
    :param style: The style for the text (default is bold red).
    :param prompt: The prompt to display before the message (default is "❌ FAILED").
    """
    line = f"{prompt} - {message}"
    pretty_print(line, style)
75
+
76
+
77
class IncorrectParameterUtils:
    """
    Helpers that build warning and suggestion messages about bad parameters in tool descriptions.
    They are primarily used to give feedback on incorrect parameter usage by the assistant in `analyze_run`.
    """

    @staticmethod
    def suggest(message: str, style: Optional[str] = "green") -> Text:
        """
        Format a suggestion for improving agent behaviour relating to bad parameter usage.

        :param message: The suggestion message to display.
        :param style: The style for the text (default is green).
        :return: A rich Text object styled as a suggestion; a fixed hint about good
            descriptions is appended after the message.
        """
        hint = (
            "✅ A good description is insightful of the tool's purpose, "
            "and clarifies parameter usage to the assistant."
        )
        return Text(f"💡 {message}\n{hint}\n\n", style=style)

    @staticmethod
    def format_missing_description_message(
        tool_definition_path: str, tool_name: str
    ) -> List[Text]:
        """
        Build the messages shown when a tool has no description in its definition file.

        :param tool_definition_path: path of the file that was searched.
        :param tool_name: name of the tool lacking a description.
        :return: a two-item list: the warning, then the suggestion.
        """
        warning = warn(
            f"Tool description for '{tool_name}' not found in file: '{tool_definition_path}'"
        )
        suggestion = IncorrectParameterUtils.suggest(
            f"Please consider adding a description for '{tool_name}'."
        )
        return [warning, suggestion]

    @staticmethod
    def format_bad_description_message(tool_name: str, tool_desc: str) -> List[Text]:
        """
        Build the messages shown when a tool description looks incomplete or unclear.

        :param tool_name: name of the tool.
        :param tool_desc: the description text; it is stripped before being quoted.
        :return: a two-item list: the warning, then the suggestion.
        """
        warning = warn(
            f"Tool description for '{tool_name}' may be incomplete or unclear: '{tool_desc.strip()}'."
        )
        suggestion = IncorrectParameterUtils.suggest(
            f"Please consider making the description for '{tool_name}' more informative on parameter usage."
        )
        return [warning, suggestion]
121
+
122
+
123
class TestingUtils:
    """
    A collection of formatted messages that can be used in testing workflows.
    """

    @staticmethod
    def print_test_header(
        test_case_count: int,
        test_description: str,
        style: Optional[str] = "bold cyan",
        prompt: Optional[str] = "\n⚙️ Testing",
    ):
        """
        Print a formatted test suite header of the form `<prompt> <count> <description>`.

        :param test_case_count: # of test-cases.
        :param test_description: a short statement explaining what is being examined,
            e.g. "good tool descriptions".
        :param style: The style for the text (default is bold cyan).
        :param prompt: leading text; the default starts with a newline.
        """
        header = f"{prompt} {test_case_count} {test_description}"
        pretty_print(header, style)

    @staticmethod
    def print_error_details(
        expected: List[str], detected: List[str], style: Optional[str] = "bold red"
    ):
        """
        Print the delta for an error such as an assertion mis-match.

        :param expected: the expected outcome.
        :param detected: the actual/observed outcome.
        :param style: The style for the text (default is bold red).
        """
        # Expected is always printed before Detected.
        for line in (f" Expected: {expected}", f" Detected: {detected}"):
            pretty_print(line, style)

    @staticmethod
    def print_failure_summary(
        failed_cases: List[str],
        prompt: Optional[str] = "Failed cases",
        style: Optional[str] = "bold red",
    ):
        """
        Print a summary listing every case that failed the test.
        Nothing is printed when `failed_cases` is empty.

        :param failed_cases: List of failed case names; each one is printed on its own line.
        :param prompt: heading text printed before the count of failures.
        :param style: The style for the text (default is bold red).
        """
        if not failed_cases:
            return

        pretty_print(f"{prompt} ({len(failed_cases)}):", style)
        for failed_case in failed_cases:
            pretty_print(f" - {failed_case}", style)
@@ -0,0 +1,23 @@
1
def lcs_length(x, y):
    """Compute the length of the Longest Common Subsequence (LCS).

    :param x: first sequence (e.g. a list of tokens or a string).
    :param y: second sequence.
    :return: number of elements in the longest subsequence common to both.
    """
    m, n = len(x), len(y)
    # dp[i][j] is the LCS length of x[:i] and y[:j]; row 0 / column 0 are the
    # empty-prefix base cases and stay at zero.
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i, x_item in enumerate(x, start=1):
        for j, y_item in enumerate(y, start=1):
            if x_item == y_item:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    return dp[m][n]


def rouge_l_recall(prediction, reference):
    """Compute ROUGE-L recall. No stemming.

    Both inputs are split on whitespace. Recall is the LCS length of the two
    token lists divided by the number of *reference* tokens, i.e. the share of
    the reference that the prediction recovers in order.

    :param prediction: candidate text.
    :param reference: ground-truth text.
    :return: a float in [0.0, 1.0]; 0.0 when the reference has no tokens.
    """
    pred_tokens = prediction.split()
    ref_tokens = reference.split()

    # Recall is normalised by the reference length. Dividing by the prediction
    # length would yield precision instead.
    if not ref_tokens:
        return 0.0

    return lcs_length(pred_tokens, ref_tokens) / len(ref_tokens)
@@ -5,15 +5,47 @@ from rich.panel import Panel
5
5
  from rich.rule import Rule
6
6
  from rich import box
7
7
  from rich import print
8
+ import re
9
+ from rich.style import Style
8
10
 
9
- from typing import List
11
+ from typing import List, Optional, Union
12
+ import json
13
+ import yaml
14
+ import glob
15
+ import os
10
16
 
11
17
  from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness
12
- from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary
13
- from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore
18
+ from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary, ReferenceLessEvalMetrics
19
+ from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore, Message
14
20
 
15
21
  console = Console()
16
22
 
23
class AttackResultsTable:
    """Rich table with the count and success rate of on-policy and off-policy attacks."""

    def __init__(self, attack_results: dict):
        """
        Build the table from `attack_results`.

        The keys read are `n_on_policy_attacks`, `n_off_policy_attacks`,
        `n_on_policy_successful` and `n_off_policy_successful`; a missing key counts as 0.
        """
        self.table = Table(
            title="Attack Results",
            box=box.ROUNDED,
            show_lines=True,
        )
        for heading, colour in (
            ("Attack Category", "magenta"),
            ("Count", "cyan"),
            ("Success Rate", "green"),
        ):
            self.table.add_column(heading, style=colour)

        rows = (
            ("On Policy", "n_on_policy_attacks", "n_on_policy_successful"),
            ("Off Policy", "n_off_policy_attacks", "n_off_policy_successful"),
        )
        for label, total_key, success_key in rows:
            total = attack_results.get(total_key, 0)
            successful = attack_results.get(success_key, 0)
            self.table.add_row(label, str(total), self._success_rate(successful, total))

    @staticmethod
    def _success_rate(successful, total) -> str:
        """Return the success rate as a whole-number percentage string ("0%" when there were no attacks)."""
        if not total:
            return "0%"
        return f"{round(100 * safe_divide(successful, total))}%"

    def print(self):
        """Render the table on the module-level console."""
        console.print(self.table)
17
49
 
18
50
  class AgentMetricsTable:
19
51
  def __init__(self, data):
@@ -70,8 +102,22 @@ def is_ibm_cloud_url(service_url: str) -> bool:
70
102
  return ".cloud.ibm.com" in hostname
71
103
 
72
104
 
73
- def add_line_seperator():
74
- console.print(Rule(style="grey42"))
105
def add_line_seperator(
    style_config: Optional[Union[str, Style]] = None,
):
    """
    Print a horizontal rule on the module-level console.

    :param style_config: style for the rule; any falsy value (e.g. `None` or an
        empty string) falls back to "grey42".
    """
    rule_style = style_config or "grey42"
    console.print(Rule(style=rule_style))
75
121
 
76
122
 
77
123
  class FaithfulnessTable:
@@ -183,3 +229,119 @@ class SummaryPanel:
183
229
 
184
230
  def print(self):
185
231
  console.print(self.table)
232
+
233
+
234
class Tokenizer:
    """Regex-based word tokenizer that separates contractions and discards punctuation."""

    # Verbose-mode pattern: alternatives are tried left to right at each position.
    PATTERN = r"""
        \w+(?=n't)|     # Words before n't contractions (e.g., "do" in "don't")
        n't|            # n't contractions themselves
        \w+(?=')|       # Words before apostrophes (e.g., "I" in "I'm")
        '|              # Apostrophes as separate tokens
        \w+|            # Regular words (letters, numbers, underscores)
        [^\w\s]         # Punctuation marks (anything that's not word chars or whitespace)
    """

    def __init__(self):
        flags = re.VERBOSE | re.IGNORECASE
        self.compiled_pattern = re.compile(self.PATTERN, flags)

    def __call__(self, text: str) -> List[str]:
        """
        Split `text` into tokens, breaking on punctuation and handling contractions.

        Args:
            text: Input text to tokenize.

        Returns:
            List of tokenized words (lowercase, no punctuation).

        Examples:
            - "I'm fine" -> ['i', 'm', 'fine']
            - "don't go" -> ['do', "n't", 'go']
            - "Hello, world!" -> ['hello', 'world']
        """
        raw_tokens = self.compiled_pattern.findall(text)
        return self._clean_tokens(raw_tokens)

    def _clean_tokens(self, raw_tokens: List[str]) -> List[str]:
        """
        Post-process raw tokens: lowercase them and drop blanks and lone symbols.

        Args:
            raw_tokens: list of tokens extracted from a message.

        Returns:
            The surviving tokens, lowercased, in their original order.
        """
        cleaned: List[str] = []
        for token in raw_tokens:
            if not token.strip():
                continue
            # A single non-alphanumeric character (punctuation, apostrophe, "_") is discarded.
            if len(token) != 1 or token.isalnum():
                cleaned.append(token.lower())
        return cleaned
288
+
289
+
290
class ReferencelessEvalPanel:
    """Rich table summarising quick-evaluation tool-call metrics, one row per dataset."""

    def __init__(self, referenceless_metrics: List[ReferenceLessEvalMetrics]):
        """
        Build the summary table.

        :param referenceless_metrics: metrics objects; each contributes one row made of
            its dataset name and its four tool-call counters, all converted with `str`.
        """
        self.table = Table(
            title="Quick Evaluation Summary Metrics",
            box=box.ROUNDED,
            show_lines=True,
        )

        columns = (
            ("Dataset", "yellow"),
            ("Tool Calls", "deep_sky_blue1"),
            ("Successful Tool Calls", "magenta"),
            ("Tool Calls Failed due to Schema Mismatch", "deep_sky_blue1"),
            ("Tool Calls Failed due to Hallucination", "magenta"),
        )
        for heading, colour in columns:
            self.table.add_column(heading, style=colour, justify="center")

        for metric in referenceless_metrics:
            cells = (
                metric.dataset_name,
                metric.number_of_tool_calls,
                metric.number_of_successful_tool_calls,
                metric.number_of_static_failed_tool_calls,
                metric.number_of_semantic_failed_tool_calls,
            )
            self.table.add_row(*(str(cell) for cell in cells))

    def print(self):
        """Render the table on the module-level console."""
        console.print(self.table)
315
+
316
+ # Function to load messages from JSON file
317
def load_messages(file_path):
    """
    Load a JSON file and validate every entry into a `Message`.

    :param file_path: path of the JSON file; it is iterated entry by entry, so it is
        presumably a top-level array of message objects (confirm against callers).
    :return: list of `Message` objects, or `None` when parsing or validation fails.
    :raises OSError: if the file cannot be opened (the open call is outside the `try`).
    """
    # JSON is UTF-8 by specification; do not rely on the platform's locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        try:
            return [Message.model_validate(msg) for msg in json.load(f)]
        except Exception as e:
            # Deliberate best-effort: report the offending file and the error,
            # then signal failure with None instead of raising.
            print(file_path)
            print(e)
            return None
331
+
332
+
333
def load_agents(agents_path: str):
    """
    Load every agent definition found directly inside `agents_path`.

    Files matching `*.json` are parsed with `json.load` and files matching `*.yaml`
    with `yaml.safe_load`. All JSON results come first, then all YAML results.

    :param agents_path: directory to search (not recursive).
    :return: list of the parsed documents; empty when no file matches.
    """
    agents_json = glob.glob(os.path.join(agents_path, "*.json"))
    agents_yaml = glob.glob(os.path.join(agents_path, "*.yaml"))

    agents = []

    # Read as UTF-8 explicitly so non-ASCII content does not depend on the
    # platform's locale encoding.
    for agent_path in agents_json:
        with open(agent_path, "r", encoding="utf-8") as f:
            agents.append(json.load(f))

    for agent_path in agents_yaml:
        with open(agent_path, "r", encoding="utf-8") as f:
            agents.append(yaml.safe_load(f))

    return agents
@@ -1,56 +0,0 @@
1
- wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- wxo_agentic_evaluation/analyze_run.py,sha256=C4HowEukNMM-H8FkRcHRqkiNYIQVCoTKbBLiqr1cFRM,4332
3
- wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
4
- wxo_agentic_evaluation/arg_configs.py,sha256=Nc-Z9hG5ZgHAJIdLqUDv-Ct7Wkxvs_VGy-A3JwkC-PI,2265
5
- wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
6
- wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
7
- wxo_agentic_evaluation/evaluation_package.py,sha256=jOSe-TCJdAWCk1sWpRYfi_EMkZERrVf5swm-bxfozzc,21333
8
- wxo_agentic_evaluation/inference_backend.py,sha256=fhEB1kaNN-A08RtJglBiv3QL_8nq8m-g7xbF4WbHAvU,25691
9
- wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
10
- wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
11
- wxo_agentic_evaluation/llm_user.py,sha256=0zSsyEM7pYQtLcfbnu0gEIkosHDwntOZY84Ito6__SM,1407
12
- wxo_agentic_evaluation/main.py,sha256=tRXVle2o1JhwJZOTpqdsOzBOpxPYxAH5ziZkbCmzfyU,11470
13
- wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
14
- wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
15
- wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
16
- wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
17
- wxo_agentic_evaluation/tool_planner.py,sha256=JW5o0VYaaUorB3FBcrwLzgG3-iqEWrqjVhh82u7x8YM,12960
18
- wxo_agentic_evaluation/type.py,sha256=uVKim70XgPW-3L7Z0yRO07wAH9xa-NcjfaiIyPhYMR0,3413
19
- wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
20
- wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
21
- wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
22
- wxo_agentic_evaluation/analytics/tools/ux.py,sha256=EaWNvsq68X_i2H4pQ2fABtXEEmk3ZXqaMrTs42_7MwE,18347
23
- wxo_agentic_evaluation/external_agent/__init__.py,sha256=LY3gMNzfIEwjpQkx5_2iZFHGQiUL4ymEkKL1dc2uKq4,1491
24
- wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xW8tqPcm8JYvveSxf-oFCajvF5J8ORaK23YXu-LuFmc,4142
25
- wxo_agentic_evaluation/external_agent/performance_test.py,sha256=bCXUsW0OeUzwfSSYObgfAmEU5vARkD-PblYU-mU9aPY,2507
26
- wxo_agentic_evaluation/external_agent/types.py,sha256=4kfWD_ZyGZmpbib33gCxEuKS4HLb7CEtferlQgQe7uk,1624
27
- wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
- wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
29
- wxo_agentic_evaluation/metrics/metrics.py,sha256=9O2m6T2iW-PMjGrTdMbOHP2Pr4RN0NwbEp6YgFpTi3I,5572
30
- wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
- wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
32
- wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
33
- wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
34
- wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
35
- wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
36
- wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
37
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=nDfCD0o9cRYmsgIjzD-RZNQxotlvuqrzdsZIY-vT794,684
38
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
39
- wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
40
- wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
41
- wxo_agentic_evaluation/prompt/template_render.py,sha256=FVH5ew2TofC5LGqQzqNj90unrxooUZv_5XxJzVdz8uM,3563
42
- wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
43
- wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
44
- wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
- wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
46
- wxo_agentic_evaluation/service_provider/__init__.py,sha256=EaY4jjKp58M3W8N3b3a8PNC2S81xA7YV2_QkTIy9DfI,1600
47
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=Y36Ryv4nPG8RdVP_zsQsRlEWv8F_hGi7-wOppWPQTwc,4026
48
- wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
49
- wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
50
- wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=iKVkWs4PRTM_S0TIdPgQ9NFQWPlDvcEvuHpQlIPzO10,6216
51
- wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
52
- wxo_agentic_evaluation/utils/utils.py,sha256=JYZQZ-OBy43gAWg9S7duJi9StRApGJATs2JUsW1l30M,6057
53
- ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/METADATA,sha256=wz60je0UK3ogKLH9qiDLS808j57cfWOosONyCuQR95g,18051
54
- ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
55
- ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
56
- ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/RECORD,,