ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (61)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +8 -2
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation_package.py +114 -70
  14. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  15. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  16. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  17. wxo_agentic_evaluation/external_agent/types.py +12 -5
  18. wxo_agentic_evaluation/inference_backend.py +158 -73
  19. wxo_agentic_evaluation/llm_matching.py +4 -3
  20. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  21. wxo_agentic_evaluation/llm_user.py +7 -3
  22. wxo_agentic_evaluation/main.py +175 -67
  23. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  24. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  25. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  26. wxo_agentic_evaluation/quick_eval.py +49 -23
  27. wxo_agentic_evaluation/record_chat.py +70 -33
  28. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  29. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  30. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  38. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  39. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  40. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  41. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  42. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  43. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  44. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  45. wxo_agentic_evaluation/resource_map.py +2 -1
  46. wxo_agentic_evaluation/service_instance.py +24 -11
  47. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  48. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
  49. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  50. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  51. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  52. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  53. wxo_agentic_evaluation/tool_planner.py +128 -44
  54. wxo_agentic_evaluation/type.py +12 -9
  55. wxo_agentic_evaluation/utils/__init__.py +1 -0
  56. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  57. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  58. wxo_agentic_evaluation/utils/utils.py +83 -52
  59. ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
  60. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
  61. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/batch_annotate.py +78 -25

@@ -1,22 +1,28 @@
-import json
 import ast
 import csv
+import json
 import os
 from pathlib import Path
+
 from jsonargparse import CLI
 
-from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.prompt.template_render import BatchTestCaseGeneratorTemplateRenderer
-from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
 from wxo_agentic_evaluation import __file__
+from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
+from wxo_agentic_evaluation.prompt.template_render import (
+    BatchTestCaseGeneratorTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
 
 root_dir = os.path.dirname(__file__)
-BATCH_TEST_CASE_GENERATOR_PROMPT_PATH = os.path.join(root_dir, "prompt", "batch_testcase_prompt.jinja2")
+BATCH_TEST_CASE_GENERATOR_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "batch_testcase_prompt.jinja2"
+)
 EXAMPLE_PATH = os.path.join(root_dir, "prompt", "examples", "data_simple.json")
 
 
-def parse_tools_with_filter(agent_name: str, tools_path: Path, allowed_tool_names: list[str]) -> tuple[
-    dict, list[dict]]:
+def parse_tools_with_filter(
+    agent_name: str, tools_path: Path, allowed_tool_names: list[str]
+) -> tuple[dict, list[dict]]:
     if not allowed_tool_names:
         raise ValueError("Allowed tool list cannot be empty.")
 
@@ -29,7 +35,9 @@ def parse_tools_with_filter(agent_name: str, tools_path: Path, allowed_tool_name
     elif tools_path.is_dir():
         files_to_parse.extend(tools_path.glob("**/*.py"))
     else:
-        raise ValueError(f"Tools path {tools_path} is neither a file nor directory")
+        raise ValueError(
+            f"Tools path {tools_path} is neither a file nor directory"
+        )
 
     for file_path in files_to_parse:
         try:
@@ -41,21 +49,29 @@ def parse_tools_with_filter(agent_name: str, tools_path: Path, allowed_tool_name
             # Process only module-level functions
             for node in parsed_code.body:
                 if isinstance(node, ast.FunctionDef):
-                    tool_data.append({
-                        "Function Name": node.name,
-                        "Arguments": [arg.arg for arg in node.args.args],
-                        "Docstring": ast.get_docstring(node)
-                    })
+                    tool_data.append(
+                        {
+                            "Function Name": node.name,
+                            "Arguments": [arg.arg for arg in node.args.args],
+                            "Docstring": ast.get_docstring(node),
+                        }
+                    )
 
         except Exception as e:
            print(f"Warning: Failed to parse {file_path}: {str(e)}")
            continue
 
     # Filter tools based on allowed names
-    filtered_tools = [tool for tool in tool_data if tool["Function Name"] in allowed_tool_names]
+    filtered_tools = [
+        tool
+        for tool in tool_data
+        if tool["Function Name"] in allowed_tool_names
+    ]
 
     if not filtered_tools:
-        print(f"Warning: No matching tools found. Available tools: {[t['Function Name'] for t in tool_data]}")
+        print(
+            f"Warning: No matching tools found. Available tools: {[t['Function Name'] for t in tool_data]}"
+        )
 
     return {"name": agent_name}, filtered_tools
 
@@ -75,8 +91,17 @@ def load_example(example_path: Path):
 
 
 # Step 4: Prompt builder for N test cases from a given story
-def build_prompt_for_story(agent, tools, tool_inputs, example_case: dict, story: str, num_variants: int = 2):
-    renderer = BatchTestCaseGeneratorTemplateRenderer(BATCH_TEST_CASE_GENERATOR_PROMPT_PATH)
+def build_prompt_for_story(
+    agent,
+    tools,
+    tool_inputs,
+    example_case: dict,
+    story: str,
+    num_variants: int = 2,
+):
+    renderer = BatchTestCaseGeneratorTemplateRenderer(
+        BATCH_TEST_CASE_GENERATOR_PROMPT_PATH
+    )
 
     tool_blocks = "\n".join(
         f"- Tool: {t['Function Name']}\n Description: {t['Docstring']}\n Args: {', '.join(t['Arguments']) or 'None'}"
@@ -93,13 +118,23 @@ def build_prompt_for_story(agent, tools, tool_inputs, example_case: dict, story:
     )
     return prompt
 
+
 # Step 5: Send prompt to LLM and save test cases
-def generate_multiple_in_one(prompt, output_dir, starting_index, model_id="meta-llama/llama-3-405b-instruct", ):
+def generate_multiple_in_one(
+    prompt,
+    output_dir,
+    starting_index,
+    model_id="meta-llama/llama-3-405b-instruct",
+):
     output_dir.mkdir(parents=True, exist_ok=True)
 
     provider = get_provider(
         model_id=model_id,
-        params={"min_new_tokens": 50, "decoding_method": "greedy", "max_new_tokens": 3000},
+        params={
+            "min_new_tokens": 50,
+            "decoding_method": "greedy",
+            "max_new_tokens": 3000,
+        },
     )
 
     response = provider.query(prompt)
@@ -124,8 +159,19 @@ def generate_multiple_in_one(prompt, output_dir, starting_index, model_id="meta-
         print("Raw text:\n", raw_text)
         print("Error:", str(e))
 
-def generate_test_cases_from_stories(agent_name: str, stories: list[str], tools_path: Path, snapshot_path: Path, output_dir: Path, allowed_tools: list[str], num_variants: int = 2):
-    agent, tools = parse_tools_with_filter(agent_name, tools_path, allowed_tools)
+
+def generate_test_cases_from_stories(
+    agent_name: str,
+    stories: list[str],
+    tools_path: Path,
+    snapshot_path: Path,
+    output_dir: Path,
+    allowed_tools: list[str],
+    num_variants: int = 2,
+):
+    agent, tools = parse_tools_with_filter(
+        agent_name, tools_path, allowed_tools
+    )
     tool_inputs = extract_inputs_from_snapshot(snapshot_path)
     example_json = load_example(Path(EXAMPLE_PATH))
 
@@ -134,23 +180,29 @@ def generate_test_cases_from_stories(agent_name: str, stories: list[str], tools_
         print(f"\n Generating test cases for story {idx}: {story}")
 
         prompt = build_prompt_for_story(
-            agent, tools, tool_inputs, example_json, story, num_variants=num_variants
+            agent,
+            tools,
+            tool_inputs,
+            example_json,
+            story,
+            num_variants=num_variants,
         )
 
         generate_multiple_in_one(
            prompt=prompt,
            output_dir=output_dir,
-           starting_index=test_case_counter
+           starting_index=test_case_counter,
        )
 
         test_case_counter += num_variants
 
+
 def main(config: BatchAnnotateConfig):
     stories_path = Path(config.stories_path)
 
     stories = []
     agent_name = None
-    with stories_path.open("r", encoding="utf-8", newline='') as f:
+    with stories_path.open("r", encoding="utf-8", newline="") as f:
         csv_reader = csv.DictReader(f)
         for row in csv_reader:
             stories.append(row["story"])
@@ -168,8 +220,9 @@ def main(config: BatchAnnotateConfig):
         snapshot_path,
         output_dir,
         config.allowed_tools,
-        num_variants=config.num_variants
+        num_variants=config.num_variants,
     )
 
+
 if __name__ == "__main__":
     main(CLI(BatchAnnotateConfig, as_positional=False))
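
For orientation, the expanded signature of generate_test_cases_from_stories diffed above can be driven roughly as follows. This is a minimal sketch based only on the signatures visible in these hunks; the agent name, paths, story text, and tool names are placeholder assumptions, not values shipped with the package.

# Hypothetical driver for the batch test-case generation flow diffed above.
# All concrete values below (agent name, paths, tool names) are placeholders.
from pathlib import Path

from wxo_agentic_evaluation.batch_annotate import generate_test_cases_from_stories

generate_test_cases_from_stories(
    agent_name="hr_agent",
    stories=["An employee asks how many vacation days they have left."],
    tools_path=Path("tools/"),                      # a .py file or a directory of .py tools
    snapshot_path=Path("snapshots/hr_agent.json"),
    output_dir=Path("generated_test_cases/"),
    allowed_tools=["get_vacation_balance"],         # must be non-empty, see parse_tools_with_filter
    num_variants=2,
)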
wxo_agentic_evaluation/data_annotator.py +18 -13

@@ -1,16 +1,16 @@
-from wxo_agentic_evaluation.type import Message, EvaluationData
-from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
-from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.prompt.template_render import (
-    LlamaKeywordsGenerationTemplateRenderer,
-)
-from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
-
 import ast
-import json
 import collections
+import json
 from typing import Dict, List, Optional
 
+from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaKeywordsGenerationTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.type import EvaluationData, Message
+
 ERROR_KEYWORDS = [
     "error",
     "erroneous",
@@ -143,7 +143,9 @@ class DataAnnotator:
         )
         return wrong_tool_response_id
 
-    def _process_tool_call_order(self, wrong_tool_response_id: list[str]) -> list[str]:
+    def _process_tool_call_order(
+        self, wrong_tool_response_id: list[str]
+    ) -> list[str]:
         """Process and order tool calls, skipping failed ones"""
         # gather all call ids that actually got a response
         valid_call_ids = {
@@ -230,7 +232,11 @@ class DataAnnotator:
             if message.role == "assistant":
                 provider = get_provider(
                     model_id=self.keywords_generation_config.model_id,
-                    params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 256},
+                    params={
+                        "min_new_tokens": 0,
+                        "decoding_method": "greedy",
+                        "max_new_tokens": 256,
+                    },
                 )
                 kw_generator = KeywordsGenerationLLM(
                     provider=provider,
@@ -247,14 +253,13 @@ class DataAnnotator:
                }
                goal_details.append(summarize_step)
                break
-
+
         if previous is None:
             goals["summarize"] = []
         elif summarize_step is None:
             goals[previous] = []
         else:
             goals[previous] = ["summarize"]
-
 
     def generate(self) -> Dict:
         """Generate the final dataset"""
wxo_agentic_evaluation/description_quality_checker.py +20 -14

@@ -1,15 +1,18 @@
 import os
+from enum import Enum
 from pathlib import Path
 from typing import List
+
 import rich
-from enum import Enum
 
+from wxo_agentic_evaluation.prompt.template_render import (
+    BadToolDescriptionRenderer,
+)
 from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.prompt.template_render import BadToolDescriptionRenderer
 from wxo_agentic_evaluation.tool_planner import (
-    parse_json_string,
-    extract_tool_signatures,
     MISSING_DOCSTRING_PROMPT,
+    extract_tool_signatures,
+    parse_json_string,
 )
 from wxo_agentic_evaluation.type import ToolDefinition
 from wxo_agentic_evaluation.utils.utils import safe_divide
@@ -34,11 +37,11 @@ class ToolDescriptionIssue(Enum):
 
 
 class DescriptionQualityInspector:
-    DEFAULT_CLASSIFICATION_THRESHOLD = (
-        40.0  # 2/5 issues detected. A higher score indicates a worse description.
-    )
+    DEFAULT_CLASSIFICATION_THRESHOLD = 40.0  # 2/5 issues detected. A higher score indicates a worse description.
     CLASSIFICATION_SCORE_THRESHOLD = float(
-        os.getenv("CLASSIFICATION_SCORE_THRESHOLD", DEFAULT_CLASSIFICATION_THRESHOLD)
+        os.getenv(
+            "CLASSIFICATION_SCORE_THRESHOLD", DEFAULT_CLASSIFICATION_THRESHOLD
+        )
     )
 
     LLM_MODEL_ID = "meta-llama/llama-3-2-90b-vision-instruct"
@@ -67,9 +70,7 @@ class DescriptionQualityInspector:
         self.template = BadToolDescriptionRenderer(
             self.BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH
         )
-        self.cached_response = (
-            None  # this is used in the unit-tests for nuanced analysis of the response.
-        )
+        self.cached_response = None  # this is used in the unit-tests for nuanced analysis of the response.
 
     @staticmethod
     def extract_tool_desc_from_tool_source(
@@ -96,7 +97,8 @@ class DescriptionQualityInspector:
                 tool_name=tool_name,
                 tool_description=(
                     tool_data["Docstring"]
-                    if tool_data["Docstring"] != MISSING_DOCSTRING_PROMPT
+                    if tool_data["Docstring"]
+                    != MISSING_DOCSTRING_PROMPT
                     else None
                 ),
                 tool_params=tool_data["Arguments"],
@@ -131,7 +133,9 @@ class DescriptionQualityInspector:
            return False  # likely some unexpected parsing issue, in this case - flags description as good.
 
        # calculate weighted score
-       final_description_score = self._calculate_score(response_data=response_data)
+       final_description_score = self._calculate_score(
+           response_data=response_data
+       )
 
        return final_description_score >= self.CLASSIFICATION_SCORE_THRESHOLD
 
@@ -146,4 +150,6 @@
            for issue in ToolDescriptionIssue
            if response_data.get(issue.value, "FALSE").upper() == "TRUE"
        )
-       return safe_divide(detected_issues, self.WORST_POSSIBLE_EVAL_OUTCOME) * 100
+       return (
+           safe_divide(detected_issues, self.WORST_POSSIBLE_EVAL_OUTCOME) * 100
+       )
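
The reformatted return in _calculate_score keeps the same arithmetic: the number of detected issues divided by the worst possible outcome, scaled to a percentage, and compared against CLASSIFICATION_SCORE_THRESHOLD (40.0 by default, i.e. 2 of 5 issues). The sketch below reproduces that scoring in isolation; WORST_POSSIBLE_EVAL_OUTCOME = 5 and the zero-denominator behaviour of safe_divide are assumptions inferred from the "2/5 issues" comment, not values read from this diff.

# Standalone illustration of the description-quality scoring shown above.
# WORST_POSSIBLE_EVAL_OUTCOME and safe_divide's zero-denominator behaviour
# are assumptions made for this example.
WORST_POSSIBLE_EVAL_OUTCOME = 5
CLASSIFICATION_SCORE_THRESHOLD = 40.0  # 2/5 issues detected


def safe_divide(numerator: float, denominator: float) -> float:
    return numerator / denominator if denominator else 0.0


def is_bad_description(detected_issues: int) -> bool:
    score = safe_divide(detected_issues, WORST_POSSIBLE_EVAL_OUTCOME) * 100
    return score >= CLASSIFICATION_SCORE_THRESHOLD


print(is_bad_description(2))  # True: 40.0 >= 40.0
print(is_bad_description(1))  # False: 20.0 < 40.0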