ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (46):
  1. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA +322 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD +46 -0
  3. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL +5 -0
  4. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE +22 -0
  5. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt +1 -0
  6. wxo_agentic_evaluation/__init__.py +0 -0
  7. wxo_agentic_evaluation/analytics/tools/analyzer.py +405 -0
  8. wxo_agentic_evaluation/analytics/tools/main.py +163 -0
  9. wxo_agentic_evaluation/analytics/tools/types.py +130 -0
  10. wxo_agentic_evaluation/analytics/tools/ux.py +428 -0
  11. wxo_agentic_evaluation/analyze_run.py +123 -0
  12. wxo_agentic_evaluation/annotate.py +40 -0
  13. wxo_agentic_evaluation/arg_configs.py +78 -0
  14. wxo_agentic_evaluation/batch_annotate.py +181 -0
  15. wxo_agentic_evaluation/data_annotator.py +253 -0
  16. wxo_agentic_evaluation/evaluation_package.py +518 -0
  17. wxo_agentic_evaluation/external_agent/external_validate.py +69 -0
  18. wxo_agentic_evaluation/external_agent/types.py +65 -0
  19. wxo_agentic_evaluation/inference_backend.py +601 -0
  20. wxo_agentic_evaluation/llm_matching.py +39 -0
  21. wxo_agentic_evaluation/llm_rag_eval.py +47 -0
  22. wxo_agentic_evaluation/llm_user.py +38 -0
  23. wxo_agentic_evaluation/main.py +231 -0
  24. wxo_agentic_evaluation/metrics/__init__.py +0 -0
  25. wxo_agentic_evaluation/metrics/llm_as_judge.py +46 -0
  26. wxo_agentic_evaluation/metrics/metrics.py +101 -0
  27. wxo_agentic_evaluation/prompt/__init__.py +0 -0
  28. wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2 +120 -0
  29. wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +51 -0
  30. wxo_agentic_evaluation/prompt/examples/__init__.py +0 -0
  31. wxo_agentic_evaluation/prompt/examples/data_simple.json +93 -0
  32. wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 +59 -0
  33. wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 +75 -0
  34. wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 +20 -0
  35. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +22 -0
  36. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +114 -0
  37. wxo_agentic_evaluation/prompt/template_render.py +90 -0
  38. wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 +11 -0
  39. wxo_agentic_evaluation/prompt/tool_planner.jinja2 +40 -0
  40. wxo_agentic_evaluation/record_chat.py +165 -0
  41. wxo_agentic_evaluation/service_instance.py +179 -0
  42. wxo_agentic_evaluation/tool_planner.py +228 -0
  43. wxo_agentic_evaluation/type.py +176 -0
  44. wxo_agentic_evaluation/utils/__init__.py +6 -0
  45. wxo_agentic_evaluation/utils/utils.py +233 -0
  46. wxo_agentic_evaluation/watsonx_provider.py +175 -0
wxo_agentic_evaluation/annotate.py
@@ -0,0 +1,40 @@
+ from wxo_agentic_evaluation.type import Message, EvaluationData
+ from wxo_agentic_evaluation.arg_configs import TestCaseGenerationConfig
+ from wxo_agentic_evaluation.data_annotator import DataAnnotator
+ import json
+ from pprint import pprint
+ from jsonargparse import CLI
+ import os
+
+
+ def main(config: TestCaseGenerationConfig):
+     messages = []
+     with open(config.log_path, "r") as f:
+         data = json.load(f)
+         for entry in data:
+             messages.append(Message.model_validate(entry))
+
+     with open(config.seed_data_path, "r") as f:
+         evaluation_data = EvaluationData(**json.load(f))
+
+     # Generate annotated dataset
+     annotator = DataAnnotator(
+         messages=messages,
+         keywords_generation_config=config.keywords_generation_config,
+         initial_data=evaluation_data,
+     )
+     dataset = annotator.generate()
+
+     # Save dataset to the output directory, named after the seed file
+     filename = config.seed_data_path.split("/")[-1]
+     core_name = filename.split(".")[0]
+     new_filename = f"{core_name}_annotated.json"
+
+     with open(os.path.join(config.output_dir, new_filename), "w") as f:
+         json.dump(dataset, f, indent=4)
+
+     pprint(dataset)
+
+
+ if __name__ == "__main__":
+     main(CLI(TestCaseGenerationConfig, as_positional=False))
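
`annotate.py` is driven entirely by `TestCaseGenerationConfig`, so it can be invoked from Python as well as from the CLI. A minimal sketch, assuming a recorded chat log and a seed test case already exist at the (hypothetical) paths below:

```python
# Hypothetical paths; both files must already exist in the formats
# annotate.py expects: a JSON list of messages, and a seed
# EvaluationData JSON object.
from wxo_agentic_evaluation.annotate import main
from wxo_agentic_evaluation.arg_configs import TestCaseGenerationConfig

config = TestCaseGenerationConfig(
    log_path="logs/chat.json",
    seed_data_path="data/seed.json",
    output_dir="annotated",
)
main(config)  # writes annotated/seed_annotated.json and pprints the dataset
```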
wxo_agentic_evaluation/arg_configs.py
@@ -0,0 +1,78 @@
+ import os
+ from dataclasses import dataclass, field
+ from typing import List, Optional
+ from wxo_agentic_evaluation import __file__
+
+ root_dir = os.path.dirname(__file__)
+ LLAMA_USER_PROMPT_PATH = os.path.join(root_dir, "prompt", "llama_user_prompt.jinja2")
+ KEYWORDS_GENERATION_PROMPT_PATH = os.path.join(root_dir, "prompt", "keywords_generation_prompt.jinja2")
+
+
+ @dataclass
+ class AuthConfig:
+     url: str
+     tenant_name: str = "local"
+     token: Optional[str] = None
+
+
+ @dataclass
+ class LLMUserConfig:
+     model_id: str = field(default="meta-llama/llama-3-405b-instruct")
+     prompt_config: str = field(default=LLAMA_USER_PROMPT_PATH)
+     user_response_style: List[str] = field(default_factory=list)
+
+
+ @dataclass
+ class TestConfig:
+     test_paths: List[str]
+     output_dir: str
+     auth_config: AuthConfig
+     wxo_lite_version: str
+     llm_user_config: LLMUserConfig = field(default_factory=LLMUserConfig)
+     enable_verbose_logging: bool = True
+     enable_manual_user_input: bool = False
+     skip_available_results: bool = False
+     data_annotation_run: bool = False
+     num_workers: int = 2
+
+
+ @dataclass
+ class AnalyzeConfig:
+     data_path: str
+
+
+ @dataclass
+ class KeywordsGenerationConfig:
+     model_id: str = field(default="meta-llama/llama-3-405b-instruct")
+     prompt_config: str = field(default=KEYWORDS_GENERATION_PROMPT_PATH)
+
+
+ @dataclass
+ class TestCaseGenerationConfig:
+     log_path: str
+     seed_data_path: str
+     output_dir: str
+     keywords_generation_config: KeywordsGenerationConfig = field(
+         default_factory=KeywordsGenerationConfig
+     )
+     enable_verbose_logging: bool = True
+
+
+ @dataclass
+ class ChatRecordingConfig:
+     output_dir: str
+     keywords_generation_config: KeywordsGenerationConfig = field(
+         default_factory=KeywordsGenerationConfig
+     )
+     service_url: str = "http://localhost:4321"
+     tenant_name: str = "wxo-dev"
+     token: Optional[str] = None
+
+
+ @dataclass
+ class BatchAnnotateConfig:
+     allowed_tools: List[str]
+     tools_path: str
+     stories_path: str
+     output_dir: str
+     num_variants: int = 2
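
Because each entry point parses one of these dataclasses with jsonargparse, nested fields become dotted CLI flags (for example `--auth_config.url`), and anything left unset falls back to the defaults above. A sketch of programmatic construction, with placeholder values:

```python
from wxo_agentic_evaluation.arg_configs import AuthConfig, TestConfig

# Placeholder values for illustration only.
config = TestConfig(
    test_paths=["tests/annotated"],
    output_dir="results",
    auth_config=AuthConfig(url="http://localhost:4321", token="dummy-token"),
    wxo_lite_version="1.0.0",
)
# llm_user_config falls back to LLMUserConfig(), i.e. the bundled
# llama_user_prompt.jinja2 and meta-llama/llama-3-405b-instruct.
print(config.llm_user_config.model_id)
```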
wxo_agentic_evaluation/batch_annotate.py
@@ -0,0 +1,181 @@
+ import json
+ import ast
+ import csv
+ import os
+ from pathlib import Path
+ from jsonargparse import CLI
+
+ from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
+ from wxo_agentic_evaluation.prompt.template_render import BatchTestCaseGeneratorTemplateRenderer
+ from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
+ from wxo_agentic_evaluation import __file__
+
+ root_dir = os.path.dirname(__file__)
+ BATCH_TEST_CASE_GENERATOR_PROMPT_PATH = os.path.join(root_dir, "prompt", "batch_testcase_prompt.jinja2")
+ EXAMPLE_PATH = os.path.join(root_dir, "prompt", "examples", "data_simple.json")
+
+
+ def parse_tools_with_filter(agent_name: str, tools_path: Path,
+                             allowed_tool_names: list[str]) -> tuple[dict, list[dict]]:
+     if not allowed_tool_names:
+         raise ValueError("Allowed tool list cannot be empty.")
+
+     tool_data = []
+     files_to_parse = []
+
+     # Handle both single file and directory cases
+     if tools_path.is_file():
+         files_to_parse.append(tools_path)
+     elif tools_path.is_dir():
+         files_to_parse.extend(tools_path.glob("**/*.py"))
+     else:
+         raise ValueError(f"Tools path {tools_path} is neither a file nor directory")
+
+     for file_path in files_to_parse:
+         try:
+             with file_path.open("r", encoding="utf-8") as f:
+                 tools_code = f.read()
+
+             parsed_code = ast.parse(tools_code)
+
+             # Process only module-level functions
+             for node in parsed_code.body:
+                 if isinstance(node, ast.FunctionDef):
+                     tool_data.append({
+                         "Function Name": node.name,
+                         "Arguments": [arg.arg for arg in node.args.args],
+                         "Docstring": ast.get_docstring(node)
+                     })
+
+         except Exception as e:
+             print(f"Warning: Failed to parse {file_path}: {str(e)}")
+             continue
+
+     # Filter tools based on allowed names
+     filtered_tools = [tool for tool in tool_data if tool["Function Name"] in allowed_tool_names]
+
+     if not filtered_tools:
+         print(f"Warning: No matching tools found. Available tools: {[t['Function Name'] for t in tool_data]}")
+
+     return {"name": agent_name}, filtered_tools
+
+
+ # Step 2: Extract tool input/output examples from snapshot
+ def extract_inputs_from_snapshot(snapshot_path: Path) -> dict:
+     with snapshot_path.open("r", encoding="utf-8") as f:
+         snapshot = json.load(f)
+     return snapshot.get("input_output_examples", {})
+
+
+ # Step 3: Load a single example test case just for structure
+ def load_example(example_path: Path):
+     with example_path.open("r", encoding="utf-8") as f:
+         data = json.load(f)
+     data.pop("mine_fields", None)
+     return data
+
+
+ # Step 4: Prompt builder for N test cases from a given story
+ def build_prompt_for_story(agent, tools, tool_inputs, example_case: dict, story: str, num_variants: int = 2):
+     renderer = BatchTestCaseGeneratorTemplateRenderer(BATCH_TEST_CASE_GENERATOR_PROMPT_PATH)
+
+     tool_blocks = "\n".join(
+         f"- Tool: {t['Function Name']}\n Description: {t['Docstring']}\n Args: {', '.join(t['Arguments']) or 'None'}"
+         for t in tools
+     )
+
+     prompt = renderer.render(
+         agent_name=agent["name"],
+         tool_blocks=tool_blocks,
+         tool_inputs_str=json.dumps(tool_inputs, indent=2),
+         story=story,
+         num_variants=num_variants,
+         example_str=json.dumps(example_case, indent=2),
+     )
+     return prompt
+
+ # Step 5: Send prompt to LLM and save test cases
+ def generate_multiple_in_one(prompt, output_dir, starting_index, model_id="meta-llama/llama-3-405b-instruct"):
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     provider = WatsonXProvider(
+         model_id=model_id,
+         llm_decode_parameter={
+             "min_new_tokens": 50,
+             "decoding_method": "greedy",
+             "max_new_tokens": 3000
+         }
+     )
+
+     response = provider.query(prompt)
+
+     try:
+         raw_text = response.get("generated_text", "")
+         json_start = raw_text.find("[")
+         json_end = raw_text.rfind("]") + 1
+         json_block = raw_text[json_start:json_end].strip()
+
+         test_cases = json.loads(json_block)
+         assert isinstance(test_cases, list), "Expected list of test cases"
+
+         for i, case in enumerate(test_cases, start=starting_index):
+             case["mine_fields"] = []  # ✅ Add the field here
+             out_file = output_dir / f"synthetic_test_case_{i}.json"
+             with out_file.open("w", encoding="utf-8") as f:
+                 json.dump(case, f, indent=2)
+             print(f"✅ Test case {i} written to {out_file}")
+
+     except Exception as e:
+         print("⚠️ Failed to parse or validate test case output.")
+         print("Raw text:\n", raw_text)
+         print("Error:", str(e))
+
+ def generate_test_cases_from_stories(agent_name: str, stories: list[str], tools_path: Path, snapshot_path: Path, output_dir: Path, allowed_tools: list[str], num_variants: int = 2):
+     agent, tools = parse_tools_with_filter(agent_name, tools_path, allowed_tools)
+     tool_inputs = extract_inputs_from_snapshot(snapshot_path)
+     example_json = load_example(Path(EXAMPLE_PATH))
+
+     test_case_counter = 1
+     for idx, story in enumerate(stories, start=1):
+         print(f"\n Generating test cases for story {idx}: {story}")
+
+         prompt = build_prompt_for_story(
+             agent, tools, tool_inputs, example_json, story, num_variants=num_variants
+         )
+
+         generate_multiple_in_one(
+             prompt=prompt,
+             output_dir=output_dir,
+             starting_index=test_case_counter
+         )
+
+         test_case_counter += num_variants
+
+ def main(config: BatchAnnotateConfig):
+     stories_path = Path(config.stories_path)
+
+     stories = []
+     agent_name = None
+     with stories_path.open("r", encoding="utf-8", newline='') as f:
+         csv_reader = csv.DictReader(f)
+         for row in csv_reader:
+             stories.append(row["story"])
+             if agent_name is None:
+                 agent_name = row["agent"]
+
+     tools_path = Path(config.tools_path)
+     snapshot_path = stories_path.parent / f"{agent_name}_snapshot_llm.json"
+     output_dir = Path(config.output_dir) / f"{agent_name}_test_cases"
+
+     generate_test_cases_from_stories(
+         agent_name,
+         stories,
+         tools_path,
+         snapshot_path,
+         output_dir,
+         config.allowed_tools,
+         num_variants=config.num_variants
+     )
+
+ if __name__ == "__main__":
+     main(CLI(BatchAnnotateConfig, as_positional=False))
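
`main` expects the stories file to be a CSV with at least `agent` and `story` columns, and derives the snapshot and output locations from the agent name. A sketch of a compatible input file, using invented example rows:

```python
import csv
from pathlib import Path

# Invented example data; the columns the reader actually consumes
# are "agent" and "story".
rows = [
    {"agent": "hr_agent", "story": "An employee asks for their remaining vacation days."},
    {"agent": "hr_agent", "story": "An employee updates their direct deposit details."},
]
with Path("stories.csv").open("w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["agent", "story"])
    writer.writeheader()
    writer.writerows(rows)
# batch_annotate.main would then look for hr_agent_snapshot_llm.json next to
# stories.csv and write test cases under <output_dir>/hr_agent_test_cases/.
```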
wxo_agentic_evaluation/data_annotator.py
@@ -0,0 +1,253 @@
+ from wxo_agentic_evaluation.type import Message, EvaluationData
+ from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
+ from wxo_agentic_evaluation.prompt.template_render import (
+     LlamaKeywordsGenerationTemplateRenderer,
+ )
+ from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
+
+ import ast
+ import json
+ import collections
+ from typing import Dict, List, Optional
+
+ ERROR_KEYWORDS = [
+     "error",
+     "erroneous",
+     "exception",
+     "traceback",
+     "failed",
+     "fail",
+     "fatal",
+     "panic",
+     "abort",
+     "not found",
+     "notfound",
+     "cannot",
+     "can't",
+     "unable",
+     "unsuccessful",
+     "invalid",
+     "incorrect",
+     "illegal",
+     "unknown",
+     "unexpected",
+     "unauthorized",
+     "permission denied",
+     "denied",
+     "forbidden",
+     "forbidden request",
+     "unavailable",
+     "unreachable",
+     "missing",
+     "exceeded",
+     "exceeds limit",
+     "timed out",
+     "timeout",
+     "stack trace",
+     "syntax error",
+     "runtime error",
+     "indexerror",
+     "keyerror",
+     "valueerror",
+     "typeerror",
+     "zerodivisionerror",
+     "segmentation fault",
+     "segfault",
+     "core dumped",
+     "memory error",
+     "out of memory",
+     "oom",
+     "overflow",
+     "underflow",
+     "crash",
+     "bad request",
+     "http_code=400",
+     "http_code=401",
+     "http_code=403",
+     "http_code=404",
+     "http_code=405",
+     "http_code=408",
+     "http_code=409",
+     "http_code=429",
+     "http_code=500",
+     "http_code=503",
+     "http_code=504",
+     "connection refused",
+     "connection error",
+     "broken pipe",
+     "bus error",
+     "catastrophic failure",
+     "unresolved",
+     "infinite recursion",
+     "overrun",
+     "overwrite",
+     "no such file or directory",
+     "invalid argument",
+     "server is down",
+     "server error",
+     "sql error",
+     "db error",
+     "database error",
+ ]
+
+
+ class KeywordsGenerationLLM:
+     def __init__(
+         self,
+         wai_client: WatsonXProvider,
+         template: LlamaKeywordsGenerationTemplateRenderer,
+     ):
+         self.wai_client = wai_client
+         self.prompt_template = template
+
+     def generate_keywords(self, response) -> list:
+         prompt = self.prompt_template.render(response=response)
+         res = self.wai_client.query(prompt)
+         keywords = ast.literal_eval(res["generated_text"].strip())
+         return keywords
+
+
+ class DataAnnotator:
+     def __init__(
+         self,
+         messages: List[Message],
+         keywords_generation_config: KeywordsGenerationConfig,
+         initial_data: Optional[EvaluationData] = None,
+     ):
+         self.messages = messages
+         self.keywords_generation_config = keywords_generation_config
+         self.initial_data = initial_data or EvaluationData(
+             agent="",
+             story="",
+             starting_sentence=messages[0].content if messages else "",
+             mine_fields=[],
+             goals={},
+             goal_details=[],
+         )
+
+     @staticmethod
+     def _is_error_in_message(message: str) -> bool:
+         """Heuristic to catch tool calls that fail"""
+         message = message.lower()
+         return any(keyword in message for keyword in ERROR_KEYWORDS)
+
+     def _get_failed_tool_responses(self) -> list[str]:
+         """Get list of IDs for failed tool calls"""
+         wrong_tool_response_id = []
+         for message in self.messages:
+             if message.type == "tool_response":
+                 content = message.content.lower()
+                 if self._is_error_in_message(content):
+                     wrong_tool_response_id.append(
+                         json.loads(message.content)["tool_call_id"]
+                     )
+         return wrong_tool_response_id
+
+     def _process_tool_call_order(self, wrong_tool_response_id: list[str]) -> list[str]:
+         """Process and order tool calls, skipping failed ones"""
+         order = []
+         for message in self.messages:
+             if message.type == "tool_call":
+                 content = json.loads(message.content)
+                 # skip all the tool calls that fail
+                 if (
+                     content.get("tool_call_id", "") in wrong_tool_response_id
+                     or content.get("id", "") in wrong_tool_response_id
+                 ):
+                     continue
+
+                 if "tool_call_id" in content:
+                     del content["tool_call_id"]
+                 if "id" in content:
+                     del content["id"]
+
+                 content = json.dumps(content, sort_keys=True)
+                 # for a given tool call signature (function name + args), only keep the most recent one
+                 if content in order:
+                     idx = order.index(content)
+                     order = order[:idx] + order[idx + 1 :] + [content]
+                 else:
+                     order.append(content)
+         return order
+
+     def _process_tool_calls(self) -> tuple[Dict, List, str]:
+         """Process tool calls and generate goals structure"""
+         # Get failed tool response IDs and process tool calls
+         wrong_tool_response_id = self._get_failed_tool_responses()
+         order = self._process_tool_call_order(wrong_tool_response_id)
+
+         goals = {}
+         goal_details = []
+         function_count = collections.defaultdict(int)
+         previous = None
+
+         for tool_call in order:
+             call = json.loads(tool_call)
+             funct_name = call["name"]
+             function_count[funct_name] += 1
+             goal_name = funct_name + f"-{function_count[funct_name]}"
+
+             if previous:
+                 goals[previous] = [goal_name]
+
+             goal_detail = {
+                 "type": "tool_call",
+                 "name": goal_name,
+                 "tool_name": funct_name,
+                 "args": call["args"],
+             }
+             goal_details.append(goal_detail)
+             previous = goal_name
+
+         return goals, goal_details, previous
+
+     def _process_summarization(
+         self, previous: str, goals: Dict, goal_details: List
+     ) -> None:
+         """Process summarization step"""
+         summarize_step = None
+         # we assume a single summary step at the end
+         for message in self.messages[::-1]:
+             if message.role == "assistant":
+                 wai_client = WatsonXProvider(
+                     model_id=self.keywords_generation_config.model_id,
+                     llm_decode_parameter={
+                         "min_new_tokens": 0,
+                         "decoding_method": "greedy",
+                         "max_new_tokens": 256,
+                     },
+                 )
+                 kw_generator = KeywordsGenerationLLM(
+                     wai_client=wai_client,
+                     template=LlamaKeywordsGenerationTemplateRenderer(
+                         self.keywords_generation_config.prompt_config
+                     ),
+                 )
+                 keywords = kw_generator.generate_keywords(message.content)
+                 summarize_step = {
+                     "name": "summarize",
+                     "type": "text",
+                     "response": message.content,
+                     "keywords": keywords,
+                 }
+                 goal_details.append(summarize_step)
+                 break
+
+         if summarize_step:
+             goals[previous] = ["summarize"]
+         else:
+             goals[previous] = []
+
+     def generate(self) -> Dict:
+         """Generate the final dataset"""
+         goals, goal_details, previous = self._process_tool_calls()
+         self._process_summarization(previous, goals, goal_details)
+
+         return {
+             "agent": self.initial_data.agent,
+             "goals": goals,
+             "goal_details": goal_details,
+             "mine_fields": [],
+             "story": self.initial_data.story,
+             "starting_sentence": self.initial_data.starting_sentence,
+         }
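
Most of the annotation logic is deterministic; only the final summarization step calls watsonx, so the tool-call ordering can be exercised in isolation. A sketch, assuming `Message` (a pydantic model) accepts `role`, `type`, and `content` fields; the helper and call IDs below are hypothetical:

```python
import json

from wxo_agentic_evaluation.type import Message
from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
from wxo_agentic_evaluation.data_annotator import DataAnnotator

def tool_call(call_id: str, name: str, args: dict) -> Message:
    # Hypothetical helper; mirrors the JSON shape _process_tool_call_order reads.
    return Message.model_validate({
        "role": "assistant",
        "type": "tool_call",
        "content": json.dumps({"id": call_id, "name": name, "args": args}),
    })

messages = [
    tool_call("c1", "get_user", {"user": "alice"}),
    tool_call("c2", "get_user", {"user": "alice"}),    # duplicate signature, kept once
    tool_call("c3", "get_balance", {"user": "alice"}),  # pretend this call failed
]
annotator = DataAnnotator(
    messages=messages,
    keywords_generation_config=KeywordsGenerationConfig(),
)
order = annotator._process_tool_call_order(wrong_tool_response_id=["c3"])
print(order)  # a single get_user entry; the failed get_balance call is dropped
```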