PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.0.0__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (46) hide show

ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA +322 -0
ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD +46 -0
ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL +5 -0
ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE +22 -0
ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt +1 -0
wxo_agentic_evaluation/__init__.py +0 -0
wxo_agentic_evaluation/analytics/tools/analyzer.py +405 -0
wxo_agentic_evaluation/analytics/tools/main.py +163 -0
wxo_agentic_evaluation/analytics/tools/types.py +130 -0
wxo_agentic_evaluation/analytics/tools/ux.py +428 -0
wxo_agentic_evaluation/analyze_run.py +123 -0
wxo_agentic_evaluation/annotate.py +40 -0
wxo_agentic_evaluation/arg_configs.py +78 -0
wxo_agentic_evaluation/batch_annotate.py +181 -0
wxo_agentic_evaluation/data_annotator.py +253 -0
wxo_agentic_evaluation/evaluation_package.py +518 -0
wxo_agentic_evaluation/external_agent/external_validate.py +69 -0
wxo_agentic_evaluation/external_agent/types.py +65 -0
wxo_agentic_evaluation/inference_backend.py +601 -0
wxo_agentic_evaluation/llm_matching.py +39 -0
wxo_agentic_evaluation/llm_rag_eval.py +47 -0
wxo_agentic_evaluation/llm_user.py +38 -0
wxo_agentic_evaluation/main.py +231 -0
wxo_agentic_evaluation/metrics/__init__.py +0 -0
wxo_agentic_evaluation/metrics/llm_as_judge.py +46 -0
wxo_agentic_evaluation/metrics/metrics.py +101 -0
wxo_agentic_evaluation/prompt/__init__.py +0 -0
wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2 +120 -0
wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +51 -0
wxo_agentic_evaluation/prompt/examples/__init__.py +0 -0
wxo_agentic_evaluation/prompt/examples/data_simple.json +93 -0
wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 +59 -0
wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 +75 -0
wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 +20 -0
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +22 -0
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +114 -0
wxo_agentic_evaluation/prompt/template_render.py +90 -0
wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 +11 -0
wxo_agentic_evaluation/prompt/tool_planner.jinja2 +40 -0
wxo_agentic_evaluation/record_chat.py +165 -0
wxo_agentic_evaluation/service_instance.py +179 -0
wxo_agentic_evaluation/tool_planner.py +228 -0
wxo_agentic_evaluation/type.py +176 -0
wxo_agentic_evaluation/utils/__init__.py +6 -0
wxo_agentic_evaluation/utils/utils.py +233 -0
wxo_agentic_evaluation/watsonx_provider.py +175 -0

wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 ADDED Viewed

@@ -0,0 +1,59 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+You are an evaluation agent judging how faithful a claim is to the source material.
+You are only to use the source material when judging a claim. Do not use your own knowledge when evaluating a claim. STICK TO THE SUPPORTING EVIDENCE.
+## Key Evaluation Principles:
+- When evaluating the input against the source material, list the evidence from the source material that supports the claim.
+- Claims don't have to be worded exactly as presented in the source material; however, the claim should convey the same general meaning from the source material. BUT, crucially, claims should not overstate or hyperbolize.
+- A claim can be partially true. Keep this in mind when evaluating the claim.
+- You can ignore different formatting for things like dates (mm/dd/yy vs mm-dd-yy vs mmddyyyy, as just some examples), places (Florida vs. Fl., NYC vs. New York City), and numbers ($210,000 vs 210000.0 vs $21,0000.0).
+- Keep your reasoning brief and to the point. Don't be wordy.
+## Scoring:
+Once you have evaluated the claim against the source material, determine a score from 0 - 1 on how well the claim is supported by the evidence.
+A higher score indicates the claim is well supported by the evidence. A lower score indicates that the claim is less supported.
+You need to provide a reason for your score. Your reasoning should illustrate why you gave the score you did. Ask yourself these questions:
+- Is there a lot of evidence for the claim?
+- If a claim is contradictory to the source material, what pieces of evidence contradict the claim?
+- If a claim is partially supported by the evidence, what parts are supported by the evidence? What parts are not supported?
+## Output:
+Respond in a JSON format with the following fields:
+- evidence: this is a list that contains the evidence from the source material.
+- faithfulness_score: this field contains the score you gave
+- reason: this field contains your justification on why you gave the claim the score you did
+This is an example of a valid JSON output
+{
+    "evidence": [],
+    "reason": "place holder text",
+    "faithfulness_score": 0.5
+}
+DO NOT PROVIDE ADDITIONAL COMMENTARY, EXPLANATIONS, OR OUTPUTS other than what is explicitly required above.
+<|eot_id|>
+---
+<|start_header_id|>user<|end_header_id|>
+Now evaluate the following claim against the source material.
+Claim:
+{{ claim }}
+Supporting Evidence:
+{{ supporting_evidence }}
+Answer:
+<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>

wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 ADDED Viewed

@@ -0,0 +1,75 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+You are an evaluation agent. Your task is to check if ALL keywords appear in the given response.
+- If ALL keywords are present (exactly or as close variants), respond only with: True
+- If ANY keyword is missing, respond only with: False
+- DO NOT explain or list anything. Only return: True or False
+- Only evaluate the keywords and response given to you. Do not generate additional examples.
+<|eot_id|>
+<|start_header_id|>user<|end_header_id|>
+Evaluate the following examples:
+### Example 1
+Keywords:
+successfully
+email
+updated
+Response:
+Your email has been successfully updated.
+Answer:
+True
+### Example 2
+Keywords:
+job code
+Engineering
+50000076
+Response:
+The job code in the system for Engineering is 50000074.
+Answer:
+False
+### Example 3
+Keywords:
+2024-12-25
+2025-02-15
+2025-03-20
+2025-01-10
+Response:
+Team members will be off on the following dates — jsmith: 20241225, 20250215; alee: 20250320, 20250110.
+Answer:
+True
+### Example 4
+Keywords:
+EUR
+85000.0
+Response:
+Your annual compensation is now set to €8,5000.0, paid in EUR.
+Answer:
+True
+---
+### Now, evaluate the following:
+Keywords:
+{{ keywords_text }}
+Response:
+{{ response_text }}
+Answer:
+<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>

wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 ADDED Viewed

@@ -0,0 +1,20 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+Given a text response, generate keywords that capture the main information in the response.
+Capture all specific information in the response.
+Only return a list starting with [ and ending with ].
+No extra commentary<|eot_id|>
+Here are some examples:
+<|start_header_id|>user<|end_header_id|>The effective start date of your latest dental plan is January 1, 2022.<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>["dental plan", "January 1, 2022"]<|eot_id|>
+<|start_header_id|>user<|end_header_id|>Your current compensation details are as follows:\n\n* Currency: NZD\n* Yearly base salary: $102,000.00<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>["compensation", "NZD", "$102,000.00"]<|eot_id|>
+<|start_header_id|>user<|end_header_id|>Your payslip details from January 1, 2018, to January 15, 2018, are as follows:\n\n* Start Date: January 1, 2018\n* End Date: January 15, 2018\n* Currency: USD\n* Wages:\n\t+ Gross: $2466.66\n\t+ Net Pay: $1712.75\n\t+ Taxes: $556.58\n\t+ Other (None): -$197.33\n\t+ Salary: $2466.66<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>["January 1, 2018", "January 15, 2018", "USD", "$2466.66", "$1712.75", "$556.58","-$197.33"]<|eot_id|>
+<|start_header_id|>user<|end_header_id|>I successfully updated your address.<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>[]<|eot_id|>
+<|start_header_id|>user<|end_header_id|>Here are the details for the user 108727:\n| field       | value\n|-------------|------------|\n| first name  | Merton     |\n| last name   | Wells      |\n| nationality | USA        |\n| gender      | M          |\n| country     | USA        |<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>["Merton", "Wells", "USA", "M"]<|eot_id|>
+<|start_header_id|>user<|end_header_id|>{{response}}<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>

wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 ADDED Viewed

@@ -0,0 +1,22 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+The user ALWAYS responds with "END" once it has the information listed in the user story. If asked whether something is correct, the user will respond with YES if it is correct; otherwise the user will respond with No and state what needs to be corrected or added.
+This is the user story:
+{{user_story}}
+{% if user_response_style -%}
+This is the user response style:
+{% for instruction in user_response_style -%}
+- {{instruction}}
+{% endfor -%}
+{% endif -%}
+<|eot_id|>
+{% for message in conversation_history -%}
+<|start_header_id|>{{message.role}}<|end_header_id|>
+{{message.content}}<|eot_id|>
+{% endfor -%}
+<|eot_id|><|start_header_id|>user<|end_header_id|>

wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 ADDED Viewed

@@ -0,0 +1,114 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+You are an evaluation agent specializing in semantic similarity assessment. Your task is to determine whether two texts express the same factual information and intentions, even when presented differently.
+Key evaluation principles:
+1. Focus on whether the core information and outcome is the same
+2. Different phrasings that convey the same result should be considered equivalent
+3. When specific values (IDs, dates, amounts, names) appear in both texts, they must match exactly
+4. Ignore formatting differences in dates (2022-01-01 vs. 1/1/2022 vs 20220101), numbers ($210,000 vs 210000.0 vs $21,0000.0), and IDs
+5. Different levels of detail are acceptable if they don't contradict each other and the primary information remains intact
+6. Reference IDs that are clearly system-generated (like request IDs, confirmation numbers, UUIDs) may vary and should be ignored
+Respond ONLY with:
+- True: if the texts convey the same essential information and outcomes
+- False: if they communicate different factual information or contradict each other
+DO NOT provide explanations or commentary - only respond with "True" or "False"
+<|eot_id|>
+<|start_header_id|>user<|end_header_id|>
+Evaluate the following examples:
+### Example 1
+Expected:
+Your email has been successfully updated.
+Actual:
+You have successfully updated your email.
+Answer:
+True
+### Example 2
+Expected:
+Ontario is a province in Canada.
+Actual:
+Ontario is a province.
+Answer:
+False
+### Example 3
+Expected:
+No payslips found for user with ID 12345.
+Actual:
+You don't have any payslips.
+Answer:
+True
+### Example 4
+Expected:
+Your time off request from 2024-11-01 to 2024-11-01 for TRAVEL has been successfully submitted. The request ID is c705878eb6584e9b910b8db3907a31da.
+Actual:
+Your time off request for TRAVEL on 2024-11-01 has been submitted. The confirmation ID is d805979fc7595f0a021b9ec4018b42eb.
+Answer:
+True
+### Example 5
+Expected:
+Your compensation details are as follows:
+* Currency: USD
+* Yearly base salary: 210000.0
+Actual:
+Your compensation is $210,000 USD per annum.
+Answer:
+True
+### Example 6
+Expected:
+Your visa details are as follows:
+- Country: 44
+- Document Number: DF112345DD
+- Expiration Date: 2022-09-01
+Actual:
+Your visa details are as follows:
+- Country: 46
+- Document Number: DF112345DD
+- Expiration Date: 2022-09-01
+Answer:
+False
+### Example 7
+Expected:
+I successfully updated your personal information.
+Actual:
+I have successfully updated your preferred name to M Wells and starting date is 7/05/2025.
+Answer:
+True
+---
+### Now, evaluate the following:
+Expected:
+{{ expected_text }}
+Actual:
+{{ actual_text }}
+Answer:
+<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>

wxo_agentic_evaluation/prompt/template_render.py ADDED Viewed

@@ -0,0 +1,90 @@
+import jinja2
+from typing import List
+class JinjaTemplateRenderer:
+    def __init__(self, template_path: str):
+        self._template_env = jinja2.Environment(
+            loader=jinja2.BaseLoader(), undefined=jinja2.StrictUndefined
+        )
+        # TODO: make use of config
+        self._template_env.policies["json.dumps_kwargs"] = {"sort_keys": False}
+        with open(template_path, "r") as file:
+            template_str = file.read()
+        self.template_str = template_str
+        self.template = self._template_env.from_string(template_str)
+    def render(self, **kwargs):
+        return self.template.render(**kwargs)
+class LlamaUserTemplateRenderer(JinjaTemplateRenderer):
+    def render(
+        self, user_story: str, user_response_style: List, conversation_history: List
+    ) -> str:
+        return super().render(
+            user_story=user_story,
+            user_response_style=user_response_style,
+            conversation_history=conversation_history,
+        )
+class KeywordMatchingTemplateRenderer(JinjaTemplateRenderer):
+    def render(self, keywords_text: str, response_text: str) -> str:
+        return super().render(keywords_text=keywords_text, response_text=response_text)
+class SemanticMatchingTemplateRenderer(JinjaTemplateRenderer):
+    def render(self, expected_text: str, actual_text: str) -> str:
+        return super().render(expected_text=expected_text, actual_text=actual_text)
+class LlamaKeywordsGenerationTemplateRenderer(JinjaTemplateRenderer):
+    def render(self, response: str) -> str:
+        return super().render(response=response)
+class FaithfulnessTemplateRenderer(JinjaTemplateRenderer):
+    def render(self, claim, retrieval_context):
+        return super().render(claim=claim, supporting_evidence=retrieval_context)
+class AnswerRelevancyTemplateRenderer(JinjaTemplateRenderer):
+    def render(self, question, context, answer):
+        return super().render(question=question, context=context, answer=answer)
+class ToolPlannerTemplateRenderer(JinjaTemplateRenderer):
+    def render(self, user_story: str, agent_name: str, available_tools: str) -> str:
+        return super().render(
+            user_story=user_story,
+            agent_name=agent_name,
+            available_tools=available_tools,
+        )
+class ToolChainAgentTemplateRenderer(JinjaTemplateRenderer):
+    def render(self, tool_call_history: List, available_tools:str) -> str:
+        return super().render(
+            tool_call_history=tool_call_history,
+            available_tools=available_tools,
+        )
+class BatchTestCaseGeneratorTemplateRenderer(JinjaTemplateRenderer):
+    def render(
+        self,
+        agent_name: str,
+        tool_blocks: str,
+        tool_inputs_str: str,
+        story: str,
+        num_variants: int,
+        example_str: str,
+    ) -> str:
+        return super().render(
+            agent_name=agent_name,
+            tool_blocks=tool_blocks,
+            tool_inputs_str=tool_inputs_str,
+            story=story,
+            num_variants=num_variants,
+            example_str=example_str,
+        )

wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 ADDED Viewed

@@ -0,0 +1,11 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+You are trying to make tool calls
+{{ available_tools }}
+<|eot_id|>
+{% for message in tool_call_history -%}
+<|start_header_id|>assistant<|end_header_id|>
+{{message}}<|eot_id|>
+{% endfor -%}
+<|start_header_id|>assistant<|end_header_id|>

wxo_agentic_evaluation/prompt/tool_planner.jinja2 ADDED Viewed

@@ -0,0 +1,40 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+You are a tool-planning assistant for an AI system.
+Your job is to extract and sequence tool calls based on user stories and available tools.
+Rules:
+- Use only the tools listed below.
+- Use only input values explicitly stated or clearly implied in the story.
+- If a tool depends on a prior output, refer to it as "$<key>" (e.g., "$fetch_assignment_id").
+- Do not use index notation like [0], [1], etc. in any tool inputs. Use the full list as-is when multiple values are expected.
+- Output ONLY one valid JSON array.
+- DO NOT include extra text or wrap the output. Just return the JSON list.
+Available Tools:
+{{ available_tools }}
+Example:
+Story: "Your username is nwaters. You want to find out your time-off schedule from: 2025-01-01 to: 2025-12-31."
+[
+  {
+    "tool_name": "fetch_assignment_id",
+    "inputs": {
+      "username": "nwaters"
+    }
+  },
+  {
+    "tool_name": "retrieve_timeoff_schedule",
+    "inputs": {
+      "assignment_id": "$fetch_assignment_id",
+      "start_date": "2025-01-01",
+      "end_date": "2025-12-31"
+    }
+  }
+]
+<|eot_id|>
+<|start_header_id|>user<|end_header_id|>
+Story: "{{ user_story }}"
+<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>

wxo_agentic_evaluation/record_chat.py ADDED Viewed

@@ -0,0 +1,165 @@
+from wxo_agentic_evaluation.type import Message
+from wxo_agentic_evaluation.arg_configs import (
+    ChatRecordingConfig,
+    KeywordsGenerationConfig,
+)
+from wxo_agentic_evaluation.inference_backend import (
+    WXOClient,
+    WXOInferenceBackend,
+    get_wxo_client,
+)
+from wxo_agentic_evaluation.data_annotator import DataAnnotator
+from wxo_agentic_evaluation.utils.utils import is_saas_url
+from wxo_agentic_evaluation.service_instance import tenant_setup
+import json
+import os
+import rich
+from datetime import datetime
+import time
+from typing import List
+from jsonargparse import CLI
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+def get_all_runs(wxo_client: WXOClient):
+    limit = 20  # Maximum allowed limit per request
+    offset = 0
+    all_runs = []
+    if is_saas_url(wxo_client.service_url):
+        path = "v1//orchestrate/runs"
+    else:
+        path = "/orchestrate/runs"
+    initial_response = wxo_client.get(
+        path, {"limit": limit, "offset": 0}
+    ).json()
+    total_runs = initial_response["total"]
+    all_runs.extend(initial_response["data"])
+    while len(all_runs) < total_runs:
+        offset += limit
+        response = wxo_client.get(
+            path, {"limit": limit, "offset": offset}
+        ).json()
+        all_runs.extend(response["data"])
+    # Sort runs by completed_at in descending order (most recent first)
+    # Put runs with no completion time at the end
+    all_runs.sort(
+        key=lambda x: (
+            datetime.strptime(x["completed_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
+            if x.get("completed_at")
+            else datetime.min
+        ),
+        reverse=True,
+    )
+    return all_runs
+def pull_messages_from_thread_id(thread_id: str, wxo_client: WXOClient):
+    inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
+    messages = inference_backend.get_messages(thread_id)
+    return messages
+def annotate_messages(
+    messages: List[Message], keywords_generation_config: KeywordsGenerationConfig
+):
+    annotator = DataAnnotator(
+        messages=messages, keywords_generation_config=keywords_generation_config
+    )
+    return annotator.generate()
+def record_chats(config: ChatRecordingConfig):
+    """Record chats in background mode"""
+    start_time = datetime.utcnow()
+    processed_threads = set()
+    rich.print(
+        f"[green]INFO:[/green] Starting chat recording at {start_time}. Press Ctrl+C to stop."
+    )
+    if config.token is None:
+        token = tenant_setup(config.service_url, config.tenant_name)
+    else:
+        token = config.token
+    wxo_client = get_wxo_client(config.service_url, token)
+    try:
+        while True:
+            all_runs = get_all_runs(wxo_client)
+            seen_threads = set()
+            # Process only new runs that started after our recording began
+            for run in all_runs:
+                thread_id = run.get("thread_id")
+                if thread_id in seen_threads:
+                    continue
+                seen_threads.add(thread_id)
+                started_at = run.get("started_at")
+                if not thread_id or not started_at:
+                    continue
+                try:
+                    started_time = datetime.strptime(
+                        started_at, "%Y-%m-%dT%H:%M:%S.%fZ"
+                    )
+                    if started_time > start_time:
+                        if thread_id not in processed_threads:
+                            os.makedirs(config.output_dir, exist_ok=True)
+                            rich.print(
+                                f"\n[green]INFO:[/green] New recording started at {started_at}"
+                            )
+                            rich.print(
+                                f"[green]INFO:[/green] Messages saved to: {os.path.join(config.output_dir, f'{thread_id}_messages.json')}"
+                            )
+                            rich.print(
+                                f"[green]INFO:[/green] Annotations saved to: {os.path.join(config.output_dir, f'{thread_id}_annotated_data.json')}"
+                            )
+                        processed_threads.add(thread_id)
+                        try:
+                            messages = pull_messages_from_thread_id(
+                                thread_id, wxo_client
+                            )
+                            annotated_data = annotate_messages(
+                                messages, config.keywords_generation_config
+                            )
+                            messages_filename = os.path.join(
+                                config.output_dir, f"{thread_id}_messages.json"
+                            )
+                            annotation_filename = os.path.join(
+                                config.output_dir, f"{thread_id}_annotated_data.json"
+                            )
+                            with open(messages_filename, "w") as f:
+                                json.dump(
+                                    [msg.model_dump() for msg in messages], f, indent=4
+                                )
+                            with open(annotation_filename, "w") as f:
+                                json.dump(annotated_data, f, indent=4)
+                        except Exception as e:
+                            rich.print(
+                                f"[red]ERROR:[/red] Failed to process thread {thread_id}: {str(e)}"
+                            )
+                except (ValueError, TypeError) as e:
+                    rich.print(
+                        f"[yellow]WARNING:[/yellow] Invalid timestamp format for thread {thread_id}: {str(e)}"
+                    )
+            time.sleep(2)  # Poll every 2 seconds
+    except KeyboardInterrupt:
+        rich.print("\n[yellow]Recording stopped by user[/yellow]")
+if __name__ == "__main__":
+    record_chats(CLI(ChatRecordingConfig, as_positional=False))