ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
- wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
- wxo_agentic_evaluation/analytics/tools/main.py +18 -7
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +69 -48
- wxo_agentic_evaluation/annotate.py +6 -4
- wxo_agentic_evaluation/arg_configs.py +8 -2
- wxo_agentic_evaluation/batch_annotate.py +78 -25
- wxo_agentic_evaluation/data_annotator.py +18 -13
- wxo_agentic_evaluation/description_quality_checker.py +20 -14
- wxo_agentic_evaluation/evaluation_package.py +114 -70
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
- wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
- wxo_agentic_evaluation/external_agent/types.py +12 -5
- wxo_agentic_evaluation/inference_backend.py +158 -73
- wxo_agentic_evaluation/llm_matching.py +4 -3
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_user.py +7 -3
- wxo_agentic_evaluation/main.py +175 -67
- wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
- wxo_agentic_evaluation/metrics/metrics.py +26 -12
- wxo_agentic_evaluation/prompt/template_render.py +32 -11
- wxo_agentic_evaluation/quick_eval.py +49 -23
- wxo_agentic_evaluation/record_chat.py +70 -33
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
- wxo_agentic_evaluation/resource_map.py +2 -1
- wxo_agentic_evaluation/service_instance.py +24 -11
- wxo_agentic_evaluation/service_provider/__init__.py +33 -13
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
- wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
- wxo_agentic_evaluation/service_provider/provider.py +0 -1
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
- wxo_agentic_evaluation/tool_planner.py +128 -44
- wxo_agentic_evaluation/type.py +12 -9
- wxo_agentic_evaluation/utils/__init__.py +1 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
- wxo_agentic_evaluation/utils/rich_utils.py +23 -9
- wxo_agentic_evaluation/utils/utils.py +83 -52
- ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA +0 -385
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/main.py
CHANGED
@@ -1,49 +1,64 @@
-
-
-
-
+import csv
+import dataclasses
+import glob
+import json
+import os
+import traceback
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import List
+
+import rich
+import yaml
+from jsonargparse import CLI
+from rich.progress import Progress
+
+from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
+    WXOInferenceBackend,
     get_wxo_client,
-    WXOInferenceBackend
 )
-from
-from wxo_agentic_evaluation.
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.metrics.metrics import (
+    KnowledgeBaseMetricSummary,
+    TextMatchType,
+    ToolCallAndRoutingMetrics,
+)
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaUserTemplateRenderer,
+)
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.type import EvaluationData
-
-from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.utils import json_dump
 from wxo_agentic_evaluation.utils.utils import (
-    create_table,
     SummaryPanel,
-
+    create_table,
+    safe_divide,
 )
-from wxo_agentic_evaluation.utils import json_dump
-from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary, ToolCallAndRoutingMetrics, TextMatchType
-import os
-import json
-import traceback
-import yaml
-import dataclasses
-import glob
-import rich
-import csv
-from rich.progress import Progress
-from pathlib import Path
-from concurrent.futures import ThreadPoolExecutor
-from jsonargparse import CLI


-def process_test_case(
+def process_test_case(
+    task_n, test_case, config, inference_backend, resource_map, llm_user
+):
     summary_results_for_path = []
     tc_name = os.path.basename(test_case).replace(".json", "")
     with open(test_case, "r") as f:
         test_case: EvaluationData = EvaluationData.model_validate(json.load(f))

     evaluation_controller = EvaluationController(
-        wxo_inference_backend=inference_backend,
+        wxo_inference_backend=inference_backend,
+        llm_user=llm_user,
+        config=config,
     )
     rich.print(f"[bold magenta]Running test case: {tc_name}[/bold magenta]")
-
+    (
+        history,
+        call_tracker,
+        conversational_search_data,
+    ) = evaluation_controller.run(
         task_n,
         test_case.story,
         agent_name=test_case.agent,
@@ -54,7 +69,8 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
         result.append(message.model_dump())

     json_dump(
-        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+        result,
     )

     if len(conversational_search_data) > 0:
@@ -73,7 +89,7 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
         messages=history,
         ground_truth=test_case,
         conversational_search_data=conversational_search_data,
-        resource_map=resource_map
+        resource_map=resource_map,
     )
     (
         keyword_semantic_matches,
@@ -85,7 +101,9 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
     for message in messages_with_reason:
         temp.append(message.model_dump())
     json_dump(
-        os.path.join(
+        os.path.join(
+            config.output_dir, "messages", tc_name + ".messages.analyze.json"
+        ),
         temp,
     )

@@ -108,18 +126,29 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
 def main(config: TestConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
     if config.num_workers > 1 and config.enable_manual_user_input:
-        rich.print(
-
+        rich.print(
+            "[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]"
+        )
+        config.enable_manual_user_input = (
+            False  # disable manual user input for parallel execution
+        )
         # reason: threads continue to stream messages while waiting for user input, which is not desired
         # and the manual input prompt is not labelled properly in the UI
     wxo_client = get_wxo_client(
-        config.auth_config.url,
+        config.auth_config.url,
+        config.auth_config.tenant_name,
+        config.auth_config.token,
     )
     resource_map = ResourceMap(wxo_client)
     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
     llm_user = LLMUser(
-        wai_client=get_provider(
-
+        wai_client=get_provider(
+            config=config.provider_config,
+            model_id=config.llm_user_config.model_id,
+        ),
+        template=LlamaUserTemplateRenderer(
+            config.llm_user_config.prompt_config
+        ),
         user_response_style=config.llm_user_config.user_response_style,
     )

@@ -127,7 +156,9 @@ def main(config: TestConfig):

     results_list = []

-    knowledge_base_output_folder =
+    knowledge_base_output_folder = (
+        Path(config.output_dir) / "knowledge_base_metrics"
+    )
     knowledge_base_output_folder.mkdir(exist_ok=True, parents=True)
     detailed_rag_output_file = (
         knowledge_base_output_folder / "knowledge_base_detailed_metrics.json"
@@ -143,7 +174,9 @@ def main(config: TestConfig):
         [
             os.path.basename(f).replace(".messages", "")
             for f in glob.glob(
-                os.path.join(
+                os.path.join(
+                    config.output_dir, "messages", "*.messages.json"
+                )
             )
         ]
     )
@@ -153,7 +186,7 @@ def main(config: TestConfig):
     if os.path.isdir(test_path):
         test_path = os.path.join(test_path, "*.json")
     test_cases.extend(sorted(glob.glob(test_path)))
-
+
     futures = []
     task_n = 0
     for test_case in test_cases:
@@ -161,7 +194,9 @@ def main(config: TestConfig):
             continue
         if config.skip_available_results:
             if test_case in available_res:
-                print(
+                print(
+                    f"Skipping test case {test_case} as results already exist."
+                )
                 continue

         future = executor.submit(
@@ -180,7 +215,8 @@ def main(config: TestConfig):
     if futures:
         with Progress() as progress:
             task1 = progress.add_task(
-                f"[purple]Evaluating {len(futures)} tasks...",
+                f"[purple]Evaluating {len(futures)} tasks...",
+                total=len(futures),
             )
             for test_case, future in futures:
                 try:
@@ -200,27 +236,55 @@ def main(config: TestConfig):
         SummaryPanel(rag_metric_summary).print()

         with open(detailed_rag_output_file, "w+", encoding="utf-8") as f:
-            json.dump(
+            json.dump(
+                rag_metric_summary.model_dump(by_alias=True)["detailed"],
+                f,
+                indent=4,
+            )

         with open(summary_rag_output_file, "w+", encoding="utf-8") as f:
-            json.dump(
+            json.dump(
+                rag_metric_summary.model_dump(by_alias=True)["summary"], f, indent=4
+            )

     if len(tool_call_metrics) > 0:
         # remove the average row if exist
         tool_call_metrics = [
-            row
+            row
+            for row in tool_call_metrics
+            if row.dataset_name != "Summary (Average)"
         ]

-        def filter_display_only_values(
-
-
-
+        def filter_display_only_values(
+            tool_call_metric: ToolCallAndRoutingMetrics,
+        ):
+            row = {
+                "Dataset": tool_call_metric.dataset_name,
+                "Total Steps": tool_call_metric.total_steps,
+                "LLM Steps": tool_call_metric.llm_step,
+                "Total Tool Calls": tool_call_metric.total_tool_calls,
+                "Tool Call Precision": tool_call_metric.tool_call_precision,
+                "Tool Call Recall": tool_call_metric.tool_call_recall,
+                "Agent Routing Accuracy": tool_call_metric.agent_routing_accuracy,
+                "Text Match": tool_call_metric.text_match,
+                "Journey Success": tool_call_metric.is_success,
+                "Avg Resp Time (sec)": tool_call_metric.avg_resp_time,
+            }
             return row

         def create_avg_row(metrics: List[dict]):
-            avg_row = {
-
-
+            avg_row = {
+                "Dataset": "Summary (Average)",
+                "Total Steps": 0,
+                "LLM Steps": 0,
+                "Total Tool Calls": 0,
+                "Tool Call Precision": 0,
+                "Tool Call Recall": 0,
+                "Agent Routing Accuracy": 0,
+                "Text Match": 0,
+                "Journey Success": 0,
+                "Avg Resp Time (sec)": 0,
+            }
             if metrics:
                 for row in metrics:
                     avg_row["Total Steps"] += row["Total Steps"]
@@ -228,33 +292,77 @@ def main(config: TestConfig):
                     avg_row["Total Tool Calls"] += row["Total Tool Calls"]
                     avg_row["Tool Call Precision"] += row["Tool Call Precision"]
                     avg_row["Tool Call Recall"] += row["Tool Call Recall"]
-                    avg_row["Agent Routing Accuracy"] += row[
-
+                    avg_row["Agent Routing Accuracy"] += row[
+                        "Agent Routing Accuracy"
+                    ]
+                    avg_row["Text Match"] += (
+                        row["Text Match"] == TextMatchType.text_match.value
+                    )
                     avg_row["Journey Success"] += row["Journey Success"]
                     avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]

-                avg_row["Total Steps"] = round(
-
-
-                avg_row["
-
-
-                avg_row["
-
-
+                avg_row["Total Steps"] = round(
+                    safe_divide(avg_row["Total Steps"], len(metrics)), 2
+                )
+                avg_row["LLM Steps"] = round(
+                    safe_divide(avg_row["LLM Steps"], len(metrics)), 2
+                )
+                avg_row["Total Tool Calls"] = round(
+                    safe_divide(avg_row["Total Tool Calls"], len(metrics)), 2
+                )
+                avg_row["Tool Call Precision"] = round(
+                    safe_divide(avg_row["Tool Call Precision"], len(metrics)), 2
+                )
+                avg_row["Tool Call Recall"] = round(
+                    safe_divide(avg_row["Tool Call Recall"], len(metrics)), 2
+                )
+                avg_row["Agent Routing Accuracy"] = round(
+                    safe_divide(
+                        avg_row["Agent Routing Accuracy"], len(metrics)
+                    ),
+                    2,
+                )
+                avg_row["Text Match"] = round(
+                    safe_divide(
+                        avg_row["Text Match"],
+                        len(
+                            [
+                                row
+                                for row in metrics
+                                if row["Text Match"]
+                                != TextMatchType.text_match.na
+                            ]
+                        ),
+                    ),
+                    2,
+                )
+                avg_row["Journey Success"] = round(
+                    safe_divide(avg_row["Journey Success"], len(metrics)), 2
+                )
+                avg_row["Avg Resp Time (sec)"] = round(
+                    safe_divide(avg_row["Avg Resp Time (sec)"], len(metrics)), 2
+                )
             return avg_row

         tool_call_metrics_for_display = []
         for row in tool_call_metrics:
-            tool_call_metrics_for_display.append(
-
-
-
+            tool_call_metrics_for_display.append(
+                filter_display_only_values(row)
+            )
+        tool_call_metrics_for_display.append(
+            create_avg_row(tool_call_metrics_for_display)
+        )
+        tool_call_table_for_display = create_table(
+            tool_call_metrics_for_display
+        )
+
     if tool_call_table_for_display:
         tool_call_table_for_display.print()

     if len(tool_call_metrics) > 0:
-        tool_call_metrics = [
+        tool_call_metrics = [
+            metric.model_dump() for metric in tool_call_metrics
+        ]
         output_file = os.path.join(config.output_dir, "summary_metrics.csv")
         header = list(tool_call_metrics[0].keys())

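The new create_avg_row helper above folds every metric column into a single "Summary (Average)" row by summing the per-dataset values and then dividing with the newly imported safe_divide instead of a bare division. The following is a minimal, self-contained sketch of that pattern; safe_divide here is assumed to fall back to a default when the denominator is zero (the real helper lives in wxo_agentic_evaluation/utils/utils.py and may differ), and the rows are invented for illustration.

# Illustrative sketch only: the real safe_divide ships in
# wxo_agentic_evaluation/utils/utils.py and may behave differently.
def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
    # Divide, falling back to `default` when the denominator is zero.
    return numerator / denominator if denominator else default


def average_column(rows, key):
    # Mirror of the create_avg_row pattern: sum a column, then
    # safe_divide by the number of rows and round to two decimals.
    total = sum(row[key] for row in rows)
    return round(safe_divide(total, len(rows)), 2)


rows = [{"Tool Call Precision": 1.0}, {"Tool Call Precision": 0.5}]
print(average_column(rows, "Tool Call Precision"))  # 0.75
print(average_column([], "Tool Call Precision"))    # 0.0, no ZeroDivisionError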
wxo_agentic_evaluation/metrics/metrics.py
CHANGED
@@ -1,10 +1,13 @@
 import math
-from typing import List, Mapping, Any, Tuple, Optional
 from enum import Enum
+from typing import Any, List, Mapping, Optional, Tuple

 from pydantic import BaseModel, computed_field

-from wxo_agentic_evaluation.metrics.llm_as_judge import
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerRelevancy,
+    Faithfulness,
+)
 from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore


@@ -13,7 +16,7 @@ def average(array):
         return math.nan

     else:
-        return sum(array)/len(array)
+        return sum(array) / len(array)


 class KnowledgeBaseMetrics(BaseModel):
@@ -54,7 +57,9 @@ class KnowledgeBaseMetricSummary(BaseModel):
             }
         else:
             values = groupby[name]
-            values.get("knowledge_base_name").append(
+            values.get("knowledge_base_name").append(
+                knowledge_base_name
+            )
             values.get("faithfulness").append(faithfulness)
             values.get("answer_relevancy").append(answer_relevancy)
             values.get("confidence_scores").append(confidence_scores)
@@ -109,6 +114,7 @@ class KeywordSemanticSearchMetric(BaseModel):
     message: str
     goal_detail: str

+
 class TextMatchType(Enum):
     text_match = "Summary Matched"
     text_mismatch = "Summary MisMatched"
@@ -117,12 +123,14 @@ class TextMatchType(Enum):

 class ToolCallAndRoutingMetrics(BaseModel):
     dataset_name: str = ""
-    total_steps: int=0
-    llm_step: int =0
+    total_steps: int = 0
+    llm_step: int = 0
     total_tool_calls: int = 0
     expected_tool_calls: int = 0
     correct_tool_calls: int = 0
-    relevant_tool_calls: int =
+    relevant_tool_calls: int = (
+        0  # calls with the same function but different args
+    )
     total_routing_calls: int = 0
     relevant_routing_calls: int = 0
     tool_calls_with_incorrect_parameter: int = 0
@@ -135,7 +143,7 @@ class ToolCallAndRoutingMetrics(BaseModel):
     def tool_call_recall(self) -> float:
         return round(
             (
-                self.correct_tool_calls/self.expected_tool_calls
+                self.correct_tool_calls / self.expected_tool_calls
                 if self.expected_tool_calls > 0
                 else 0.0
             ),
@@ -147,8 +155,7 @@ class ToolCallAndRoutingMetrics(BaseModel):
     def tool_call_precision(self) -> float:
         return round(
             (
-                (self.correct_tool_calls)
-                / self.total_tool_calls
+                (self.correct_tool_calls) / self.total_tool_calls
                 if self.total_tool_calls > 0
                 else 0.0
             ),
@@ -167,11 +174,13 @@ class ToolCallAndRoutingMetrics(BaseModel):
             2,
         )

+
 class FailedStaticTestCases(BaseModel):
     metric_name: str
     description: str
     explanation: str

+
 class FailedSemanticTestCases(BaseModel):
     metric_name: str
     evidence: str
@@ -179,11 +188,16 @@ class FailedSemanticTestCases(BaseModel):
     output: int
     confidence: float

+
 class ReferenceLessEvalMetrics(BaseModel):
     dataset_name: str
     number_of_tool_calls: int
     number_of_successful_tool_calls: int
     number_of_static_failed_tool_calls: int
     number_of_semantic_failed_tool_calls: int
-    failed_static_tool_calls: Optional[
-
+    failed_static_tool_calls: Optional[
+        List[Tuple[int, List[FailedStaticTestCases]]]
+    ]
+    failed_semantic_tool_calls: Optional[
+        List[Tuple[int, List[FailedSemanticTestCases]]]
+    ]
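The ToolCallAndRoutingMetrics changes above only reformat the precision and recall computed fields, but they make the underlying arithmetic easy to restate. The standalone sketch below reproduces that arithmetic with invented counts; the real fields are pydantic computed_field properties on the model.

# Standalone restatement of the two computed fields; the counts are
# invented purely for illustration.
correct_tool_calls = 3    # calls that match the expected tool calls
expected_tool_calls = 4   # calls the ground truth expects
total_tool_calls = 5      # calls the agent actually made

tool_call_recall = round(
    correct_tool_calls / expected_tool_calls if expected_tool_calls > 0 else 0.0, 2
)
tool_call_precision = round(
    correct_tool_calls / total_tool_calls if total_tool_calls > 0 else 0.0, 2
)

print(tool_call_recall)     # 0.75 -> 3 of 4 expected calls were made correctly
print(tool_call_precision)  # 0.6  -> 3 of 5 actual calls were correct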
wxo_agentic_evaluation/prompt/template_render.py
CHANGED
@@ -1,7 +1,10 @@
-import jinja2
 from typing import List
+
+import jinja2
+
 from wxo_agentic_evaluation.type import ToolDefinition

+
 class JinjaTemplateRenderer:
     def __init__(self, template_path: str):
         self._template_env = jinja2.Environment(
@@ -20,7 +23,11 @@ class JinjaTemplateRenderer:

 class LlamaUserTemplateRenderer(JinjaTemplateRenderer):
     def render(
-        self,
+        self,
+        user_story: str,
+        user_response_style: List,
+        conversation_history: List,
+        attack_instructions: str = None,
     ) -> str:
         return super().render(
             user_story=user_story,
@@ -32,12 +39,17 @@ class LlamaUserTemplateRenderer(JinjaTemplateRenderer):

 class KeywordMatchingTemplateRenderer(JinjaTemplateRenderer):
     def render(self, keywords_text: str, response_text: str) -> str:
-        return super().render(
+        return super().render(
+            keywords_text=keywords_text, response_text=response_text
+        )


 class SemanticMatchingTemplateRenderer(JinjaTemplateRenderer):
     def render(self, expected_text: str, actual_text: str) -> str:
-        return super().render(
+        return super().render(
+            expected_text=expected_text, actual_text=actual_text
+        )
+

 class BadToolDescriptionRenderer(JinjaTemplateRenderer):
     def render(self, tool_definition: ToolDefinition) -> str:
@@ -51,7 +63,9 @@ class LlamaKeywordsGenerationTemplateRenderer(JinjaTemplateRenderer):

 class FaithfulnessTemplateRenderer(JinjaTemplateRenderer):
     def render(self, claim, retrieval_context):
-        return super().render(
+        return super().render(
+            claim=claim, supporting_evidence=retrieval_context
+        )


 class AnswerRelevancyTemplateRenderer(JinjaTemplateRenderer):
@@ -60,13 +74,16 @@ class AnswerRelevancyTemplateRenderer(JinjaTemplateRenderer):


 class ToolPlannerTemplateRenderer(JinjaTemplateRenderer):
-    def render(
+    def render(
+        self, user_story: str, agent_name: str, available_tools: str
+    ) -> str:
         return super().render(
             user_story=user_story,
             agent_name=agent_name,
             available_tools=available_tools,
         )
-
+
+
 class ArgsExtractorTemplateRenderer(JinjaTemplateRenderer):
     def render(self, tool_signature: str, step: dict, inputs: dict) -> str:
         return super().render(
@@ -75,8 +92,9 @@ class ArgsExtractorTemplateRenderer(JinjaTemplateRenderer):
             inputs=inputs,
         )

+
 class ToolChainAgentTemplateRenderer(JinjaTemplateRenderer):
-    def render(self, tool_call_history: List, available_tools:str) -> str:
+    def render(self, tool_call_history: List, available_tools: str) -> str:
         return super().render(
             tool_call_history=tool_call_history,
             available_tools=available_tools,
@@ -102,6 +120,7 @@ class BatchTestCaseGeneratorTemplateRenderer(JinjaTemplateRenderer):
             example_str=example_str,
         )

+
 class StoryGenerationTemplateRenderer(JinjaTemplateRenderer):
     def render(
         self,
@@ -110,7 +129,8 @@ class StoryGenerationTemplateRenderer(JinjaTemplateRenderer):
         return super().render(
             input_data=input_data,
         )
-
+
+
 class OnPolicyAttackGeneratorTemplateRenderer(JinjaTemplateRenderer):
     def render(
         self,
@@ -125,7 +145,8 @@ class OnPolicyAttackGeneratorTemplateRenderer(JinjaTemplateRenderer):
             original_story=original_story,
             original_starting_sentence=original_starting_sentence,
         )
-
+
+
 class OffPolicyAttackGeneratorTemplateRenderer(JinjaTemplateRenderer):
     def render(
         self,
@@ -135,4 +156,4 @@ class OffPolicyAttackGeneratorTemplateRenderer(JinjaTemplateRenderer):
         return super().render(
             original_story=original_story,
             original_starting_sentence=original_starting_sentence,
-        )
+        )
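All of the renderers touched above follow the same pattern: JinjaTemplateRenderer owns a jinja2 environment, and each subclass exposes a typed render() that forwards keyword arguments to super().render(). Below is a minimal, self-contained sketch of that pattern using a hypothetical inline greeting template; the real base class loads template files from the package's prompt directory, so the constructor and the template names here are assumptions.

# Minimal sketch of the renderer pattern; names below are illustrative only.
import jinja2


class SketchTemplateRenderer:
    # Stand-in for JinjaTemplateRenderer: the real class builds a
    # jinja2.Environment around template files; an in-memory Template
    # keeps this example self-contained.
    def __init__(self, template_source: str):
        self._template = jinja2.Template(template_source)

    def render(self, **kwargs) -> str:
        return self._template.render(**kwargs)


class GreetingTemplateRenderer(SketchTemplateRenderer):
    # Subclasses expose a typed render() and forward keyword arguments,
    # mirroring e.g. KeywordMatchingTemplateRenderer above.
    def render(self, user_name: str, agent_name: str) -> str:
        return super().render(user_name=user_name, agent_name=agent_name)


renderer = GreetingTemplateRenderer("Hello {{ user_name }}, I am {{ agent_name }}.")
print(renderer.render(user_name="Ada", agent_name="wxo-agent"))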