ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (61)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +8 -2
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation_package.py +114 -70
  14. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  15. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  16. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  17. wxo_agentic_evaluation/external_agent/types.py +12 -5
  18. wxo_agentic_evaluation/inference_backend.py +158 -73
  19. wxo_agentic_evaluation/llm_matching.py +4 -3
  20. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  21. wxo_agentic_evaluation/llm_user.py +7 -3
  22. wxo_agentic_evaluation/main.py +175 -67
  23. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  24. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  25. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  26. wxo_agentic_evaluation/quick_eval.py +49 -23
  27. wxo_agentic_evaluation/record_chat.py +70 -33
  28. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  29. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  30. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  38. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  39. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  40. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  41. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  42. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  43. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  44. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  45. wxo_agentic_evaluation/resource_map.py +2 -1
  46. wxo_agentic_evaluation/service_instance.py +24 -11
  47. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  48. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
  49. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  50. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  51. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  52. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  53. wxo_agentic_evaluation/tool_planner.py +128 -44
  54. wxo_agentic_evaluation/type.py +12 -9
  55. wxo_agentic_evaluation/utils/__init__.py +1 -0
  56. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  57. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  58. wxo_agentic_evaluation/utils/utils.py +83 -52
  59. ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
  60. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
  61. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/evaluation_package.py
@@ -1,46 +1,53 @@
- from typing import List
  import json
- import os
+ import os
+ from typing import List
+
  import rich

+ from wxo_agentic_evaluation import __file__
  from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
-
- from wxo_agentic_evaluation.type import (
-     ContentType,
-     Message,
-     EvaluationData,
-     EventTypes,
-     ConversationalSearch,
-     ExtendedMessage,
- )
- from wxo_agentic_evaluation.resource_map import ResourceMap
- from wxo_agentic_evaluation.service_provider import get_provider
+ from wxo_agentic_evaluation.llm_matching import LLMMatcher
+ from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
  from wxo_agentic_evaluation.metrics.metrics import (
-     KnowledgeBaseMetrics,
      KeywordSemanticSearchMetric,
+     KnowledgeBaseMetrics,
+     TextMatchType,
      ToolCallAndRoutingMetrics,
-     TextMatchType
  )
  from wxo_agentic_evaluation.prompt.template_render import (
+     AnswerRelevancyTemplateRenderer,
+     FaithfulnessTemplateRenderer,
      KeywordMatchingTemplateRenderer,
      SemanticMatchingTemplateRenderer,
-     FaithfulnessTemplateRenderer,
-     AnswerRelevancyTemplateRenderer,
  )
- from wxo_agentic_evaluation.llm_matching import LLMMatcher
- from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
- from wxo_agentic_evaluation import __file__
+ from wxo_agentic_evaluation.resource_map import ResourceMap
+ from wxo_agentic_evaluation.service_provider import get_provider
+ from wxo_agentic_evaluation.type import (
+     ContentType,
+     ConversationalSearch,
+     EvaluationData,
+     EventTypes,
+     ExtendedMessage,
+     Message,
+ )

  root_dir = os.path.dirname(__file__)
- KEYWORD_MATCHING_PROMPT_PATH = os.path.join(root_dir, "prompt", "keyword_matching_prompt.jinja2")
- SEMANTIC_MATCHING_PROMPT_PATH = os.path.join(root_dir, "prompt", "semantic_matching_prompt.jinja2")
- FAITHFULNESS_PROMPT_PATH = os.path.join(root_dir, "prompt", "faithfulness_prompt.jinja2")
- ANSWER_RELEVANCY_PROMPT_PATH = os.path.join(root_dir, "prompt", "answer_relevancy_prompt.jinja2")
+ KEYWORD_MATCHING_PROMPT_PATH = os.path.join(
+     root_dir, "prompt", "keyword_matching_prompt.jinja2"
+ )
+ SEMANTIC_MATCHING_PROMPT_PATH = os.path.join(
+     root_dir, "prompt", "semantic_matching_prompt.jinja2"
+ )
+ FAITHFULNESS_PROMPT_PATH = os.path.join(
+     root_dir, "prompt", "faithfulness_prompt.jinja2"
+ )
+ ANSWER_RELEVANCY_PROMPT_PATH = os.path.join(
+     root_dir, "prompt", "answer_relevancy_prompt.jinja2"
+ )

  RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = os.getenv(
-     "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS",
-     "<IGNORE>"
- )
+     "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS", "<IGNORE>"
+ )

  """
  - hyphens are not allowed in python function names, so it is safe to use as a dummy function name
@@ -50,6 +57,7 @@ single, summary step goals.
  """
  DUMMY_GRAPH_NODE_NAME = "dummy-goal"

+
  class EvaluationPackage:
      def __init__(
          self,
@@ -76,14 +84,18 @@ class EvaluationPackage:
          self.ground_truth = ground_truth
          self.test_case_name = test_case_name
          self.resource_map = resource_map
-
+
          if not self.is_attack_evaluation:
              self.validate_ground_truth(self.ground_truth, self.test_case_name)

          self.matcher = LLMMatcher(
              llm_client=get_provider(
                  model_id="meta-llama/llama-3-405b-instruct",
-                 params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 10},
+                 params={
+                     "min_new_tokens": 0,
+                     "decoding_method": "greedy",
+                     "max_new_tokens": 10,
+                 },
              ),
              keyword_template=KeywordMatchingTemplateRenderer(
                  KEYWORD_MATCHING_PROMPT_PATH
@@ -94,20 +106,24 @@ class EvaluationPackage:
          )
          self.rag_llm_as_a_judge = LLMJudge(
              llm_client=get_provider(
-                 model_id="meta-llama/llama-3-405b-instruct",
-                 params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 4096},
-                 ),
+                 model_id="meta-llama/llama-3-405b-instruct",
+                 params={
+                     "min_new_tokens": 0,
+                     "decoding_method": "greedy",
+                     "max_new_tokens": 4096,
+                 },
+             ),
              faithfulness=FaithfulnessTemplateRenderer(FAITHFULNESS_PROMPT_PATH),
              answer_relevancy=AnswerRelevancyTemplateRenderer(
                  ANSWER_RELEVANCY_PROMPT_PATH
              ),
          )
-
+
      @staticmethod
      def find_ground_node(graph, start_node):
-         """ Simple implementation. Should be fixed in the future
+         """Simple implementation. Should be fixed in the future

-         Assumes that there is a single graph node that does not have children
+         Assumes that there is a single graph node that does not have children
          """

          stack = [start_node]
@@ -117,21 +133,23 @@ class EvaluationPackage:
              node = stack.pop()
              if node not in visited_set:
                  visited_set.add(node)
-
+
                  # check for children
                  # improvement for future: add the ground nodes here
                  # right now, just return the first one
                  if not graph.get(node):
                      return node
-
+
                  stack.extend(graph[node])
-
+
          return None

      @staticmethod
      def is_topological_sort(graph, ordering):
          position = {node: i for i, node in enumerate(ordering)}
-         ground_node = EvaluationPackage.find_ground_node(graph, list(graph.keys())[0])
+         ground_node = EvaluationPackage.find_ground_node(
+             graph, list(graph.keys())[0]
+         )

          if ground_node is not None:
              graph[ground_node] = [DUMMY_GRAPH_NODE_NAME]
@@ -187,7 +205,11 @@ class EvaluationPackage:
                      f"Goal detail '{goal_detail.name}' does not match any goals: {goals}. test_case_name: {test_case_name}"
                  )
              if goal_detail.name == "summarize":
-                 if (not goal_detail.keywords or len(goal_detail.keywords) == 0) and (not goal_detail.response or len(goal_detail.response) == 0):
+                 if (
+                     not goal_detail.keywords or len(goal_detail.keywords) == 0
+                 ) and (
+                     not goal_detail.response or len(goal_detail.response) == 0
+                 ):
                      rich.print(
                          f"Summarize goal should have keywords or final response. test_case_name: {test_case_name}"
                      )
@@ -215,11 +237,10 @@ class EvaluationPackage:
              rich.print(
                  f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
              )
-
+
      @staticmethod
      def _check_if_args_match_with_ignore(
-         actual_args: dict[str, str],
-         expected_args: dict[str, str]
+         actual_args: dict[str, str], expected_args: dict[str, str]
      ) -> bool:
          """
          This function checks if a registered tool call matches with the goal node when:
@@ -230,15 +251,15 @@ class EvaluationPackage:
          Returns:
              bool: True if match with keyword parameters ignored | False otherwise (improper tool call).
          """
-
-         if(
-             set(actual_args.keys()) != set(expected_args.keys())
-         ):
+
+         if set(actual_args.keys()) != set(expected_args.keys()):
              return False
-
+
          for key in actual_args:
-             if actual_args[key] != expected_args[key] \
-                 and expected_args[key] != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+             if (
+                 actual_args[key] != expected_args[key]
+                 and expected_args[key] != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
+             ):
                  return False

          return True
@@ -248,18 +269,26 @@ class EvaluationPackage:
          message_outcomes = []
          labelled_messages_without_text_step = []
          # Counters for tool-calling related metrics
-         tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
+         tool_call_and_routing_metrics = ToolCallAndRoutingMetrics()
+         tool_call_and_routing_metrics.expected_tool_calls = len(
+             self.tool_dictionary
          )
-         tool_call_and_routing_metrics.expected_tool_calls = len(self.tool_dictionary)
-         correct_tool_calls = set() # sometimes, tool with the same signature can be called more than once
+         correct_tool_calls = (
+             set()
+         ) # sometimes, tool with the same signature can be called more than once
          for message in self.messages:
              if message.type == ContentType.tool_call:

                  msg_tool_call = json.loads(message.content)
-                 if self.resource_map and msg_tool_call["name"] in self.resource_map.agent2tools:
+                 if (
+                     self.resource_map
+                     and msg_tool_call["name"] in self.resource_map.agent2tools
+                 ):
                      tool_call_and_routing_metrics.total_routing_calls += 1
                      relevant = False
-                     for tool in self.resource_map.agent2tools[msg_tool_call["name"]]:
+                     for tool in self.resource_map.agent2tools[
+                         msg_tool_call["name"]
+                     ]:
                          for goal_detail in self.tool_dictionary.values():
                              if goal_detail.tool_name == tool:
                                  relevant = True
@@ -268,7 +297,9 @@ class EvaluationPackage:
                              break

                      if relevant:
-                         tool_call_and_routing_metrics.relevant_routing_calls += 1
+                         tool_call_and_routing_metrics.relevant_routing_calls += (
+                             1
+                         )
                      else:
                          message_outcome = ExtendedMessage(message=message)
                          message_outcome.reason = {
@@ -294,21 +325,26 @@ class EvaluationPackage:
                      possible_ground_truth_for_analysis = []
                      for goal_detail in matching_goal_details:
                          # {"IGNORE": None} is set in red teaming attack ground truth to ignore parameter matching
-                         if goal_detail.args == {"IGNORE": None} or (msg_tool_call["args"] == goal_detail.args or
-                             self._check_if_args_match_with_ignore(
-                                 msg_tool_call["args"],
-                                 goal_detail.args
-                             )):
+                         if goal_detail.args == {"IGNORE": None} or (
+                             msg_tool_call["args"] == goal_detail.args
+                             or self._check_if_args_match_with_ignore(
+                                 msg_tool_call["args"], goal_detail.args
+                             )
+                         ):
                              labelled_messages.append(goal_detail.name)
-                             labelled_messages_without_text_step.append(goal_detail.name)
+                             labelled_messages_without_text_step.append(
+                                 goal_detail.name
+                             )
                              correct_tool_calls.add(goal_detail.name)
-                             #tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
+                             # tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
                              found = True
                              message_outcome = ExtendedMessage(message=message)
                              message_outcomes.append(message_outcome)
                              break
                          else:
-                             possible_ground_truth_for_analysis.append(goal_detail.args)
+                             possible_ground_truth_for_analysis.append(
+                                 goal_detail.args
+                             )

                  if not found:
                      message_outcome = ExtendedMessage(message=message)
@@ -324,7 +360,7 @@ class EvaluationPackage:
                              f"Expected one of {[g.args for g in matching_goal_details]}, Received={msg_tool_call['args']}[/red]"
                          )
                  else:
-
+
                      if not self.is_attack_evaluation:
                          rich.print(
                              f"[yellow][WARNING] Unexpected function call: {msg_tool_call['name']}[/yellow]"
@@ -350,7 +386,9 @@ class EvaluationPackage:
                  message_outcome = ExtendedMessage(message=message)
                  message_outcomes.append(message_outcome)

-         tool_call_and_routing_metrics.correct_tool_calls = len(correct_tool_calls)
+         tool_call_and_routing_metrics.correct_tool_calls = len(
+             correct_tool_calls
+         )

          assistant_responses = [
              message
@@ -430,7 +468,9 @@ class EvaluationPackage:
                  llm_steps += 1
                  total_step += 1

-         knowledge_base_metric_summary = self.generate_knowledge_base_metric_summary()
+         knowledge_base_metric_summary = (
+             self.generate_knowledge_base_metric_summary()
+         )
          # TO-DO: the table is not printing properly anymore with the new columns introduced
          # we need to introduce a separate table for these.

@@ -524,7 +564,8 @@ class EvaluationPackage:
              ) # name of knowledge base

              search_results = [
-                 result.body for result in conversational_search_data.search_results
+                 result.body
+                 for result in conversational_search_data.search_results
              ]
              faithfulness = self.rag_llm_as_a_judge.faithfulness(
                  conversational_search_data.text, search_results
@@ -547,6 +588,7 @@ class EvaluationPackage:

          return metrics

+
  if __name__ == "__main__":

      messages = []
@@ -564,7 +606,9 @@ if __name__ == "__main__":

      for message in messages:
          if message.role == "user":
-             rich.print("[yellow]GENERATED_USER_MESSAGE:[/yellow]", message.content)
+             rich.print(
+                 "[yellow]GENERATED_USER_MESSAGE:[/yellow]", message.content
+             )
          else:
              rich.print("[orange3]WXO:[/orange3]", message.content)

@@ -574,7 +618,7 @@ if __name__ == "__main__":
      evaluate_package = EvaluationPackage(
          test_case_name="data1.messages.json",
          ground_truth=ground_truth,
-         messages=messages
+         messages=messages,
      )
      print(evaluate_package.generate_summary())
      # print(evaluate_package.traverse())
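Editor's note on the argument-matching logic reformatted above: `_check_if_args_match_with_ignore` treats the reserved ground-truth value `<IGNORE>` (configurable through the RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS environment variable) as a wildcard for a parameter's value while still requiring the parameter names to match exactly. A minimal standalone sketch of that rule, with invented argument values that are not from the package's test data:

    RESERVED = "<IGNORE>"

    def args_match_with_ignore(actual_args: dict, expected_args: dict) -> bool:
        # the set of parameter names must match exactly
        if set(actual_args.keys()) != set(expected_args.keys()):
            return False
        # a value only has to match when the ground truth does not use the wildcard
        for key, expected in expected_args.items():
            if expected != RESERVED and actual_args[key] != expected:
                return False
        return True

    assert args_match_with_ignore({"location": "USA"}, {"location": "<IGNORE>"})
    assert not args_match_with_ignore({"location": "USA"}, {"city": "<IGNORE>"})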
wxo_agentic_evaluation/external_agent/__init__.py
@@ -1,21 +1,28 @@
  import importlib.resources
  import json
+
  import rich

- from wxo_agentic_evaluation.prompt.template_render import StoryGenerationTemplateRenderer
- from wxo_agentic_evaluation.service_provider import get_provider, ProviderConfig
  from wxo_agentic_evaluation import prompt
+ from wxo_agentic_evaluation.prompt.template_render import (
+     StoryGenerationTemplateRenderer,
+ )
+ from wxo_agentic_evaluation.service_provider import ProviderConfig, get_provider

  console = rich.console.Console()

+
  def starting_sentence_generation_prompt():
-     with importlib.resources.path(prompt, "starting_sentence_generation_prompt.jinja2") as fp:
+     with importlib.resources.path(
+         prompt, "starting_sentence_generation_prompt.jinja2"
+     ) as fp:
          # reuse the StoryGenerationTemplateRenderer class, even though we are generating a "starting_sentence" instead of a "story"
          # the starting sentence generation prompts uses the same input variable
          render = StoryGenerationTemplateRenderer(str(fp))
-
+
      return render

+
  def generate_starting_sentence(annotated_data: dict):
      renderer = starting_sentence_generation_prompt()
      llm_decode_parameter = {
@@ -23,7 +30,9 @@ def generate_starting_sentence(annotated_data: dict):
          "decoding_method": "greedy",
          "max_new_tokens": 4096,
      }
-     wai_client = get_provider(model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter)
+     wai_client = get_provider(
+         model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter
+     )
      prompt = renderer.render(input_data=json.dumps(annotated_data, indent=4))
      res = wai_client.query(prompt)
      res = res.strip()
@@ -33,5 +42,7 @@ def generate_starting_sentence(annotated_data: dict):
          res = json.loads(res)
          return res["starting_sentence"]
      except Exception:
-         console.log(f"The generated `starting_sentence` had incorrect format: '{res}'")
-         return res
+         console.log(
+             f"The generated `starting_sentence` had incorrect format: '{res}'"
+         )
+         return res
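As the hunks above show, generate_starting_sentence expects the model to return a JSON object with a "starting_sentence" field and falls back to the raw, stripped model output when parsing fails. A simplified sketch of that fallback behaviour, with hypothetical strings:

    import json

    def extract_starting_sentence(raw: str) -> str:
        raw = raw.strip()
        try:
            # happy path: the model returned the requested JSON object
            return json.loads(raw)["starting_sentence"]
        except Exception:
            # fall back to the raw text, as the real function does after logging
            return raw

    assert extract_starting_sentence('{"starting_sentence": "Hi there"}') == "Hi there"
    assert extract_starting_sentence("not valid json") == "not valid json"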
wxo_agentic_evaluation/external_agent/external_validate.py
@@ -1,15 +1,21 @@
+ import json
  from typing import Generator
+
  import requests
- import json
  import rich

- from wxo_agentic_evaluation.external_agent.types import UniversalData, SchemaValidationResults
-
+ from wxo_agentic_evaluation.external_agent.types import (
+     SchemaValidationResults,
+     UniversalData,
+ )

  MESSAGES = [
      {"role": "user", "content": "what's the holiday is June 13th in us?"},
-     {"role": "assistant", "content": "tool_name: calendar_lookup, args {\"location\": \"USA\", \"data\": \"06-13-2025\"}}"},
-     {"role": "assistant", "content":"it's National Sewing Machine Day"}
+     {
+         "role": "assistant",
+         "content": 'tool_name: calendar_lookup, args {"location": "USA", "data": "06-13-2025"}}',
+     },
+     {"role": "assistant", "content": "it's National Sewing Machine Day"},
  ]


@@ -18,7 +24,7 @@ class ExternalAgentValidation:
          self.credential = credential
          self.auth_scheme = auth_scheme
          self.service_url = service_url
-
+
      @property
      def header(self):
          header = {"Content-Type": "application/json"}
@@ -32,23 +38,23 @@ class ExternalAgentValidation:
          return header

      def _parse_streaming_events(self, resp: Generator[bytes, None, None]):
-         data = b''
+         data = b""
          for chunk in resp:
              for line in chunk.splitlines(True):
-                 if line.startswith(b'data:'):
-                     line = line.replace(b'data:', b'')
-                     if line.strip() == b'[DONE]':
+                 if line.startswith(b"data:"):
+                     line = line.replace(b"data:", b"")
+                     if line.strip() == b"[DONE]":
                          return
                  data += line
-                 if data.endswith((b'\r\r', b'\n\n', b'\r\n\r\n')):
+                 if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                      # NOTE: edge case, "data" can be sent in two different chunks
-                     if data.startswith(b'data:'):
-                         data = data.replace(b'data:', b'')
+                     if data.startswith(b"data:"):
+                         data = data.replace(b"data:", b"")
                      yield data
-                     data = b''
+                     data = b""
          if data:
              yield data
-
+
      def _validate_streaming_response(self, resp):
          success = True
          logged_events = []
@@ -61,52 +67,57 @@ class ExternalAgentValidation:
          except Exception as e:
              success = False
              break
-
+
          return success, logged_events

      def _validate_schema_compliance(self, messages):
          payload = {"stream": True}
          payload["messages"] = messages
-         resp = requests.post(url=self.service_url, headers=self.header, json=payload)
+         resp = requests.post(
+             url=self.service_url, headers=self.header, json=payload
+         )
          success, logged_events = self._validate_streaming_response(resp)

          msg = ", ".join([msg["content"] for msg in payload["messages"]])

          if success:
-             rich.print(f":white_check_mark: External Agent streaming response validation succeeded for '{msg}'.")
+             rich.print(
+                 f":white_check_mark: External Agent streaming response validation succeeded for '{msg}'."
+             )
          else:
-             rich.print(f":heavy_exclamation_mark:Schema validation failed for messages: '{msg}':heavy_exclamation_mark:\n The last logged event was {logged_events[-1]}.\n")
+             rich.print(
+                 f":heavy_exclamation_mark:Schema validation failed for messages: '{msg}':heavy_exclamation_mark:\n The last logged event was {logged_events[-1]}.\n"
+             )

          return success, logged_events

-     def call_validation(self, input_str: str, add_context: bool = False) -> SchemaValidationResults:
+     def call_validation(
+         self, input_str: str, add_context: bool = False
+     ) -> SchemaValidationResults:
          if add_context:
              return self.block_validation(input_str)

-         msg = {
-             "role": "user",
-             "content": input_str
-         }
-
+         msg = {"role": "user", "content": input_str}
+
          success, logged_events = self._validate_schema_compliance([msg])
-         results = SchemaValidationResults(success=success, logged_events=logged_events, messages=[msg])
+         results = SchemaValidationResults(
+             success=success, logged_events=logged_events, messages=[msg]
+         )

          return results.model_dump()
-
+
      def block_validation(self, input_str: str) -> SchemaValidationResults:
-         """ Tests a block of messages
-         """
+         """Tests a block of messages"""
          rich.print(
              f"[gold3]The following prebuilt messages, '{MESSAGES}' is prepended to the input message, '{input_str}'"
          )

-         msg = {
-             "role": "user",
-             "content": input_str
-         }
+         msg = {"role": "user", "content": input_str}

          messages = MESSAGES + [msg]
          success, logged_events = self._validate_schema_compliance(messages)
-         results = SchemaValidationResults(success=success, logged_events=logged_events, messages=messages)
+         results = SchemaValidationResults(
+             success=success, logged_events=logged_events, messages=messages
+         )

-         return results.model_dump()
+         return results.model_dump()
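The quoting changes in `_parse_streaming_events` above do not alter its behaviour: it buffers bytes line by line, strips `data:` prefixes, stops at a `[DONE]` marker, and yields one event per blank-line boundary. A simplified, self-contained sketch of that framing (the sample chunks are invented, and the second `data:` edge-case strip is omitted):

    def parse_sse(chunks):
        data = b""
        for chunk in chunks:
            for line in chunk.splitlines(True):
                if line.startswith(b"data:"):
                    line = line.replace(b"data:", b"")
                    if line.strip() == b"[DONE]":
                        return
                data += line
                # a blank line terminates one server-sent event
                if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                    yield data
                    data = b""
        if data:
            yield data

    events = list(parse_sse([b'data: {"delta": "Hel"}\n\n', b'data: {"delta": "lo"}\n\ndata: [DONE]\n\n']))
    assert len(events) == 2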
wxo_agentic_evaluation/external_agent/performance_test.py
@@ -1,10 +1,15 @@
- from typing import List, Mapping, Any
+ from typing import Any, List, Mapping
+
  from rich.console import Console

- from wxo_agentic_evaluation.external_agent import generate_starting_sentence
  from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
+ from wxo_agentic_evaluation.data_annotator import (
+     KeywordsGenerationLLM,
+     LlamaKeywordsGenerationTemplateRenderer,
+ )
+ from wxo_agentic_evaluation.external_agent import generate_starting_sentence
  from wxo_agentic_evaluation.service_provider import get_provider
- from wxo_agentic_evaluation.data_annotator import KeywordsGenerationLLM, LlamaKeywordsGenerationTemplateRenderer
+

  class ExternalAgentPerformanceTest:
      def __init__(self, agent_name: str, test_data: List[str]):
@@ -12,8 +17,7 @@ class ExternalAgentPerformanceTest:
          self.goal_template = {
              "agent": agent_name,
              "goals": {"summarize": []},
-             "goal_details": [
-             ],
+             "goal_details": [],
              "story": "<placeholder>",
          }

@@ -24,42 +28,50 @@ class ExternalAgentPerformanceTest:
              "decoding_method": "greedy",
              "max_new_tokens": 256,
          }
-         wai_client = get_provider(model_id=kw_gen_config.model_id, params=llm_decode_parameter)
-
+         wai_client = get_provider(
+             model_id=kw_gen_config.model_id, params=llm_decode_parameter
+         )
+
          self.kw_gen = KeywordsGenerationLLM(
              provider=wai_client,
              template=LlamaKeywordsGenerationTemplateRenderer(
                  kw_gen_config.prompt_config
-                 ),
-             )
-
+             ),
+         )
+
      def generate_tests(self) -> List[Mapping[str, Any]]:
          console = Console()
          goal_templates = []

-         with console.status("[gold3]Creating starting sentence for user story from input file for performance testing") as status:
+         with console.status(
+             "[gold3]Creating starting sentence for user story from input file for performance testing"
+         ) as status:
              for sentence, response in self.test_data:
                  goal_temp = self.goal_template.copy()
                  goal_temp["story"] = sentence

                  keywords = self.kw_gen.genereate_keywords(response)
                  summarize_step = {
-                         "name": "summarize",
-                         "type": "text",
-                         "response": response,
-                         "keywords": keywords
-                     }
+                     "name": "summarize",
+                     "type": "text",
+                     "response": response,
+                     "keywords": keywords,
+                 }
                  goal_temp["goal_details"] = [summarize_step]
-                 goal_temp["starting_sentence"] = generate_starting_sentence(goal_temp)
+                 goal_temp["starting_sentence"] = generate_starting_sentence(
+                     goal_temp
+                 )

                  goal_templates.append(goal_temp)
-
+
              status.stop()
-             console.print("[bold green]Done creating starting sentence from provided input data")
+             console.print(
+                 "[bold green]Done creating starting sentence from provided input data"
+             )

          return goal_templates


  if __name__ == "__main__":
      t = ExternalAgentPerformanceTest("test")
-     t.generate_tests()
+     t.generate_tests()
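For context on the template being populated above: generate_tests turns each (story sentence, expected response) pair from test_data into one goal template whose only goal is a "summarize" text step. The dictionary below illustrates the resulting shape with invented values; in the package the keywords come from the LLM keyword generator and the starting sentence from generate_starting_sentence:

    example_goal_template = {
        "agent": "my_external_agent",  # hypothetical agent name
        "goals": {"summarize": []},
        "goal_details": [
            {
                "name": "summarize",
                "type": "text",
                "response": "It's National Sewing Machine Day",
                "keywords": ["National Sewing Machine Day"],  # illustrative keywords
            }
        ],
        "story": "what's the holiday is June 13th in us?",
        "starting_sentence": "Hi, can you tell me what holiday falls on June 13th in the US?",
    }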