azure-ai-evaluation 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Potentially problematic release: this version of azure-ai-evaluation has been flagged as possibly problematic.
- azure/ai/evaluation/__init__.py +10 -0
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
- azure/ai/evaluation/_aoai/label_grader.py +66 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
- azure/ai/evaluation/_azure/_clients.py +4 -4
- azure/ai/evaluation/_azure/_envs.py +208 -0
- azure/ai/evaluation/_azure/_token_manager.py +12 -7
- azure/ai/evaluation/_common/__init__.py +7 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +163 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +139 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +165 -34
- azure/ai/evaluation/_common/raiclient/_version.py +1 -1
- azure/ai/evaluation/_common/utils.py +79 -1
- azure/ai/evaluation/_constants.py +16 -0
- azure/ai/evaluation/_converters/_ai_services.py +162 -118
- azure/ai/evaluation/_converters/_models.py +76 -6
- azure/ai/evaluation/_eval_mapping.py +73 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +325 -76
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +553 -0
- azure/ai/evaluation/_evaluate/_utils.py +117 -4
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +469 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +11 -1
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -1
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +16 -2
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +11 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -2
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +10 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +11 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +86 -12
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
- azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +114 -22
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +976 -546
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
- azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +38 -25
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +43 -28
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +26 -18
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +15 -10
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/METADATA +49 -3
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/RECORD +144 -86
- /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_converters/_models.py

```diff
@@ -3,10 +3,18 @@ import json
 
 from pydantic import BaseModel
 
-from azure.ai.projects.models import RunStepFunctionToolCall
-
 from typing import List, Optional, Union
 
+# Models moved in a later version of agents SDK, so try a few different locations
+try:
+    from azure.ai.projects.models import RunStepFunctionToolCall
+except ImportError:
+    pass
+try:
+    from azure.ai.agents.models import RunStepFunctionToolCall
+except ImportError:
+    pass
+
 # Message roles constants.
 _SYSTEM = "system"
 _USER = "user"
```
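The new import block follows the standard fallback-import idiom: try the old module path, then the new one, and only fail when the symbol is actually needed. A minimal, self-contained sketch of the same idiom (the trailing `None` check is an illustrative addition, not part of the package):

```python
# Sketch of the fallback-import idiom: the second try rebinds the name if the
# newer SDK is installed; if neither import works, the name stays None.
RunStepFunctionToolCall = None
try:
    from azure.ai.projects.models import RunStepFunctionToolCall  # older location
except ImportError:
    pass
try:
    from azure.ai.agents.models import RunStepFunctionToolCall  # newer location
except ImportError:
    pass

if RunStepFunctionToolCall is None:
    # Neither agents SDK flavor is installed; code that needs the type must handle this.
    print("RunStepFunctionToolCall is unavailable")
```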
```diff
@@ -21,6 +29,57 @@ _FUNCTION = "function"
 # This is returned by AI services in the API to filter against tool invocations.
 _TOOL_CALLS = "tool_calls"
 
+# Constants to only be used internally in this file for the built-in tools.
+_CODE_INTERPRETER = "code_interpreter"
+_BING_GROUNDING = "bing_grounding"
+_FILE_SEARCH = "file_search"
+_AZURE_AI_SEARCH = "azure_ai_search"
+_FABRIC_DATAAGENT = "fabric_dataagent"
+
+# Built-in tool descriptions and parameters are hidden, but we include basic descriptions
+# for evaluation purposes.
+_BUILT_IN_DESCRIPTIONS = {
+    _CODE_INTERPRETER: "Use code interpreter to read and interpret information from datasets, "
+    + "generate code, and create graphs and charts using your data. Supports "
+    + "up to 20 files.",
+    _BING_GROUNDING: "Enhance model output with web data.",
+    _FILE_SEARCH: "Search for data across uploaded files.",
+    _AZURE_AI_SEARCH: "Search an Azure AI Search index for relevant data.",
+    _FABRIC_DATAAGENT: "Connect to Microsoft Fabric data agents to retrieve data across different data sources.",
+}
+
+# Built-in tool parameters are hidden, but we include basic parameters for evaluation purposes.
+_BUILT_IN_PARAMS = {
+    _CODE_INTERPRETER: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Generated code to be executed."}},
+    },
+    _BING_GROUNDING: {
+        "type": "object",
+        "properties": {"requesturl": {"type": "string", "description": "URL used in Bing Search API."}},
+    },
+    _FILE_SEARCH: {
+        "type": "object",
+        "properties": {
+            "ranking_options": {
+                "type": "object",
+                "properties": {
+                    "ranker": {"type": "string", "description": "Ranking algorithm to use."},
+                    "score_threshold": {"type": "number", "description": "Threshold for search results."},
+                },
+                "description": "Ranking options for search results.",
+            }
+        },
+    },
+    _AZURE_AI_SEARCH: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Search terms to use."}},
+    },
+    _FABRIC_DATAAGENT: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Search terms to use."}},
+    },
+}
 
 class Message(BaseModel):
     """Represents a message in a conversation with agents, assistants, and tools. We need to export these structures
```
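Because the service does not expose definitions for built-in tools, the `_BUILT_IN_DESCRIPTIONS` and `_BUILT_IN_PARAMS` tables above give the converter something to describe for evaluation. A hedged sketch of how the two tables could be joined into a single tool-definition dict; the `builtin_tool_definition` helper is illustrative and not part of the package:

```python
# Illustrative only: join the built-in description and parameter tables into a
# ToolDefinition-shaped dict keyed by the built-in tool's type string.
_BUILT_IN_DESCRIPTIONS = {"bing_grounding": "Enhance model output with web data."}
_BUILT_IN_PARAMS = {
    "bing_grounding": {
        "type": "object",
        "properties": {"requesturl": {"type": "string", "description": "URL used in Bing Search API."}},
    },
}

def builtin_tool_definition(tool_type: str) -> dict:
    """Hypothetical helper: build a minimal definition for a hidden built-in tool."""
    return {
        "name": tool_type,
        "type": tool_type,
        "description": _BUILT_IN_DESCRIPTIONS.get(tool_type, ""),
        "parameters": _BUILT_IN_PARAMS.get(tool_type, {}),
    }

print(builtin_tool_definition("bing_grounding")["description"])
```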
```diff
@@ -98,6 +157,8 @@ class ToolDefinition(BaseModel):
 
     :param name: The name of the tool.
     :type name: str
+    :param type: The type of the tool.
+    :type type: str
     :param description: A description of the tool.
     :type description: str
     :param parameters: The parameters required by the tool.
```
```diff
@@ -105,6 +166,7 @@ class ToolDefinition(BaseModel):
     """
 
     name: str
+    type: str
     description: Optional[str] = None
     parameters: dict
 
```
```diff
@@ -191,6 +253,10 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
         arguments = {
             "ranking_options": {"ranker": options["ranker"], "score_threshold": options["score_threshold"]}
         }
+    elif tool_call.details["type"] == "azure_ai_search":
+        arguments = {"input": tool_call.details["azure_ai_search"]["input"]}
+    elif tool_call.details["type"] == "fabric_dataagent":
+        arguments = {"input": tool_call.details["fabric_dataagent"]["input"]}
     else:
         # unsupported tool type, skip
         return messages
```
```diff
@@ -211,17 +277,17 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
     messages.append(AssistantMessage(run_id=run_id, content=[to_dict(content_tool_call)], createdAt=tool_call.created))
 
     if hasattr(tool_call.details, _FUNCTION):
-        output = safe_loads(tool_call.details.function
+        output = safe_loads(tool_call.details.function["output"])
     else:
         try:
             # Some built-ins may have output, others may not
             # Try to retrieve it, but if we don't find anything, skip adding the message
             # Just manually converting to dicts for easy serialization for now rather than custom serializers
-            if tool_call.details.type ==
+            if tool_call.details.type == _CODE_INTERPRETER:
                 output = tool_call.details.code_interpreter.outputs
-            elif tool_call.details.type ==
+            elif tool_call.details.type == _BING_GROUNDING:
                 return messages  # not supported yet from bing grounding tool
-            elif tool_call.details.type ==
+            elif tool_call.details.type == _FILE_SEARCH:
                 output = [
                     {
                         "file_id": result.file_id,
```
```diff
@@ -231,6 +297,10 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
                     }
                     for result in tool_call.details.file_search.results
                 ]
+            elif tool_call.details.type == _AZURE_AI_SEARCH:
+                output = tool_call.details.azure_ai_search["output"]
+            elif tool_call.details.type == _FABRIC_DATAAGENT:
+                output = tool_call.details.fabric_dataagent["output"]
         except:
             return messages
 
```
azure/ai/evaluation/_eval_mapping.py (new file)

```diff
@@ -0,0 +1,73 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+# Note: This was removed from the normal constants file due to circular import issues.
+
+# In the future, it would be nice to instead rely on the id value
+# of each eval class, but I wouldn't like to rely on those before
+# we simplify them into version-less, static values, instead of the
+# problematic registry references they currently are.
+
+# Import all evals
+from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
+from azure.ai.evaluation import (
+    BleuScoreEvaluator,
+    CodeVulnerabilityEvaluator,
+    CoherenceEvaluator,
+    ContentSafetyEvaluator,
+    DocumentRetrievalEvaluator,
+    F1ScoreEvaluator,
+    FluencyEvaluator,
+    GleuScoreEvaluator,
+    GroundednessEvaluator,
+    GroundednessProEvaluator,
+    HateUnfairnessEvaluator,
+    IndirectAttackEvaluator,
+    IntentResolutionEvaluator,
+    MeteorScoreEvaluator,
+    ProtectedMaterialEvaluator,
+    QAEvaluator,
+    RelevanceEvaluator,
+    ResponseCompletenessEvaluator,
+    RetrievalEvaluator,
+    RougeScoreEvaluator,
+    SelfHarmEvaluator,
+    SexualEvaluator,
+    SimilarityEvaluator,
+    TaskAdherenceEvaluator,
+    ToolCallAccuracyEvaluator,
+    UngroundedAttributesEvaluator,
+    ViolenceEvaluator
+)
+
+EVAL_CLASS_MAP = {
+    BleuScoreEvaluator: "bleu_score",
+    CodeVulnerabilityEvaluator: "code_vulnerability",
+    CoherenceEvaluator: "coherence",
+    ContentSafetyEvaluator: "content_safety",
+    DocumentRetrievalEvaluator: "document_retrieval",
+    ECIEvaluator: "eci",
+    F1ScoreEvaluator: "f1_score",
+    FluencyEvaluator: "fluency",
+    GleuScoreEvaluator: "gleu_score",
+    GroundednessEvaluator: "groundedness",
+    GroundednessProEvaluator: "groundedness_pro",
+    HateUnfairnessEvaluator: "hate_unfairness",
+    IndirectAttackEvaluator: "indirect_attack",
+    IntentResolutionEvaluator: "intent_resolution",
+    MeteorScoreEvaluator: "meteor_score",
+    ProtectedMaterialEvaluator: "protected_material",
+    QAEvaluator: "qa",
+    RelevanceEvaluator: "relevance",
+    ResponseCompletenessEvaluator: "response_completeness",
+    RetrievalEvaluator: "retrieval",
+    RougeScoreEvaluator: "rouge_score",
+    SelfHarmEvaluator: "self_harm",
+    SexualEvaluator: "sexual",
+    SimilarityEvaluator: "similarity",
+    TaskAdherenceEvaluator: "task_adherence",
+    ToolCallAccuracyEvaluator: "tool_call_accuracy",
+    UngroundedAttributesEvaluator: "ungrounded_attributes",
+    ViolenceEvaluator: "violence",
+}
```
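The new `_eval_mapping.py` module assigns each evaluator class a stable, version-independent string identifier. A minimal sketch of how such a map can be queried; the `eval_name_for` helper is illustrative, and how the package itself consumes `EVAL_CLASS_MAP` is not shown in this diff:

```python
# Illustrative lookup against the new mapping (requires azure-ai-evaluation >= 1.7.0).
from azure.ai.evaluation import F1ScoreEvaluator
from azure.ai.evaluation._eval_mapping import EVAL_CLASS_MAP

def eval_name_for(evaluator) -> str:
    """Hypothetical helper: resolve the short metric name for an evaluator instance."""
    return EVAL_CLASS_MAP.get(type(evaluator), type(evaluator).__name__)

print(eval_name_for(F1ScoreEvaluator()))  # -> "f1_score"
```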
azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py

```diff
@@ -2,11 +2,12 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+import asyncio
 import logging
 import pandas as pd
 import sys
 from collections import defaultdict
-from concurrent.futures import Future
+from concurrent.futures import Future
 from os import PathLike
 from typing import Any, Callable, Dict, Final, List, Mapping, Optional, Sequence, Union, cast
 
```
```diff
@@ -14,6 +15,8 @@ from .batch_clients import BatchClientRun, HasAsyncCallable
 from ..._legacy._batch_engine._run_submitter import RunSubmitter
 from ..._legacy._batch_engine._config import BatchEngineConfig
 from ..._legacy._batch_engine._run import Run
+from ..._legacy._adapters._constants import LINE_NUMBER
+from ..._legacy._common._thread_pool_executor_with_context import ThreadPoolExecutorWithContext
 
 
 LOGGER = logging.getLogger(__name__)
```
```diff
@@ -22,7 +25,9 @@ LOGGER = logging.getLogger(__name__)
 class RunSubmitterClient:
     def __init__(self, config: Optional[BatchEngineConfig] = None) -> None:
         self._config = config or BatchEngineConfig(LOGGER, use_async=True)
-        self._thread_pool =
+        self._thread_pool = ThreadPoolExecutorWithContext(
+            thread_name_prefix="evaluators_thread",
+            max_workers=self._config.max_concurrency)
 
     def run(
         self,
```
```diff
@@ -33,30 +38,36 @@
         **kwargs: Any,
     ) -> BatchClientRun:
         if not isinstance(data, pd.DataFrame):
-            # Should never get here
             raise ValueError("Data must be a pandas DataFrame")
-        if not column_mapping:
-            raise ValueError("Column mapping must be provided")
 
-        # The column mappings are
+        # The column mappings are indexed by data to indicate they come from the data
         # input. Update the inputs so that each entry is a dictionary with a data key
         # that contains the original input data.
         inputs = [{"data": input_data} for input_data in data.to_dict(orient="records")]
 
-        #
+        # Pass the correct previous run to the evaluator
+        run: Optional[BatchClientRun] = kwargs.pop("run", None)
+        if run:
+            kwargs["run"] = self._get_run(run)
+
+        # Try to get async function to use
         if isinstance(flow, HasAsyncCallable):
             flow = flow._to_async()  # pylint: disable=protected-access
 
-
+        # Start an event loop for async execution on a thread pool thread to separate it
+        # from the caller's thread.
+        run_submitter = RunSubmitter(self._config, self._thread_pool)
         run_future = self._thread_pool.submit(
-
-
-
-
-
-
-
-
+            asyncio.run,
+            run_submitter.submit(
+                dynamic_callable=flow,
+                inputs=inputs,
+                column_mapping=column_mapping,
+                name_prefix=evaluator_name,
+                created_on=kwargs.pop("created_on", None),
+                storage_creator=kwargs.pop("storage_creator", None),
+                **kwargs,
+            )
         )
 
         return run_future
```
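The rewritten `run()` hands `asyncio.run` and the `RunSubmitter.submit(...)` coroutine to the thread pool, so the event loop lives entirely on a worker thread and the caller only holds a `Future`. A stripped-down sketch of that pattern with placeholder names:

```python
# Minimal sketch: execute a coroutine to completion on a worker thread via asyncio.run,
# keeping the submitting thread (and any event loop it may own) uninvolved.
import asyncio
from concurrent.futures import ThreadPoolExecutor

async def submit(inputs):
    # Placeholder for RunSubmitter.submit(...): pretend to process each row.
    await asyncio.sleep(0.01)
    return [{"output": value} for value in inputs]

pool = ThreadPoolExecutor(max_workers=2, thread_name_prefix="evaluators_thread")
future = pool.submit(asyncio.run, submit([1, 2, 3]))
print(future.result())  # blocks only when the caller asks for the result
pool.shutdown()
```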
```diff
@@ -75,7 +86,10 @@
                 key = f"{prefix}.{k}"
                 data[key].append(value)
 
+        # Go from a list of dictionaries (i.e. a row view of the data) to a dictionary of lists
+        # (i.e. a column view of the data)
         _update("inputs", run.inputs)
+        _update("inputs", [{ LINE_NUMBER: i } for i in range(len(run.inputs)) ])
         _update("outputs", run.outputs)
 
         df = pd.DataFrame(data).reindex(columns=[k for k in data.keys()])
```
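The comment added above describes the reshaping that `_update` performs: the run results arrive as a row view (a list of dicts) and are flattened into a column view (a dict of lists) before the DataFrame is built. A small standalone sketch of that transformation with illustrative names:

```python
# Sketch: turn a row view (list of dicts) into a column view (dict of lists),
# prefixing keys the way the resulting DataFrame columns are prefixed.
from collections import defaultdict
import pandas as pd

rows = [{"query": "q1", "score": 4}, {"query": "q2", "score": 5}]
columns = defaultdict(list)
for row in rows:
    for key, value in row.items():
        columns[f"inputs.{key}"].append(value)

df = pd.DataFrame(columns)
print(df.columns.tolist())  # ['inputs.query', 'inputs.score']
```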
azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py

```diff
@@ -8,6 +8,10 @@ from typing import Optional, Type, Union
 from azure.ai.evaluation._legacy._adapters._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
 from azure.ai.evaluation._legacy._adapters.utils import ClientUserAgentUtil
 from azure.ai.evaluation._legacy._adapters.tracing import inject_openai_api, recover_openai_api
+from azure.ai.evaluation._legacy._batch_engine._openai_injector import (
+    inject_openai_api as ported_inject_openai_api,
+    recover_openai_api as ported_recover_openai_api,
+)
 
 from azure.ai.evaluation._constants import (
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT,
```
```diff
@@ -68,6 +72,7 @@ class EvalRunContext:
 
         if isinstance(self.client, RunSubmitterClient):
             set_event_loop_policy()
+            ported_inject_openai_api()
 
     def __exit__(
         self,
```
```diff
@@ -92,3 +97,6 @@
         if self._is_otel_timeout_set_by_system:
             os.environ.pop(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, None)
             self._is_otel_timeout_set_by_system = False
+
+        if isinstance(self.client, RunSubmitterClient):
+            ported_recover_openai_api()
```
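Taken together, the two hunks above pair the ported OpenAI injection on `__enter__` with a matching recovery on `__exit__` whenever the client is a `RunSubmitterClient`. The general shape of that enter/exit pairing, sketched with stand-in functions rather than the package's own injector:

```python
# Sketch of the pairing: patch on the way in, always undo on the way out,
# even if the body raises. The two functions stand in for the ported injector.
from contextlib import contextmanager

def inject_openai_api():   # stand-in for ported_inject_openai_api
    print("OpenAI client patched")

def recover_openai_api():  # stand-in for ported_recover_openai_api
    print("OpenAI client restored")

@contextmanager
def eval_run_context():
    inject_openai_api()
    try:
        yield
    finally:
        recover_openai_api()

with eval_run_context():
    print("running evaluation batch")
```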
azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py

```diff
@@ -58,6 +58,11 @@ class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
         if not name:
             name = f"azure_ai_evaluation_evaluators_{evaluator_name}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}"
 
+        # Pass the correct previous run to the evaluator
+        run: Optional[BatchClientRun] = kwargs.pop("run", None)
+        if run:
+            kwargs["run"] = self.get_result(run)
+
         batch_use_async = self._should_batch_use_async(flow_to_run)
         eval_future = self._thread_pool.submit(
             self._pf_client.run,
```
azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py

```diff
@@ -5,8 +5,15 @@ import os
 import types
 from typing import Optional, Type
 
+from azure.ai.evaluation._evaluate._batch_run.batch_clients import BatchClient
+from azure.ai.evaluation._evaluate._batch_run import RunSubmitterClient
 from azure.ai.evaluation._legacy._adapters._constants import PF_FLOW_ENTRY_IN_TMP
+from azure.ai.evaluation._legacy._batch_engine._openai_injector import (
+    inject_openai_api as ported_inject_openai_api,
+    recover_openai_api as ported_recover_openai_api,
+)
 from azure.ai.evaluation._constants import PF_DISABLE_TRACING
+from azure.ai.evaluation._evaluate._utils import set_event_loop_policy
 
 
 class TargetRunContext:
```
```diff
@@ -16,7 +23,8 @@ class TargetRunContext:
     :type upload_snapshot: bool
     """
 
-    def __init__(self, upload_snapshot: bool = False) -> None:
+    def __init__(self, client: BatchClient, upload_snapshot: bool = False) -> None:
+        self._client = client
         self._upload_snapshot = upload_snapshot
         self._original_cwd = os.getcwd()
 
```
```diff
@@ -32,6 +40,11 @@
 
         os.environ[PF_DISABLE_TRACING] = "true"
 
+        if isinstance(self._client, RunSubmitterClient):
+            ported_inject_openai_api()
+            # For addressing the issue of asyncio event loop closed on Windows
+            set_event_loop_policy()
+
     def __exit__(
         self,
         exc_type: Optional[Type[BaseException]],
```
```diff
@@ -44,3 +57,6 @@
         os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
 
         os.environ.pop(PF_DISABLE_TRACING, None)
+
+        if isinstance(self._client, RunSubmitterClient):
+            ported_recover_openai_api()
```
azure/ai/evaluation/_evaluate/_eval_run.py

```diff
@@ -295,7 +295,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         return f"https://{self._url_base}" "/mlflow/v2.0" f"{self._get_scope()}" f"/api/2.0/mlflow/runs/log-metric"
 
     def _get_token(self) -> str:
-        return self._management_client.get_token()
+        return self._management_client.get_token().token
 
     def request_with_retry(
         self, url: str, method: str, json_dict: Dict[str, Any], headers: Optional[Dict[str, str]] = None
```
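The single-line fix above matters because, assuming the management client surfaces an azure-core credential, `get_token()` returns an `AccessToken` named tuple rather than a string; the bearer value is its `.token` field. For example, with `azure-identity`:

```python
# Illustration with azure-identity: get_token returns AccessToken(token, expires_on),
# so the string for an Authorization header is access.token, not the tuple itself.
from azure.identity import DefaultAzureCredential

credential = DefaultAzureCredential()
access = credential.get_token("https://management.azure.com/.default")
headers = {"Authorization": f"Bearer {access.token}"}
print(type(access).__name__, access.expires_on > 0)
```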