azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of azure-ai-evaluation might be problematic.

Files changed (47)
  1. azure/ai/evaluation/_azure/__init__.py +3 -0
  2. azure/ai/evaluation/_azure/_clients.py +204 -0
  3. azure/ai/evaluation/_azure/_models.py +227 -0
  4. azure/ai/evaluation/_azure/_token_manager.py +118 -0
  5. azure/ai/evaluation/_common/rai_service.py +30 -21
  6. azure/ai/evaluation/_constants.py +19 -0
  7. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +2 -1
  8. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
  9. azure/ai/evaluation/_evaluate/_eval_run.py +16 -43
  10. azure/ai/evaluation/_evaluate/_evaluate.py +76 -44
  11. azure/ai/evaluation/_evaluate/_utils.py +93 -34
  12. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +46 -25
  13. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  14. azure/ai/evaluation/_evaluators/_common/_base_eval.py +140 -5
  15. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +61 -0
  16. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -1
  17. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +40 -2
  18. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  19. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +6 -43
  20. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +2 -0
  21. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +2 -0
  22. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +2 -0
  23. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +2 -0
  24. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +61 -68
  25. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +45 -23
  26. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +55 -34
  27. azure/ai/evaluation/_evaluators/_qa/_qa.py +32 -27
  28. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +44 -23
  29. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +42 -82
  30. azure/ai/evaluation/_http_utils.py +6 -4
  31. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  32. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  33. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  34. azure/ai/evaluation/_version.py +1 -1
  35. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  36. azure/ai/evaluation/simulator/_adversarial_simulator.py +35 -16
  37. azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
  38. azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
  39. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -0
  40. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +40 -0
  41. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
  42. azure/ai/evaluation/simulator/_simulator.py +24 -13
  43. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/METADATA +84 -15
  44. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/RECORD +47 -41
  45. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/NOTICE.txt +0 -0
  46. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/WHEEL +0 -0
  47. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,9 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
+ import enum
  from typing import Literal
+ from azure.ai.evaluation._common._experimental import experimental


  class EvaluationMetrics:
@@ -54,6 +56,23 @@ class EvaluationRunProperties:

  RUN_TYPE = "runType"
  EVALUATION_RUN = "_azureml.evaluation_run"
+ EVALUATION_SDK = "_azureml.evaluation_sdk_name"
+
+
+ @experimental
+ class _AggregationType(enum.Enum):
+ """Defines how numeric evaluation results should be aggregated
+ to produce a single value. Used by individual evaluators to combine per-turn results for
+ a conversation-based input. In general, wherever this enum is used, it is also possible
+ to directly assign the underlying aggregation function for more complex use cases.
+ The 'custom' value is generally not an acceptable input, and should only be used as an output
+ to indicate that a custom aggregation function has been injected."""
+
+ MEAN = "mean"
+ MAX = "max"
+ MIN = "min"
+ SUM = "sum"
+ CUSTOM = "custom"


  DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
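
The _AggregationType enum added above controls how per-turn numeric results from a conversation are collapsed into a single score. A minimal sketch of the intended semantics, assuming only what the hunk above defines (the aggregate helper and the sample scores are hypothetical, not part of the package):

    # Hypothetical helper mapping each _AggregationType value to a plain Python aggregator.
    from statistics import mean
    from typing import List

    from azure.ai.evaluation._constants import _AggregationType

    _AGGREGATORS = {
        _AggregationType.MEAN: mean,
        _AggregationType.MAX: max,
        _AggregationType.MIN: min,
        _AggregationType.SUM: sum,
    }

    def aggregate(per_turn_scores: List[float], how: _AggregationType) -> float:
        # CUSTOM is an output-only marker for an injected callable, per the docstring above.
        if how is _AggregationType.CUSTOM:
            raise ValueError("Pass the aggregation callable directly instead of CUSTOM.")
        return _AGGREGATORS[how](per_turn_scores)

    print(aggregate([3.0, 5.0, 4.0], _AggregationType.MEAN))  # 4.0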
@@ -5,5 +5,6 @@ from .eval_run_context import EvalRunContext
  from .code_client import CodeClient
  from .proxy_client import ProxyClient
  from .target_run_context import TargetRunContext
+ from .proxy_client import ProxyRun

- __all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext"]
+ __all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext", "ProxyRun"]
@@ -16,7 +16,7 @@ class TargetRunContext:
  :type upload_snapshot: bool
  """

- def __init__(self, upload_snapshot: bool) -> None:
+ def __init__(self, upload_snapshot: bool = False) -> None:
  self._upload_snapshot = upload_snapshot
  self._original_cwd = os.getcwd()

@@ -22,29 +22,12 @@ from azure.ai.evaluation._version import VERSION
  from azure.core.pipeline.policies import RetryPolicy
  from azure.core.rest import HttpResponse
  from azure.core.exceptions import HttpResponseError
+ from azure.storage.blob import BlobServiceClient
+ from azure.ai.evaluation._azure._clients import LiteMLClient

  LOGGER = logging.getLogger(__name__)


- # Handle optional import. The azure libraries are only present if
- # promptflow-azure is installed.
- try:
- from azure.ai.ml import MLClient
- from azure.ai.ml.entities._credentials import AccountKeyConfiguration # pylint: disable=ungrouped-imports
- from azure.ai.ml.entities._datastore.datastore import Datastore
- from azure.storage.blob import BlobServiceClient
- except (ModuleNotFoundError, ImportError):
- raise EvaluationException( # pylint: disable=raise-missing-from
- message=(
- "The required packages for remote tracking are missing.\n"
- 'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
- ),
- target=ErrorTarget.EVALUATE,
- category=ErrorCategory.MISSING_PACKAGE,
- blame=ErrorBlame.USER_ERROR,
- )
-
-
  @dataclasses.dataclass
  class RunInfo:
  """
@@ -93,15 +76,16 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  :type group_name: str
  :param workspace_name: The name of workspace/project used to track run.
  :type workspace_name: str
- :param ml_client: The ml client used for authentication into Azure.
- :type ml_client: azure.ai.ml.MLClient
+ :param management_client: The trace destination string to parse the AI ML workspace blob store from.
+ :type management_client:
+ ~azure.ai.evaluation._promptflow.azure._lite_azure_management_client.LiteMLClient
  :param promptflow_run: The promptflow run used by the
+ :type promptflow_run: Optional[promptflow._sdk.entities.Run]
  """

  _MAX_RETRIES = 5
  _BACKOFF_FACTOR = 2
  _TIMEOUT = 5
- _SCOPE = "https://management.azure.com/.default"

  EVALUATION_ARTIFACT = "instance_results.jsonl"

@@ -112,14 +96,14 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  subscription_id: str,
  group_name: str,
  workspace_name: str,
- ml_client: "MLClient",
+ management_client: LiteMLClient,
  promptflow_run: Optional[Run] = None,
  ) -> None:
  self._tracking_uri: str = tracking_uri
  self._subscription_id: str = subscription_id
  self._resource_group_name: str = group_name
  self._workspace_name: str = workspace_name
- self._ml_client: Any = ml_client
+ self._management_client: LiteMLClient = management_client
  self._is_promptflow_run: bool = promptflow_run is not None
  self._run_name = run_name
  self._promptflow_run = promptflow_run
@@ -184,7 +168,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  if self._promptflow_run is not None:
  self._info = RunInfo(
  self._promptflow_run.name,
- self._promptflow_run._experiment_name, # pylint: disable=protected-access
+ self._promptflow_run._experiment_name or "", # pylint: disable=protected-access
  self._promptflow_run.name,
  )
  else:
@@ -310,12 +294,8 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  """
  return f"https://{self._url_base}" "/mlflow/v2.0" f"{self._get_scope()}" f"/api/2.0/mlflow/runs/log-metric"

- def _get_token(self):
- # We have to use lazy import because promptflow.azure
- # is an optional dependency.
- from promptflow.azure._utils._token_cache import ArmTokenCache # pylint: disable=import-error,no-name-in-module
-
- return ArmTokenCache().get_token(self._ml_client._credential) # pylint: disable=protected-access
+ def _get_token(self) -> str:
+ return self._management_client.get_token()

  def request_with_retry(
  self, url: str, method: str, json_dict: Dict[str, Any], headers: Optional[Dict[str, str]] = None
@@ -441,9 +421,12 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  local_paths.append(local_file_path)

  # We will write the artifacts to the workspaceblobstore
- datastore = self._ml_client.datastores.get_default(include_secrets=True)
+ datastore = self._management_client.workspace_get_default_datastore(
+ self._workspace_name, include_credentials=True
+ )
  account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"
- svc_client = BlobServiceClient(account_url=account_url, credential=self._get_datastore_credential(datastore))
+
+ svc_client = BlobServiceClient(account_url=account_url, credential=datastore.credential)
  try:
  for local, remote in zip(local_paths, remote_paths["paths"]):
  blob_client = svc_client.get_blob_client(container=datastore.container_name, blob=remote["path"])
@@ -515,16 +498,6 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  except Exception as ex: # pylint: disable=broad-exception-caught
  LOGGER.debug("Exception occurred while registering image artifact. ex: %s", ex)

- def _get_datastore_credential(self, datastore: "Datastore"):
- # Reference the logic in azure.ai.ml._artifact._artifact_utilities
- # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103
- credential = datastore.credentials
- if isinstance(credential, AccountKeyConfiguration):
- return credential.account_key
- if hasattr(credential, "sas_token"):
- return credential.sas_token
- return self._ml_client.datastores._credential # pylint: disable=protected-access
-
  def log_metric(self, key: str, value: float) -> None:
  """
  Log the metric to azure similar to how it is done by mlflow.
@@ -10,9 +10,9 @@ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, T

  import pandas as pd
  from promptflow._sdk._constants import LINE_NUMBER
- from promptflow._sdk._errors import UserAuthenticationError, UploadInternalError
  from promptflow.client import PFClient
  from promptflow.entities import Run
+ from promptflow._sdk._configuration import Configuration

  from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
  from azure.ai.evaluation._common.utils import validate_azure_ai_project
@@ -21,18 +21,19 @@ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarg
  from .._constants import (
  CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
  EvaluationMetrics,
- EvaluationRunProperties,
+ DefaultOpenEncoding,
  Prefixes,
  _InternalEvaluationMetrics,
  )
  from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
  from .._user_agent import USER_AGENT
- from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
+ from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext, ProxyRun
  from ._utils import (
  _apply_column_mapping,
  _log_metrics_and_instance_results,
  _trace_destination_from_project_scope,
  _write_output,
+ DataLoaderFactory,
  )

  TClient = TypeVar("TClient", ProxyClient, CodeClient)
@@ -431,10 +432,11 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
  )

  try:
- initial_data_df = pd.read_json(data, lines=True)
+ data_loader = DataLoaderFactory.get_loader(data)
+ initial_data_df = data_loader.load()
  except Exception as e:
  raise EvaluationException(
- message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
+ message=f"Unable to load data from '{data}'. Supported formats are JSONL and CSV. Detailed error: {e}.",
  target=ErrorTarget.EVALUATE,
  category=ErrorCategory.INVALID_VALUE,
  blame=ErrorBlame.USER_ERROR,
@@ -446,7 +448,7 @@
  def _apply_target_to_data(
  target: Callable,
  data: Union[str, os.PathLike],
- pf_client: PFClient,
+ batch_client: TClient,
  initial_data: pd.DataFrame,
  evaluation_name: Optional[str] = None,
  **kwargs,
@@ -456,10 +458,10 @@ def _apply_target_to_data(

  :param target: The function to be applied to data.
  :type target: Callable
- :param data: The path to input jsonl file.
+ :param data: The path to input jsonl or csv file.
  :type data: Union[str, os.PathLike]
- :param pf_client: The promptflow client to be used.
- :type pf_client: PFClient
+ :param batch_client: The promptflow client to be used.
+ :type batch_client: PFClient
  :param initial_data: The data frame with the loaded data.
  :type initial_data: pd.DataFrame
  :param evaluation_name: The name of the evaluation.
@@ -468,35 +470,27 @@ def _apply_target_to_data(
  :rtype: Tuple[pandas.DataFrame, List[str]]
  """
  _run_name = kwargs.get("_run_name")
- upload_target_snaphot = kwargs.get("_upload_target_snapshot", False)
-
- try:
- with TargetRunContext(upload_target_snaphot):
- run: Run = pf_client.run(
- flow=target,
- display_name=evaluation_name,
- data=data,
- properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
- stream=True,
- name=_run_name,
- )
- except (UserAuthenticationError, UploadInternalError) as ex:
- if "Failed to upload run" in ex.message:
- msg = (
- "Failed to upload the target run to the cloud. "
- "This may be caused by insufficient permission to access storage or other errors."
- )
- raise EvaluationException(
- message=msg,
- target=ErrorTarget.EVALUATE,
- category=ErrorCategory.FAILED_REMOTE_TRACKING,
- blame=ErrorBlame.USER_ERROR,
- tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
- ) from ex
+ with TargetRunContext():
+ run: ProxyRun = batch_client.run(
+ flow=target,
+ display_name=evaluation_name,
+ data=data,
+ stream=True,
+ name=_run_name,
+ )

- raise ex
+ target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+ run_summary = batch_client.get_run_summary(run)

- target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
+ if run_summary["completed_lines"] == 0:
+ msg = (f"Evaluation target failed to produce any results."
+ f" Please check the logs at {run_summary['log_path']} for more details about cause of failure.")
+ raise EvaluationException(
+ message=msg,
+ target=ErrorTarget.EVALUATE,
+ category=ErrorCategory.FAILED_EXECUTION,
+ blame=ErrorBlame.USER_ERROR,
+ )
  # Remove input and output prefix
  generated_columns = {
  col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -515,7 +509,7 @@ def _apply_target_to_data(
  # Concatenate output to input
  target_output = pd.concat([target_output, initial_data], axis=1)

- return target_output, generated_columns, run
+ return target_output, generated_columns, run.run.result()


  def _process_column_mappings(
@@ -590,13 +584,14 @@ def evaluate(
  evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
  azure_ai_project: Optional[AzureAIProject] = None,
  output_path: Optional[Union[str, os.PathLike]] = None,
+ fail_on_evaluator_errors: bool = False,
  **kwargs,
  ) -> EvaluationResult:
  """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
  data will be run through target function and then results will be evaluated.

  :keyword data: Path to the data to be evaluated or passed to target if target is set.
- Only .jsonl format files are supported. `target` and `data` both cannot be None. Required.
+ JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
  :paramtype data: str
  :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
  and value as the evaluator function. Required.
@@ -615,6 +610,11 @@
  :paramtype output_path: Optional[str]
  :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
  :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
+ :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
+ if ANY evaluator fails during their evaluation.
+ Defaults to false, which means that evaluations will continue regardless of failures.
+ If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
+ :paramtype fail_on_evaluator_errors: bool
  :return: Evaluation results.
  :rtype: ~azure.ai.evaluation.EvaluationResult

@@ -636,6 +636,7 @@
  evaluator_config=evaluator_config,
  azure_ai_project=azure_ai_project,
  output_path=output_path,
+ fail_on_evaluator_errors=fail_on_evaluator_errors,
  **kwargs,
  )
  except Exception as e:
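
With the new keyword plumbed through as shown above, a call to the public evaluate() function can now point at a CSV file and opt into early cancellation on evaluator failures. A hedged usage sketch (the file name, its column layout, and the evaluator alias are illustrative assumptions; only the evaluate signature and F1ScoreEvaluator come from the package):

    # Illustrative call against the 1.2.0 API; "eval_data.csv" and its columns are assumed to exist.
    from azure.ai.evaluation import evaluate, F1ScoreEvaluator

    result = evaluate(
        data="eval_data.csv",                  # CSV is now accepted in addition to JSONL
        evaluators={"f1": F1ScoreEvaluator()},
        fail_on_evaluator_errors=True,         # new in 1.2.0: raise instead of silently continuing
        output_path="evaluation_results.json",
    )
    print(result["metrics"])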
@@ -684,6 +685,16 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
  print("\n====================================================\n")


+ def _print_fail_flag_warning() -> None:
+ print(
+ "Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable "
+ + "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, "
+ + "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing "
+ + "without producing any outputs, since a single failure will cancel the entire run "
+ "when fail_on_evaluator_errors is enabled."
+ )
+
+
  def _evaluate( # pylint: disable=too-many-locals,too-many-statements
  *,
  evaluators: Dict[str, Callable],
@@ -693,8 +704,11 @@
  evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
  azure_ai_project: Optional[AzureAIProject] = None,
  output_path: Optional[Union[str, os.PathLike]] = None,
+ fail_on_evaluator_errors: bool = False,
  **kwargs,
  ) -> EvaluationResult:
+ if fail_on_evaluator_errors:
+ _print_fail_flag_warning()
  input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

  # Process evaluator config to replace ${target.} with ${data.}
@@ -711,6 +725,7 @@
  if target is not None:
  _validate_columns_for_target(input_data_df, target)

+ Configuration.get_instance().set_config("trace.destination", "none")
  pf_client = PFClient(user_agent=USER_AGENT)
  target_run: Optional[Run] = None

@@ -723,7 +738,7 @@
  target_generated_columns: Set[str] = set()
  if data is not None and target is not None:
  input_data_df, target_generated_columns, target_run = _apply_target_to_data(
- target, data, pf_client, input_data_df, evaluation_name, **kwargs
+ target, data, ProxyClient(pf_client), input_data_df, evaluation_name, **kwargs
  )

  for evaluator_name, mapping in column_mapping.items():
@@ -794,6 +809,10 @@
  evaluators_result_df = None
  evaluators_metric = {}
  for evaluator_name, evaluator_result in per_evaluator_results.items():
+ if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
+ _print_summary(per_evaluator_results)
+ _turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json")
+
  evaluator_result_df = evaluator_result["result"]

  # drop input columns
@@ -834,11 +853,7 @@
  studio_url = None
  if trace_destination:
  studio_url = _log_metrics_and_instance_results(
- metrics,
- result_df,
- trace_destination,
- target_run,
- evaluation_name,
+ metrics, result_df, trace_destination, target_run, evaluation_name, **kwargs
  )

  result_df_dict = result_df.to_dict("records")
@@ -850,3 +865,20 @@
  _write_output(output_path, result)

  return result
+
+
+ def _turn_error_logs_into_exception(log_path: str) -> None:
+ """Produce an EvaluationException using the contents of the inputted
+ file as the error message.
+
+ :param log_path: The path to the error log file.
+ :type log_path: str
+ """
+ with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file:
+ error_message = file.read()
+ raise EvaluationException(
+ message=error_message,
+ target=ErrorTarget.EVALUATE,
+ category=ErrorCategory.FAILED_EXECUTION,
+ blame=ErrorBlame.UNKNOWN,
+ )
@@ -7,12 +7,11 @@ import os
  import re
  import tempfile
  from pathlib import Path
- from typing import Any, Dict, NamedTuple, Optional, Tuple, Union
+ from typing import Any, Dict, NamedTuple, Optional, Union, cast
  import uuid
  import base64

  import pandas as pd
- from promptflow.client import PFClient
  from promptflow.entities import Run

  from azure.ai.evaluation._constants import (
@@ -23,6 +22,8 @@ from azure.ai.evaluation._constants import (
  )
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._model_configurations import AzureAIProject
+ from azure.ai.evaluation._version import VERSION
+ from azure.ai.evaluation._azure._clients import LiteMLClient

  LOGGER = logging.getLogger(__name__)

@@ -45,6 +46,8 @@ def is_none(value) -> bool:
  def extract_workspace_triad_from_trace_provider( # pylint: disable=name-too-long
  trace_provider: str,
  ) -> AzureMLWorkspace:
+ from promptflow._cli._utils import get_workspace_triad_from_local
+
  match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
  if not match or len(match.groups()) != 5:
  raise EvaluationException(
@@ -58,10 +61,20 @@
  category=ErrorCategory.INVALID_VALUE,
  blame=ErrorBlame.UNKNOWN,
  )
+
  subscription_id = match.group(1)
  resource_group_name = match.group(3)
  workspace_name = match.group(5)
- return AzureMLWorkspace(subscription_id, resource_group_name, workspace_name)
+
+ # In theory this if statement should never evaluate to True, but we'll keep it here just in case
+ # for backwards compatibility with what the original code that depended on promptflow-azure did
+ if not (subscription_id and resource_group_name and workspace_name):
+ local = get_workspace_triad_from_local()
+ subscription_id = subscription_id or local.subscription_id or os.getenv("AZUREML_ARM_SUBSCRIPTION")
+ resource_group_name = resource_group_name or local.resource_group_name or os.getenv("AZUREML_ARM_RESOURCEGROUP")
+ workspace_name = workspace_name or local.workspace_name or os.getenv("AZUREML_ARM_WORKSPACE_NAME")
+
+ return AzureMLWorkspace(subscription_id or "", resource_group_name or "", workspace_name or "")


  def load_jsonl(path):
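
For context, the trace_provider string parsed above is the AzureML workspace trace destination. A sketch of its assumed shape (the exact pattern lives in AZURE_WORKSPACE_REGEX_FORMAT, which is not part of this diff, so treat the placeholder string below as illustrative):

    # Assumed shape of a trace destination string; the placeholders are not real resource names.
    trace_destination = (
        "azureml://subscriptions/<subscription-id>/resourceGroups/<resource-group>"
        "/providers/Microsoft.MachineLearningServices/workspaces/<workspace-name>"
    )
    # Per the hunk above, match.group(1)/(3)/(5) become the AzureMLWorkspace triad of
    # subscription_id, resource_group_name, and workspace_name.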
@@ -69,19 +82,6 @@ def load_jsonl(path):
  return [json.loads(line) for line in f.readlines()]


- def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
- from promptflow.azure._cli._utils import _get_azure_pf_client
-
- ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
- azure_pf_client = _get_azure_pf_client(
- subscription_id=ws_triad.subscription_id,
- resource_group=ws_triad.resource_group_name,
- workspace_name=ws_triad.workspace_name,
- )
-
- return azure_pf_client, ws_triad
-
-
  def _store_multimodal_content(messages, tmpdir: str):
  # verify if images folder exists
  images_folder_path = os.path.join(tmpdir, "images")
@@ -91,23 +91,40 @@ def _store_multimodal_content(messages, tmpdir: str):
  for message in messages:
  if isinstance(message.get("content", []), list):
  for content in message.get("content", []):
- if content.get("type") == "image_url":
- image_url = content.get("image_url")
- if image_url and "url" in image_url and image_url["url"].startswith("data:image/jpg;base64,"):
- # Extract the base64 string
- base64image = image_url["url"].replace("data:image/jpg;base64,", "")
+ process_message_content(content, images_folder_path)
+
+
+ def process_message_content(content, images_folder_path):
+ if content.get("type", "") == "image_url":
+ image_url = content.get("image_url")
+
+ if not image_url or "url" not in image_url:
+ return None
+
+ url = image_url["url"]
+ if not url.startswith("data:image/"):
+ return None
+
+ match = re.search("data:image/([^;]+);", url)
+ if not match:
+ return None

- # Generate a unique filename
- image_file_name = f"{str(uuid.uuid4())}.jpg"
- image_url["url"] = f"images/{image_file_name}" # Replace the base64 URL with the file path
+ ext = match.group(1)
+ # Extract the base64 string
+ base64image = image_url["url"].replace(f"data:image/{ext};base64,", "")

- # Decode the base64 string to binary image data
- image_data_binary = base64.b64decode(base64image)
+ # Generate a unique filename
+ image_file_name = f"{str(uuid.uuid4())}.{ext}"
+ image_url["url"] = f"images/{image_file_name}" # Replace the base64 URL with the file path

- # Write the binary image data to the file
- image_file_path = os.path.join(images_folder_path, image_file_name)
- with open(image_file_path, "wb") as f:
- f.write(image_data_binary)
+ # Decode the base64 string to binary image data
+ image_data_binary = base64.b64decode(base64image)
+
+ # Write the binary image data to the file
+ image_file_path = os.path.join(images_folder_path, image_file_name)
+ with open(image_file_path, "wb") as f:
+ f.write(image_data_binary)
+ return None


  def _log_metrics_and_instance_results(
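
The refactor above generalizes the old JPEG-only handling: process_message_content now accepts any data:image/<ext> payload and rewrites the message content in place. A small hedged sketch of the effect on one content entry (the temp folder and the truncated base64 payload are placeholders):

    # Illustrative call; the payload is a truncated placeholder, not a real image.
    import os
    import tempfile

    from azure.ai.evaluation._evaluate._utils import process_message_content

    images_dir = os.path.join(tempfile.mkdtemp(), "images")
    os.makedirs(images_dir, exist_ok=True)

    content = {
        "type": "image_url",
        "image_url": {"url": "data:image/png;base64,iVBORw0KGgo="},
    }
    process_message_content(content, images_dir)

    # The data URL has been replaced with a relative path like "images/<uuid>.png",
    # and the decoded bytes were written under images_dir.
    print(content["image_url"]["url"])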
@@ -116,6 +133,7 @@
  trace_destination: Optional[str],
  run: Run,
  evaluation_name: Optional[str],
+ **kwargs,
  ) -> Optional[str]:
  from azure.ai.evaluation._evaluate._eval_run import EvalRun

@@ -123,19 +141,26 @@
  LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
  return None

- azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
- tracking_uri = azure_pf_client.ml_client.workspaces.get(ws_triad.workspace_name).mlflow_tracking_uri
+ ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+ management_client = LiteMLClient(
+ subscription_id=ws_triad.subscription_id,
+ resource_group=ws_triad.resource_group_name,
+ logger=LOGGER,
+ credential=kwargs.get("credential"),
+ # let the client automatically determine the credentials to use
+ )
+ tracking_uri = management_client.workspace_get_info(ws_triad.workspace_name).ml_flow_tracking_uri

  # Adding line_number as index column this is needed by UI to form link to individual instance run
  instance_results["line_number"] = instance_results.index.values

  with EvalRun(
  run_name=run.name if run is not None else evaluation_name,
- tracking_uri=tracking_uri,
+ tracking_uri=cast(str, tracking_uri),
  subscription_id=ws_triad.subscription_id,
  group_name=ws_triad.resource_group_name,
  workspace_name=ws_triad.workspace_name,
- ml_client=azure_pf_client.ml_client,
+ management_client=management_client,
  promptflow_run=run,
  ) as ev_run:
  artifact_name = EvalRun.EVALUATION_ARTIFACT
@@ -166,9 +191,16 @@
  properties={
  EvaluationRunProperties.RUN_TYPE: "eval_run",
  EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+ EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
  "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
  }
  )
+ else:
+ ev_run.write_properties_to_run_history(
+ properties={
+ EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+ }
+ )

  for metric_name, metric_value in metrics.items():
  ev_run.log_metric(metric_name, metric_value)
@@ -296,3 +328,30 @@ def set_event_loop_policy() -> None:
  # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
  # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
  asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # type: ignore[attr-defined]
+
+
+ class JSONLDataFileLoader:
+ def __init__(self, filename: Union[os.PathLike, str]):
+ self.filename = filename
+
+ def load(self) -> pd.DataFrame:
+ return pd.read_json(self.filename, lines=True)
+
+
+ class CSVDataFileLoader:
+ def __init__(self, filename: Union[os.PathLike, str]):
+ self.filename = filename
+
+ def load(self) -> pd.DataFrame:
+ return pd.read_csv(self.filename)
+
+
+ class DataLoaderFactory:
+ @staticmethod
+ def get_loader(filename: Union[os.PathLike, str]) -> Union[JSONLDataFileLoader, CSVDataFileLoader]:
+ filename_str = str(filename).lower()
+ if filename_str.endswith(".csv"):
+ return CSVDataFileLoader(filename)
+
+ # fallback to JSONL to maintain backward compatibility
+ return JSONLDataFileLoader(filename)
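
A short sketch of how the new loader factory behaves (the file names are illustrative; selection is purely by file extension, with JSONL as the backward-compatible fallback):

    # Illustrative only: extension-based selection, mirroring the factory defined above.
    from azure.ai.evaluation._evaluate._utils import DataLoaderFactory

    csv_loader = DataLoaderFactory.get_loader("eval_data.csv")      # CSVDataFileLoader
    jsonl_loader = DataLoaderFactory.get_loader("eval_data.jsonl")  # JSONLDataFileLoader (fallback)

    df = csv_loader.load()  # returns a pandas DataFrame; the file must exist on disk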