azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (72)
  1. azure/ai/evaluation/_azure/__init__.py +3 -0
  2. azure/ai/evaluation/_azure/_clients.py +188 -0
  3. azure/ai/evaluation/_azure/_models.py +227 -0
  4. azure/ai/evaluation/_azure/_token_manager.py +118 -0
  5. azure/ai/evaluation/_common/_experimental.py +4 -0
  6. azure/ai/evaluation/_common/math.py +62 -2
  7. azure/ai/evaluation/_common/rai_service.py +110 -50
  8. azure/ai/evaluation/_common/utils.py +50 -16
  9. azure/ai/evaluation/_constants.py +2 -0
  10. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
  11. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
  12. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +12 -1
  13. azure/ai/evaluation/_evaluate/_eval_run.py +38 -43
  14. azure/ai/evaluation/_evaluate/_evaluate.py +62 -131
  15. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
  16. azure/ai/evaluation/_evaluate/_utils.py +72 -38
  17. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
  18. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
  19. azure/ai/evaluation/_evaluators/_common/_base_eval.py +88 -6
  20. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +16 -3
  21. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +39 -10
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +58 -52
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
  28. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
  29. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
  30. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
  32. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
  33. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
  34. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
  35. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
  36. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
  37. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
  38. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
  39. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
  40. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
  43. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
  45. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
  46. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
  48. azure/ai/evaluation/_exceptions.py +2 -0
  49. azure/ai/evaluation/_http_utils.py +6 -4
  50. azure/ai/evaluation/_model_configurations.py +65 -14
  51. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  52. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  53. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  54. azure/ai/evaluation/_version.py +1 -1
  55. azure/ai/evaluation/simulator/_adversarial_scenario.py +17 -1
  56. azure/ai/evaluation/simulator/_adversarial_simulator.py +57 -47
  57. azure/ai/evaluation/simulator/_constants.py +11 -1
  58. azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
  59. azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
  60. azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
  61. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +12 -1
  62. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
  63. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +48 -4
  64. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
  65. azure/ai/evaluation/simulator/_simulator.py +54 -45
  66. azure/ai/evaluation/simulator/_utils.py +25 -7
  67. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/METADATA +240 -327
  68. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/RECORD +71 -68
  69. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
  70. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/NOTICE.txt +0 -0
  71. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/WHEEL +0 -0
  72. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/top_level.txt +0 -0
@@ -3,24 +3,23 @@
  # ---------------------------------------------------------
  import inspect
  import json
+ import logging
  import os
  import re
  from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union

  import pandas as pd
  from promptflow._sdk._constants import LINE_NUMBER
- from promptflow._sdk._errors import MissingAzurePackage, UserAuthenticationError, UploadInternalError
  from promptflow.client import PFClient
  from promptflow.entities import Run

- from azure.ai.evaluation._common.math import list_sum
+ from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
  from azure.ai.evaluation._common.utils import validate_azure_ai_project
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

  from .._constants import (
  CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
  EvaluationMetrics,
- EvaluationRunProperties,
  Prefixes,
  _InternalEvaluationMetrics,
  )
@@ -35,6 +34,7 @@ from ._utils import (
  )

  TClient = TypeVar("TClient", ProxyClient, CodeClient)
+ LOGGER = logging.getLogger(__name__)

  # For metrics (aggregates) whose metric names intentionally differ from their
  # originating column name, usually because the aggregation of the original value
@@ -69,10 +69,11 @@ def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, flo
  renamed_cols.append(col)
  new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
  col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
- metric_columns[new_col_name] = round(
- list_sum(col_with_numeric_values) / col_with_numeric_values.count(),
- 2,
- )
+ try:
+ metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
+ except EvaluationException: # only exception that can be cause is all NaN values
+ msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+ LOGGER.warning(msg)

  return renamed_cols, metric_columns

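For readers unfamiliar with the new helper, here is a minimal sketch of what the NaN-safe mean used above is expected to do, assuming `list_mean_nan_safe` averages the non-NaN values and raises when every value is NaN (the helper lives in `azure/ai/evaluation/_common/math.py`, whose implementation is not shown in this diff):

    import math
    import pandas as pd

    def mean_nan_safe_sketch(values):
        # Coerce to numeric the same way the aggregation code does, then skip NaN entries.
        numeric = pd.to_numeric(pd.Series(values), errors="coerce")
        kept = [v for v in numeric if not math.isnan(v)]
        if not kept:
            # The SDK raises EvaluationException here; ValueError stands in for this sketch.
            raise ValueError("All score evaluations are NaN/None. No aggregation can be performed.")
        return sum(kept) / len(kept)

    print(round(mean_nan_safe_sketch([4, 5, None, 3]), 2))  # 4.0 - the None entry is ignored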
@@ -119,11 +120,15 @@ def _aggregate_content_safety_metrics(
  for col in content_safety_df.columns:
  defect_rate_name = col.replace("_score", "_defect_rate")
  col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
- defect_rates[defect_rate_name] = round(
- list_sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
- / col_with_numeric_values.count(),
- 2,
- )
+ try:
+ col_with_boolean_values = apply_transform_nan_safe(
+ col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
+ )
+ defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+ except EvaluationException: # only exception that can be cause is all NaN values
+ msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+ LOGGER.warning(msg)
+
  return content_safety_cols, defect_rates


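As a quick illustration of the new defect-rate computation: scores at or above the threshold count as defects, NaN rows are skipped, and an all-NaN column is now reported with a warning instead of producing an error or a meaningless value. The threshold value below is a stand-in; the SDK reads `CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT` from `_constants.py`, which this diff does not show.

    import pandas as pd

    THRESHOLD = 4  # stand-in for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT

    scores = pd.to_numeric(pd.Series([0, 7, None, 5]), errors="coerce")
    flags = scores.dropna().apply(lambda x: 1 if x >= THRESHOLD else 0)
    print(round(flags.mean(), 2))  # 0.67 -> two of the three scored rows are defects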
@@ -153,10 +158,11 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
  for col in label_df.columns:
  defect_rate_name = col.replace("_label", "_defect_rate")
  col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
- defect_rates[defect_rate_name] = round(
- list_sum(col_with_boolean_values) / col_with_boolean_values.count(),
- 2,
- )
+ try:
+ defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+ except EvaluationException: # only exception that can be cause is all NaN values
+ msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+ LOGGER.warning(msg)
  return label_cols, defect_rates


@@ -193,6 +199,9 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
  # For rest of metrics, we will calculate mean
  df.drop(columns=handled_columns, inplace=True)

+ # NOTE: nan/None values don't count as as booleans, so boolean columns with
+ # nan/None values won't have a mean produced from them.
+ # This is different from label-based known evaluators, which have special handling.
  mean_value = df.mean(numeric_only=True)
  metrics = mean_value.to_dict()
  # Add defect rates back into metrics
@@ -287,7 +296,13 @@ def _validate_columns_for_evaluators(
  # Ignore the missing fields if "conversation" presents in the input data
  missing_inputs = []
  else:
- missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+ optional_params = (
+ evaluator._OPTIONAL_PARAMS # pylint: disable=protected-access
+ if hasattr(evaluator, "_OPTIONAL_PARAMS")
+ else []
+ )
+ excluded_params = set(new_df.columns).union(optional_params)
+ missing_inputs = [col for col in evaluator_params if col not in excluded_params]

  # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
  # Otherwise, remove it from the missing inputs
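In effect, an evaluator input is now only reported as missing when it is absent from the data and not declared optional by the evaluator. A tiny self-contained illustration (the parameter names below are hypothetical):

    evaluator_params = ["query", "response", "ground_truth"]  # hypothetical evaluator signature
    data_columns = {"query", "response"}                      # columns present in the input data
    optional_params = ["ground_truth"]                        # what an evaluator's _OPTIONAL_PARAMS might declare

    excluded_params = set(data_columns).union(optional_params)
    missing_inputs = [col for col in evaluator_params if col not in excluded_params]
    print(missing_inputs)  # [] -> the optional ground_truth no longer triggers a "missing input" error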
@@ -391,7 +406,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
  )

  output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
- if not os.path.exists(output_dir):
+ if output_dir and not os.path.exists(output_dir):
  msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
  raise EvaluationException(
  message=msg,
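The added `output_dir and` guard matters because `os.path.dirname()` returns an empty string for a bare file name, and an empty string can never "exist". A short runnable check of the behavior:

    import os

    for output_path in ("results.json", "outputs/results.json"):
        output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
        would_raise = bool(output_dir and not os.path.exists(output_dir))
        print(repr(output_path), "->", repr(output_dir), "raises:", would_raise)
    # "results.json" yields output_dir == "" and no longer raises;
    # "outputs/results.json" still raises if the outputs/ directory is missing.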
@@ -451,33 +466,14 @@ def _apply_target_to_data(
  :rtype: Tuple[pandas.DataFrame, List[str]]
  """
  _run_name = kwargs.get("_run_name")
- upload_target_snaphot = kwargs.get("_upload_target_snapshot", False)
-
- try:
- with TargetRunContext(upload_target_snaphot):
- run: Run = pf_client.run(
- flow=target,
- display_name=evaluation_name,
- data=data,
- properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
- stream=True,
- name=_run_name,
- )
- except (UserAuthenticationError, UploadInternalError) as ex:
- if "Failed to upload run" in ex.message:
- msg = (
- "Failed to upload the target run to the cloud. "
- "This may be caused by insufficient permission to access storage or other errors."
- )
- raise EvaluationException(
- message=msg,
- target=ErrorTarget.EVALUATE,
- category=ErrorCategory.FAILED_REMOTE_TRACKING,
- blame=ErrorBlame.USER_ERROR,
- tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
- ) from ex
-
- raise ex
+ with TargetRunContext():
+ run: Run = pf_client.run(
+ flow=target,
+ display_name=evaluation_name,
+ data=data,
+ stream=True,
+ name=_run_name,
+ )

  target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
  # Remove input and output prefix
@@ -601,48 +597,14 @@ def evaluate(
  :return: Evaluation results.
  :rtype: ~azure.ai.evaluation.EvaluationResult

- :Example:
-
- Evaluate API can be used as follows:
-
- .. code-block:: python
-
- from azure.ai.evaluation import evaluate, RelevanceEvaluator, CoherenceEvaluator
-
-
- model_config = {
- "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
- "api_key": os.environ.get("AZURE_OPENAI_KEY"),
- "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
- }
-
- coherence_eval = CoherenceEvaluator(model_config=model_config)
- relevance_eval = RelevanceEvaluator(model_config=model_config)
-
- path = "evaluate_test_data.jsonl"
- result = evaluate(
- data=path,
- evaluators={
- "coherence": coherence_eval,
- "relevance": relevance_eval,
- },
- evaluator_config={
- "coherence": {
- "column_mapping": {
- "response": "${data.response}",
- "query": "${data.query}",
- },
- },
- "relevance": {
- "column_mapping": {
- "response": "${data.response}",
- "context": "${data.context}",
- "query": "${data.query}",
- },
- },
- },
- )
+ .. admonition:: Example:

+ .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+ :start-after: [START evaluate_method]
+ :end-before: [END evaluate_method]
+ :language: python
+ :dedent: 8
+ :caption: Run an evaluation on local data with Coherence and Relevance evaluators.
  """
  try:
  return _evaluate(
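The inline docstring example above was moved out to the samples file referenced by the `literalinclude` directive. For readers of this diff, the removed snippet boils down to the following call pattern (environment variable names as in the removed example):

    import os
    from azure.ai.evaluation import evaluate, RelevanceEvaluator, CoherenceEvaluator

    model_config = {
        "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.environ.get("AZURE_OPENAI_KEY"),
        "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    }

    coherence_eval = CoherenceEvaluator(model_config=model_config)
    relevance_eval = RelevanceEvaluator(model_config=model_config)

    result = evaluate(
        data="evaluate_test_data.jsonl",
        evaluators={"coherence": coherence_eval, "relevance": relevance_eval},
        evaluator_config={
            "coherence": {"column_mapping": {"response": "${data.response}", "query": "${data.query}"}},
            "relevance": {
                "column_mapping": {
                    "response": "${data.response}",
                    "context": "${data.context}",
                    "query": "${data.query}",
                },
            },
        },
    )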
@@ -698,7 +660,7 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
  if output_dict:
  print("======= Combined Run Summary (Per Evaluator) =======\n")
  print(json.dumps(output_dict, indent=4))
- print("\n====================================================")
+ print("\n====================================================\n")


  def _evaluate( # pylint: disable=too-many-locals,too-many-statements
@@ -728,36 +690,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
  if target is not None:
  _validate_columns_for_target(input_data_df, target)

- # Target Run
- try:
- pf_client = PFClient(
- config=(
- {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
- if azure_ai_project
- else None
- ),
- user_agent=USER_AGENT,
- )
- # pylint: disable=raise-missing-from
- except MissingAzurePackage:
- msg = (
- "The required packages for remote tracking are missing.\n"
- 'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
- )
-
- raise EvaluationException( # pylint: disable=raise-missing-from
- message=msg,
- target=ErrorTarget.EVALUATE,
- category=ErrorCategory.MISSING_PACKAGE,
- blame=ErrorBlame.USER_ERROR,
- )
-
- trace_destination: Optional[str] = pf_client._config.get_trace_destination() # pylint: disable=protected-access
-
- # Handle the case where the customer manually run "pf config set trace.destination=none"
- if trace_destination and trace_destination.lower() == "none":
- trace_destination = None
-
+ pf_client = PFClient(user_agent=USER_AGENT)
  target_run: Optional[Run] = None

  # Create default configuration for evaluators that directly maps
@@ -831,11 +764,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
  # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
  # multiple evaluators. If the path is already absolute, abspath will return the original path.
  data = os.path.abspath(data)
-
- # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
- # The root cause is still unclear, but it seems related to a conflict between the async run uploader
- # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
- per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
+ per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
  else:
  data = input_data_df
  per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
@@ -877,20 +806,22 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
  result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
  metrics = _aggregate_metrics(evaluators_result_df, evaluators)
  metrics.update(evaluators_metric)
- studio_url = _log_metrics_and_instance_results(
- metrics,
- result_df,
- trace_destination,
- target_run,
- evaluation_name,
- )
+
+ # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+ target_run = None
+ trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
+ studio_url = None
+ if trace_destination:
+ studio_url = _log_metrics_and_instance_results(
+ metrics, result_df, trace_destination, target_run, evaluation_name, **kwargs
+ )

  result_df_dict = result_df.to_dict("records")
  result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url} # type: ignore

+ _print_summary(per_evaluator_results)
+

  if output_path:
  _write_output(output_path, result)
- _print_summary(per_evaluator_results)
-
  return result
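With remote tracking now derived from the project scope rather than the promptflow configuration, results still reach AI Studio when `azure_ai_project` is supplied, and a `credential` keyword is forwarded through `**kwargs` to the new `LiteMLClient`. A hedged sketch of that call, reusing `coherence_eval` and `relevance_eval` from the previous snippet (the placeholder values are not real resources, and passing `credential` this way is inferred from the `kwargs.get("credential")` call shown later in this diff):

    from azure.identity import DefaultAzureCredential

    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }

    result = evaluate(
        data="evaluate_test_data.jsonl",
        evaluators={"coherence": coherence_eval, "relevance": relevance_eval},
        azure_ai_project=azure_ai_project,
        credential=DefaultAzureCredential(),  # forwarded to LiteMLClient for workspace lookup
    )
    print(result["studio_url"])  # populated only when a trace destination could be derived from the project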
@@ -123,7 +123,8 @@ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, Ev
  user_agent=USER_AGENT,
  )

- track_in_cloud = bool(pf_client._config.get_trace_destination()) # pylint: disable=protected-access
+ trace_destination = pf_client._config.get_trace_destination() # pylint: disable=protected-access
+ track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
  evaluate_target = bool(kwargs.get("target", None))
  evaluator_config = bool(kwargs.get("evaluator_config", None))
  custom_dimensions: Dict[str, Union[str, bool]] = {
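A quick check of the new guard: a literal "none" destination (what `pf config set trace.destination=none` produces, per the comment removed earlier in this diff) no longer counts as cloud tracking.

    for trace_destination in (None, "none", "azureml://subscriptions/.../workspaces/my-ws"):
        track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
        print(repr(trace_destination), "->", track_in_cloud)
    # None -> False, "none" -> False, a real azureml:// destination -> True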
@@ -7,12 +7,11 @@ import os
  import re
  import tempfile
  from pathlib import Path
- from typing import Any, Dict, NamedTuple, Optional, Tuple, Union
+ from typing import Any, Dict, NamedTuple, Optional, Union, cast
  import uuid
  import base64

  import pandas as pd
- from promptflow.client import PFClient
  from promptflow.entities import Run

  from azure.ai.evaluation._constants import (
@@ -21,9 +20,10 @@ from azure.ai.evaluation._constants import (
  EvaluationRunProperties,
  Prefixes,
  )
- from azure.ai.evaluation._evaluate._eval_run import EvalRun
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._model_configurations import AzureAIProject
+ from azure.ai.evaluation._version import VERSION
+ from azure.ai.evaluation._azure._clients import LiteMLClient

  LOGGER = logging.getLogger(__name__)

@@ -46,6 +46,8 @@ def is_none(value) -> bool:
  def extract_workspace_triad_from_trace_provider( # pylint: disable=name-too-long
  trace_provider: str,
  ) -> AzureMLWorkspace:
+ from promptflow._cli._utils import get_workspace_triad_from_local
+
  match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
  if not match or len(match.groups()) != 5:
  raise EvaluationException(
@@ -59,10 +61,20 @@ def extract_workspace_triad_from_trace_provider( # pylint: disable=name-too-lon
  category=ErrorCategory.INVALID_VALUE,
  blame=ErrorBlame.UNKNOWN,
  )
+
  subscription_id = match.group(1)
  resource_group_name = match.group(3)
  workspace_name = match.group(5)
- return AzureMLWorkspace(subscription_id, resource_group_name, workspace_name)
+
+ # In theory this if statement should never evaluate to True, but we'll keep it here just in case
+ # for backwards compatibility with what the original code that depended on promptflow-azure did
+ if not (subscription_id and resource_group_name and workspace_name):
+ local = get_workspace_triad_from_local()
+ subscription_id = subscription_id or local.subscription_id or os.getenv("AZUREML_ARM_SUBSCRIPTION")
+ resource_group_name = resource_group_name or local.resource_group_name or os.getenv("AZUREML_ARM_RESOURCEGROUP")
+ workspace_name = workspace_name or local.workspace_name or os.getenv("AZUREML_ARM_WORKSPACE_NAME")
+
+ return AzureMLWorkspace(subscription_id or "", resource_group_name or "", workspace_name or "")


  def load_jsonl(path):
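The SDK's `AZURE_WORKSPACE_REGEX_FORMAT` is not shown in this diff, but the triad extraction operates on an `azureml://` trace-provider string of roughly the following shape; the pattern below is an approximation for illustration only (the real regex uses five groups, of which groups 1, 3, and 5 are consumed):

    import re

    APPROX_PATTERN = (
        r"azureml://subscriptions/([^/]+)/resource[Gg]roups/([^/]+)"
        r"/providers/Microsoft\.MachineLearningServices/workspaces/([^/]+)"
    )
    trace_provider = (
        "azureml://subscriptions/00000000-0000-0000-0000-000000000000"
        "/resourceGroups/my-rg/providers/Microsoft.MachineLearningServices/workspaces/my-ws"
    )
    match = re.match(APPROX_PATTERN, trace_provider)
    if match:
        subscription_id, resource_group_name, workspace_name = match.groups()
        print(subscription_id, resource_group_name, workspace_name)
    # The backwards-compatibility branch above only falls back to
    # get_workspace_triad_from_local() / AZUREML_ARM_* variables when a piece is empty.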
@@ -70,19 +82,6 @@ def load_jsonl(path):
  return [json.loads(line) for line in f.readlines()]


- def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
- from promptflow.azure._cli._utils import _get_azure_pf_client
-
- ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
- azure_pf_client = _get_azure_pf_client(
- subscription_id=ws_triad.subscription_id,
- resource_group=ws_triad.resource_group_name,
- workspace_name=ws_triad.workspace_name,
- )
-
- return azure_pf_client, ws_triad
-
-
  def _store_multimodal_content(messages, tmpdir: str):
  # verify if images folder exists
  images_folder_path = os.path.join(tmpdir, "images")
@@ -92,23 +91,40 @@ def _store_multimodal_content(messages, tmpdir: str):
  for message in messages:
  if isinstance(message.get("content", []), list):
  for content in message.get("content", []):
- if content.get("type") == "image_url":
- image_url = content.get("image_url")
- if image_url and "url" in image_url and image_url["url"].startswith("data:image/jpg;base64,"):
- # Extract the base64 string
- base64image = image_url["url"].replace("data:image/jpg;base64,", "")
+ process_message_content(content, images_folder_path)
+
+
+ def process_message_content(content, images_folder_path):
+ if content.get("type", "") == "image_url":
+ image_url = content.get("image_url")

- # Generate a unique filename
- image_file_name = f"{str(uuid.uuid4())}.jpg"
- image_url["url"] = f"images/{image_file_name}" # Replace the base64 URL with the file path
+ if not image_url or "url" not in image_url:
+ return None

- # Decode the base64 string to binary image data
- image_data_binary = base64.b64decode(base64image)
+ url = image_url["url"]
+ if not url.startswith("data:image/"):
+ return None

- # Write the binary image data to the file
- image_file_path = os.path.join(images_folder_path, image_file_name)
- with open(image_file_path, "wb") as f:
- f.write(image_data_binary)
+ match = re.search("data:image/([^;]+);", url)
+ if not match:
+ return None
+
+ ext = match.group(1)
+ # Extract the base64 string
+ base64image = image_url["url"].replace(f"data:image/{ext};base64,", "")
+
+ # Generate a unique filename
+ image_file_name = f"{str(uuid.uuid4())}.{ext}"
+ image_url["url"] = f"images/{image_file_name}" # Replace the base64 URL with the file path
+
+ # Decode the base64 string to binary image data
+ image_data_binary = base64.b64decode(base64image)
+
+ # Write the binary image data to the file
+ image_file_path = os.path.join(images_folder_path, image_file_name)
+ with open(image_file_path, "wb") as f:
+ f.write(image_data_binary)
+ return None


  def _log_metrics_and_instance_results(
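The rewritten helper now accepts any `data:image/<ext>;base64,` payload instead of only JPEG. A self-contained walk-through of the new parsing logic (no files are written here):

    import base64
    import re
    import uuid

    url = "data:image/png;base64," + base64.b64encode(b"\x89PNG not a real image").decode()

    match = re.search("data:image/([^;]+);", url)
    assert match is not None
    ext = match.group(1)                                      # "png" - previously hard-coded to "jpg"
    base64image = url.replace(f"data:image/{ext};base64,", "")
    image_file_name = f"{uuid.uuid4()}.{ext}"                 # unique name the URL is rewritten to
    image_bytes = base64.b64decode(base64image)               # bytes that would be written under images/

    print(image_file_name, len(image_bytes))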
@@ -117,27 +133,37 @@ def _log_metrics_and_instance_results(
  trace_destination: Optional[str],
  run: Run,
  evaluation_name: Optional[str],
+ **kwargs,
  ) -> Optional[str]:
+ from azure.ai.evaluation._evaluate._eval_run import EvalRun
+
  if trace_destination is None:
  LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
  return None

- azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
- tracking_uri = azure_pf_client.ml_client.workspaces.get(ws_triad.workspace_name).mlflow_tracking_uri
+ ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+ management_client = LiteMLClient(
+ subscription_id=ws_triad.subscription_id,
+ resource_group=ws_triad.resource_group_name,
+ logger=LOGGER,
+ credential=kwargs.get("credential"),
+ # let the client automatically determine the credentials to use
+ )
+ tracking_uri = management_client.workspace_get_info(ws_triad.workspace_name).ml_flow_tracking_uri

  # Adding line_number as index column this is needed by UI to form link to individual instance run
  instance_results["line_number"] = instance_results.index.values

  with EvalRun(
  run_name=run.name if run is not None else evaluation_name,
- tracking_uri=tracking_uri,
+ tracking_uri=cast(str, tracking_uri),
  subscription_id=ws_triad.subscription_id,
  group_name=ws_triad.resource_group_name,
  workspace_name=ws_triad.workspace_name,
- ml_client=azure_pf_client.ml_client,
+ management_client=management_client,
  promptflow_run=run,
  ) as ev_run:
- artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN
+ artifact_name = EvalRun.EVALUATION_ARTIFACT

  with tempfile.TemporaryDirectory() as tmpdir:
  # storing multi_modal images if exists
@@ -164,9 +190,15 @@ def _log_metrics_and_instance_results(
  ev_run.write_properties_to_run_history(
  properties={
  EvaluationRunProperties.RUN_TYPE: "eval_run",
- EvaluationRunProperties.EVALUATION_RUN: "azure-ai-generative-parent",
+ EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+ EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
  "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
- "isEvaluatorRun": "true",
+ }
+ )
+ else:
+ ev_run.write_properties_to_run_history(
+ properties={
+ EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
  }
  )

@@ -211,6 +243,8 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
  with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
  json.dump(data_dict, f)

+ print(f'Evaluation results saved to "{p.resolve()}".\n')
+

  def _apply_column_mapping(
  source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
@@ -26,31 +26,30 @@ class _AsyncBleuScoreEvaluator:

  class BleuScoreEvaluator:
  """
- Evaluator that computes the BLEU Score between two strings.
+ Calculate the BLEU score for a given response and ground truth.

  BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine
- translation. It is widely used in text summarization and text generation use cases. It evaluates how closely the
- generated text matches the reference text. The BLEU score ranges from 0 to 1, with higher scores indicating
- better quality.
+ translation. It is widely used in text summarization and text generation use cases.

- **Usage**
+ Use the BLEU score when you want to evaluate the similarity between the generated text and reference text,
+ especially in tasks such as machine translation or text summarization, where n-gram overlap is a significant
+ indicator of quality.

- .. code-block:: python
+ The BLEU score ranges from 0 to 1, with higher scores indicating better quality.

- eval_fn = BleuScoreEvaluator()
- result = eval_fn(
- response="Tokyo is the capital of Japan.",
- ground_truth="The capital of Japan is Tokyo.")
+ .. admonition:: Example:

- **Output format**
-
-
- .. code-block:: python
-
- {
- "bleu_score": 0.22
- }
+ .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+ :start-after: [START bleu_score_evaluator]
+ :end-before: [END bleu_score_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call an BleuScoreEvaluator.
  """

+ id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
+ """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
  def __init__(self):
  self._async_evaluator = _AsyncBleuScoreEvaluator()