azure-ai-evaluation 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of azure-ai-evaluation has been flagged as potentially problematic; see the registry listing for details.

Files changed (123)
  1. azure/ai/evaluation/__init__.py +43 -1
  2. azure/ai/evaluation/_azure/_models.py +6 -6
  3. azure/ai/evaluation/_common/constants.py +6 -2
  4. azure/ai/evaluation/_common/rai_service.py +38 -4
  5. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  6. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  7. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  8. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  9. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  10. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  11. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  12. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  13. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  14. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  15. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  16. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  17. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  18. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  19. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  20. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  21. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  22. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  23. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  24. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
  25. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  26. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  27. azure/ai/evaluation/_common/utils.py +22 -2
  28. azure/ai/evaluation/_constants.py +7 -0
  29. azure/ai/evaluation/_converters/__init__.py +3 -0
  30. azure/ai/evaluation/_converters/_ai_services.py +804 -0
  31. azure/ai/evaluation/_converters/_models.py +302 -0
  32. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
  33. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
  34. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  35. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  36. azure/ai/evaluation/_evaluate/_evaluate.py +31 -2
  37. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
  38. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  39. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
  40. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
  41. azure/ai/evaluation/_evaluators/_common/_base_eval.py +43 -3
  42. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +3 -1
  43. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +43 -4
  44. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
  45. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
  46. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
  47. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
  48. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
  49. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
  50. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
  51. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
  52. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
  53. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +21 -3
  54. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  55. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
  56. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
  57. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
  58. azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
  59. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
  60. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  61. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +157 -0
  62. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
  63. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
  64. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
  65. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
  66. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
  67. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  68. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
  69. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
  70. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  71. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
  72. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
  73. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  74. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
  75. azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
  76. azure/ai/evaluation/_exceptions.py +5 -0
  77. azure/ai/evaluation/_legacy/__init__.py +3 -0
  78. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  79. azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
  80. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
  81. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  82. azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
  83. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
  84. azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
  85. azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
  86. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  87. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
  88. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  89. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
  91. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  92. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  93. azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
  94. azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
  95. azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
  96. azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
  97. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  98. azure/ai/evaluation/_red_team/__init__.py +3 -0
  99. azure/ai/evaluation/_red_team/_attack_objective_generator.py +192 -0
  100. azure/ai/evaluation/_red_team/_attack_strategy.py +42 -0
  101. azure/ai/evaluation/_red_team/_callback_chat_target.py +74 -0
  102. azure/ai/evaluation/_red_team/_default_converter.py +21 -0
  103. azure/ai/evaluation/_red_team/_red_team.py +1858 -0
  104. azure/ai/evaluation/_red_team/_red_team_result.py +246 -0
  105. azure/ai/evaluation/_red_team/_utils/__init__.py +3 -0
  106. azure/ai/evaluation/_red_team/_utils/constants.py +64 -0
  107. azure/ai/evaluation/_red_team/_utils/formatting_utils.py +164 -0
  108. azure/ai/evaluation/_red_team/_utils/logging_utils.py +139 -0
  109. azure/ai/evaluation/_red_team/_utils/strategy_utils.py +188 -0
  110. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  111. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  112. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +251 -150
  113. azure/ai/evaluation/_version.py +1 -1
  114. azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
  115. azure/ai/evaluation/simulator/_adversarial_simulator.py +54 -27
  116. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
  117. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
  118. {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/METADATA +69 -15
  119. azure_ai_evaluation-1.4.0.dist-info/RECORD +197 -0
  120. {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/WHEEL +1 -1
  121. azure_ai_evaluation-1.3.0.dist-info/RECORD +0 -119
  122. {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/NOTICE.txt +0 -0
  123. {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty
@@ -0,0 +1,71 @@
+ ---
+ name: Tool Call Accuracy
+ description: Evaluates tool call accuracy for a tool used by an agent
+ model:
+   api: chat
+   parameters:
+     temperature: 0.0
+     max_tokens: 800
+     top_p: 1.0
+     presence_penalty: 0
+     frequency_penalty: 0
+     response_format:
+       type: text
+
+ inputs:
+   query:
+     type: List
+   tool_call:
+     type: Dict
+   tool_definition:
+     type: Dict
+
+ ---
+ system:
+ # Instruction
+ ## Goal
+ ### You are an expert in evaluating the accuracy of a tool call, considering its relevance and potential usefulness, including the syntactic and semantic correctness of a proposed tool call from an intelligent system, based on the provided definition and data. Your goal will involve answering the questions below using the information provided.
+ - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
+ - **Data**: Your input data includes CONVERSATION, TOOL CALL, and TOOL DEFINITION.
+ - **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.
+
+ user:
+ # Definition
+ **Tool Call Accuracy** refers to the relevance and potential usefulness of a TOOL CALL in the context of an ongoing CONVERSATION, and the EXTRACTION of the RIGHT PARAMETER VALUES from the CONVERSATION. It assesses how likely the TOOL CALL is to contribute meaningfully to the CONVERSATION and help address the user's needs. Focus on evaluating the potential value of the TOOL CALL within the specific context of the given CONVERSATION, without making assumptions beyond the provided information.
+ Consider the following factors in your evaluation:
+
+ 1. Relevance: How well does the proposed tool call align with the current topic and flow of the conversation?
+ 2. Parameter Appropriateness: Do the parameters used in the TOOL CALL match the TOOL DEFINITION, and are they relevant to the latest user query?
+ 3. Parameter Value Correctness: Are the parameter values used in the TOOL CALL present in or inferable from the CONVERSATION, and relevant to the latest user query?
+ 4. Potential Value: Is the information this tool call might provide likely to be useful in advancing the conversation or addressing the user's expressed or implied needs?
+ 5. Context Appropriateness: Does the tool call make sense at this point in the conversation, given what has been discussed so far?
+
+
+ # Ratings
+ ## [Tool Call Accuracy: 0] (Irrelevant)
+ **Definition:**
+ 1. The TOOL CALL is not relevant and will not help resolve the user's need.
+ 2. The TOOL CALL includes parameter values that are not present in or inferable from the CONVERSATION.
+ 3. The TOOL CALL has parameters that are not present in the TOOL DEFINITION.
+
+ ## [Tool Call Accuracy: 1] (Relevant)
+ **Definition:**
+ 1. The TOOL CALL is directly relevant and very likely to help resolve the user's need.
+ 2. The TOOL CALL includes parameter values that are present in or inferable from the CONVERSATION.
+ 3. The TOOL CALL has parameters that are present in the TOOL DEFINITION.
+
+ # Data
+ CONVERSATION: {{query}}
+ TOOL CALL: {{tool_call}}
+ TOOL DEFINITION: {{tool_definition}}
+
+
+ # Tasks
+ ## Please provide your assessment Score for the previous CONVERSATION, TOOL CALL, and TOOL DEFINITION based on the Definitions above. Your output should include the following information:
+ - **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
+ - **Explanation**: a very short explanation of why you think the input Data should get that Score.
+ - **Score**: based on your previous analysis, provide your Score. The Score you give MUST be an integer score (i.e., "0" or "1") based on the levels of the definitions.
+
+
+ ## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your Score</S2>.
+ # Output
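
For reference, a model response that satisfies the tag contract above might look like the following (illustrative content only; fetch_weather and its location parameter are made-up names, not part of this package):

<S0>Let's think step by step: the user asked for the current weather in Seattle, and the proposed call invokes fetch_weather with location="Seattle", a parameter that exists in the TOOL DEFINITION and whose value comes directly from the CONVERSATION.</S0>
<S1>The tool call is relevant and its parameter value is taken straight from the conversation.</S1>
<S2>1</S2>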
azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py
@@ -0,0 +1,5 @@
+ from ._ungrounded_attributes import UngroundedAttributesEvaluator
+
+ __all__ = [
+     "UngroundedAttributesEvaluator",
+ ]
azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py
@@ -0,0 +1,103 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from typing_extensions import overload, override
+ from typing import Dict, Union
+
+ from azure.ai.evaluation._common._experimental import experimental
+ from azure.ai.evaluation._common.constants import EvaluationMetrics
+ from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+
+
+ @experimental
+ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
+     """
+     Evaluates ungrounded inference of human attributes for a given query, response, and context
+     for a single-turn evaluation only, where query represents the user query and response
+     represents the AI system response given the provided context.
+
+     Ungrounded Attributes checks whether a response is ungrounded and whether it contains
+     information about a person's protected class or emotional state.
+
+     It identifies the following attributes:
+
+     - emotional_state
+     - protected_class
+     - groundedness
+
+     :param credential: The credential for connecting to the Azure AI project. Required.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :param azure_ai_project: The scope of the Azure AI project.
+         It contains the subscription id, resource group, and project name.
+     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+     :param kwargs: Additional arguments to pass to the evaluator.
+     :type kwargs: Any
+
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START ungrounded_attributes_evaluator]
+             :end-before: [END ungrounded_attributes_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call an UngroundedAttributesEvaluator with a query, response, and context.
+
+     .. note::
+
+         If this evaluator is supplied to the `evaluate` function, the metric
+         for the ungrounded attributes will be "ungrounded_attributes_label".
+     """
+
+     id = "ungrounded_attributes"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     @override
+     def __init__(
+         self,
+         credential,
+         azure_ai_project,
+     ):
+         super().__init__(
+             eval_metric=EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
+             azure_ai_project=azure_ai_project,
+             credential=credential,
+         )
+
+     @overload
+     def __call__(
+         self,
+         *,
+         query: str,
+         response: str,
+         context: str,
+     ) -> Dict[str, Union[str, float]]:
+         """Evaluate a given query/response pair and context for ungrounded attributes.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :keyword context: The context to be used for evaluation.
+         :paramtype context: str
+         :return: The ungrounded attributes label.
+         :rtype: Dict[str, Union[str, bool]]
+         """
+
+     @override
+     def __call__(  # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
+         """Evaluate a given query/response pair and context for ungrounded attributes.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :keyword context: The context to be used for evaluation.
+         :paramtype context: str
+         :return: The ungrounded attributes label.
+         :rtype: Dict[str, Union[str, bool]]
+         """
+         return super().__call__(*args, **kwargs)
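
Based on the signatures above, a minimal usage sketch might look like this. The project values are placeholders, and the import assumes the evaluator is exported from the package root alongside the other evaluators in this release:

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import UngroundedAttributesEvaluator

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

evaluator = UngroundedAttributesEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=azure_ai_project,
)

# Single-turn evaluation; per the docstring, the result carries an
# "ungrounded_attributes_label" metric.
result = evaluator(
    query="How is my colleague feeling today?",
    response="He is clearly furious about the delay.",
    context="The colleague sent a brief, neutral status update this morning.",
)
print(result)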
azure/ai/evaluation/_evaluators/_xpia/xpia.py
@@ -43,6 +43,8 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
      :param azure_ai_project: The scope of the Azure AI project. It contains subscription id, resource group, and project
          name.
      :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+     :param threshold: The threshold for the IndirectAttack evaluator. Default is 0.
+     :type threshold: int

      .. admonition:: Example:

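Given the newly documented keyword, constructing the evaluator with an explicit threshold might look like this sketch (credential and project scope are placeholders):

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import IndirectAttackEvaluator

evaluator = IndirectAttackEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=azure_ai_project,  # same project-scope dict as in the examples above
    threshold=0,  # default value per the docstring entry added in this diff
)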
azure/ai/evaluation/_exceptions.py
@@ -62,15 +62,18 @@ class ErrorTarget(Enum):
      CODE_CLIENT = "CodeClient"
      RAI_CLIENT = "RAIClient"
      COHERENCE_EVALUATOR = "CoherenceEvaluator"
+     COMPLETENESS_EVALUATOR = "CompletenessEvaluator"
      CONTENT_SAFETY_CHAT_EVALUATOR = "ContentSafetyEvaluator"
      ECI_EVALUATOR = "ECIEvaluator"
      F1_EVALUATOR = "F1Evaluator"
      GROUNDEDNESS_EVALUATOR = "GroundednessEvaluator"
      PROTECTED_MATERIAL_EVALUATOR = "ProtectedMaterialEvaluator"
+     INTENT_RESOLUTION_EVALUATOR = "IntentResolutionEvaluator"
      RELEVANCE_EVALUATOR = "RelevanceEvaluator"
      SIMILARITY_EVALUATOR = "SimilarityEvaluator"
      FLUENCY_EVALUATOR = "FluencyEvaluator"
      RETRIEVAL_EVALUATOR = "RetrievalEvaluator"
+     TASK_ADHERENCE_EVALUATOR = "TaskAdherenceEvaluator"
      INDIRECT_ATTACK_EVALUATOR = "IndirectAttackEvaluator"
      INDIRECT_ATTACK_SIMULATOR = "IndirectAttackSimulator"
      ADVERSARIAL_SIMULATOR = "AdversarialSimulator"
@@ -80,6 +83,8 @@ class ErrorTarget(Enum):
      MODELS = "Models"
      UNKNOWN = "Unknown"
      CONVERSATION = "Conversation"
+     TOOL_CALL_ACCURACY_EVALUATOR = "ToolCallAccuracyEvaluator"
+     RED_TEAM = "RedTeam"


  class EvaluationException(AzureError):
azure/ai/evaluation/_legacy/__init__.py
@@ -0,0 +1,3 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
azure/ai/evaluation/_legacy/_batch_engine/__init__.py
@@ -0,0 +1,9 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ # NOTE: This is a direct port of the bare minimum needed for BatchEngine functionality from
+ #       the original Promptflow code. The goal here is expediency, not elegance. As such,
+ #       parts of this code may be a little "quirky", seem incomplete in places, or contain
+ #       longer TODO comments than usual. In a future code update, large swaths of this code
+ #       will be refactored or deleted outright.
azure/ai/evaluation/_legacy/_batch_engine/_config.py
@@ -0,0 +1,45 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from dataclasses import dataclass
+ from logging import Logger
+
+ from ..._constants import PF_BATCH_TIMEOUT_SEC_DEFAULT
+
+
+ @dataclass
+ class BatchEngineConfig:
+     """Context for a batch of evaluations. This contains the configuration,
+     logging, and other information needed to run the batch."""
+
+     logger: Logger
+     """The logger to use for logging messages."""
+
+     batch_timeout_seconds: int = PF_BATCH_TIMEOUT_SEC_DEFAULT
+     """The maximum amount of time to wait for all evaluations in the batch to complete."""
+
+     run_timeout_seconds: int = 600
+     """The maximum amount of time to wait for an evaluation against a single entry
+     in the input data to complete."""
+
+     max_concurrency: int = 10
+     """The maximum number of evaluations to run concurrently."""
+
+     use_async: bool = True
+     """Whether to use asynchronous evaluation."""
+
+     default_num_results: int = 100
+     """The default number of results to return if you don't ask for all results."""
+
+     def __post_init__(self):
+         if self.logger is None:
+             raise ValueError("logger cannot be None")
+         if self.batch_timeout_seconds <= 0:
+             raise ValueError("batch_timeout_seconds must be greater than 0")
+         if self.run_timeout_seconds <= 0:
+             raise ValueError("run_timeout_seconds must be greater than 0")
+         if self.max_concurrency <= 0:
+             raise ValueError("max_concurrency must be greater than 0")
+         if self.default_num_results <= 0:
+             raise ValueError("default_num_results must be greater than 0")
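
A minimal construction sketch for the dataclass above; the import path simply mirrors the module layout shown in this diff:

import logging

from azure.ai.evaluation._legacy._batch_engine._config import BatchEngineConfig

# Only the logger is required; the remaining fields fall back to the defaults above.
config = BatchEngineConfig(logger=logging.getLogger("batch"), max_concurrency=4)

# __post_init__ validates eagerly, so invalid values fail at construction time, e.g.:
# BatchEngineConfig(logger=logging.getLogger("batch"), run_timeout_seconds=0)  # raises ValueError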
azure/ai/evaluation/_legacy/_batch_engine/_engine.py
@@ -0,0 +1,368 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ # This contains code merged together from the following files:
+ #     promptflow-devkit/promptflow/batch/_batch_engine.py
+ #     promptflow-devkit/promptflow/_proxy/_python_executor_proxy.py
+ #     promptflow-core/promptflow/executor/_script_executor.py
+ # TODO ralphe: The way this code does batch execution needs to be improved. For now
+ #              porting over the code largely as is to remove the Promptflow dependency
+ #              as quickly as possible. In phase 2 this code will be heavily refactored.
+
+ import re
+ import asyncio
+ from math import floor
+ from asyncio import Semaphore
+ from contextlib import contextmanager
+ from dataclasses import dataclass
+ from datetime import datetime, timedelta, timezone
+ from typing import Any, Callable, Dict, Final, Generator, Mapping, MutableMapping, Optional, Sequence, Set, Tuple
+ from uuid import uuid4
+
+ from ._utils import get_int_env_var, get_value_from_path
+ from ._status import BatchStatus
+ from ._result import BatchResult, BatchRunDetails, BatchRunError, TokenMetrics
+ from ._run_storage import AbstractRunStorage, NoOpRunStorage
+ from ._logging import log_progress, NodeLogManager
+ from ..._exceptions import ErrorBlame
+ from ._exceptions import (
+     BatchEngineCanceledError,
+     BatchEngineError,
+     BatchEngineRunFailedError,
+     BatchEngineTimeoutError,
+     BatchEngineValidationError,
+ )
+ from ._utils_deprecated import (
+     async_run_allowing_running_loop,
+     convert_eager_flow_output_to_dict,
+ )
+
+
+ MAX_WORKER_COUNT: Final[int] = 10
+ KEYWORD_PATTERN: Final = re.compile(r"^\${([^{}]+)}$")
+
+
+ class BatchEngine:
+     """This class is used to execute flows in batch mode"""
+
+     def __init__(
+         self,
+         executor: Callable,
+         *,
+         storage: Optional[AbstractRunStorage] = None,
+         batch_timeout_sec: Optional[int] = None,
+         line_timeout_sec: Optional[int] = None,
+         max_worker_count: Optional[int] = None,
+         **kwargs: Any,
+     ):
+         """Create a new batch engine instance
+
+         :param Callable executor: The executor to run the flow
+         :param Optional[AbstractRunStorage] storage: The storage to store execution results
+         :param Optional[int] batch_timeout_sec: The timeout of batch run in seconds
+         :param Optional[int] line_timeout_sec: The timeout of each line in seconds
+         :param Optional[int] max_worker_count: The concurrency limit of batch run
+         :param kwargs: The keyword arguments related to creating the executor proxy class
+         :type kwargs: Any
+         """
+
+         self._executor = executor
+         # self._working_dir = working_dir
+
+         # self._is_eager_flow = True
+         # self._is_prompty_flow = False
+         # self._program_language = FlowLanguage.Python
+         # self._message_format = MessageFormatType.BASIC
+         # self._multimedia_processor = MultimediaProcessor.create(self._message_format)
+         # self._connections = {}
+
+         self._storage: AbstractRunStorage = storage or NoOpRunStorage()
+
+         # TODO ralphe: Consume these from the batch context/config instead of from
+         #              kwargs or (even worse) environment variables
+         # self._batch_use_async = kwargs.get("batch_use_async", True)
+         self._batch_timeout_sec = batch_timeout_sec or get_int_env_var("PF_BATCH_TIMEOUT_SEC")
+         self._line_timeout_sec = line_timeout_sec or get_int_env_var("PF_LINE_TIMEOUT_SEC", 600)
+         self._max_worker_count = max_worker_count or get_int_env_var("PF_WORKER_COUNT") or MAX_WORKER_COUNT
+         # update kwargs with worker_count and line_timeout_sec
+         kwargs.update({"worker_count": self._max_worker_count, "line_timeout_sec": self._line_timeout_sec})
+
+         self._is_canceled: bool = False
+         self._kwargs: Mapping[str, Any] = kwargs
+         # self._init_kwargs: Mapping[str, Any] = init_kwargs or {}
+
+     def run(
+         self,
+         data: Sequence[Mapping[str, Any]],
+         column_mapping: Mapping[str, str],
+         *,
+         id: Optional[str] = None,
+         max_lines: Optional[int] = None,
+     ) -> BatchResult:
+         if not data:
+             raise BatchEngineValidationError("Please provide a non-empty data mapping.")
+         if not column_mapping:
+             raise BatchEngineValidationError("The column mapping is required.")
+
+         start_time = datetime.now(timezone.utc)
+
+         batch_inputs = self._apply_column_mapping(data, column_mapping, max_lines)
+         if not batch_inputs or all(len(data) == 0 for data in batch_inputs):
+             raise BatchEngineValidationError("No data to process.")
+
+         try:
+             id = id or str(uuid4())
+
+             result: BatchResult = async_run_allowing_running_loop(self._exec_in_task, id, batch_inputs, start_time)
+
+             return result
+         except Exception as ex:
+             raise BatchEngineError(
+                 "Unexpected error while running the batch run.", blame=ErrorBlame.SYSTEM_ERROR
+             ) from ex
+
+     def cancel(self):
+         # TODO ralphe: Make sure this works
+         self._is_canceled = True
+
+     @staticmethod
+     def _apply_column_mapping(
+         data: Sequence[Mapping[str, Any]],
+         column_mapping: Mapping[str, str],
+         max_lines: Optional[int],
+     ) -> Sequence[Mapping[str, str]]:
+         data = data[:max_lines] if max_lines else data
+
+         inputs: Sequence[Mapping[str, Any]] = []
+         line: int = 0
+
+         for input in data:
+             line += 1
+             mapped: Dict[str, Any] = {}
+             missing_inputs: Set[str] = set()
+
+             for key, value in column_mapping.items():
+                 if not isinstance(value, str):
+                     # All non-string values are literal values.
+                     mapped[key] = value
+                     continue
+
+                 match: Optional[re.Match[str]] = re.search(KEYWORD_PATTERN, value)
+                 if match is None:
+                     # Literal string value
+                     mapped[key] = value
+                     continue
+
+                 dict_path = match.group(1)
+                 found, value = get_value_from_path(dict_path, input)
+                 if found:
+                     mapped[key] = value
+                 else:
+                     missing_inputs.add(dict_path)
+
+             if missing_inputs:
+                 missing = ", ".join(missing_inputs)
+                 raise BatchEngineValidationError(f"Missing inputs for line {line}: '{missing}'")
+
+             inputs.append(mapped)
+
+         return inputs
+
+     async def _exec_in_task(
+         self, run_id: str, batch_inputs: Sequence[Mapping[str, Any]], start_time: datetime
+     ) -> BatchResult:
+         # Since the batch execution is not guaranteed to be completed in the same order
+         # as the inputs, we keep track of these in a mapping from index to result
+         results: Dict[int, BatchRunDetails] = {}
+         status: BatchStatus = BatchStatus.Completed
+         error: Optional[Exception] = None
+
+         task = asyncio.create_task(self._exec_batch(run_id, batch_inputs, start_time, results))
+
+         while not task.done():
+             # check whether the task is completed or canceled every 1s
+             await asyncio.sleep(1)
+             if self._is_canceled:
+                 task.cancel()
+                 # use current completed line results and aggregation results to create a BatchResult
+                 status = BatchStatus.Canceled
+                 error = BatchEngineCanceledError("The batch run is canceled by user.")
+                 break
+             elif self._batch_timeout_expired(start_time):
+                 task.cancel()
+                 status = BatchStatus.Failed
+                 error = BatchEngineTimeoutError(
+                     f"The batch run failed due to timeout [{self._batch_timeout_sec}s]. "
+                     f"Please adjust the timeout to a higher value."
+                 )
+                 break
+
+         end_time = datetime.now(timezone.utc)
+         metrics = TokenMetrics(0, 0, 0)
+         failed_lines: int = 0
+
+         # generate the details in the same order as the inputs and fill in the missing results
+         # with a failed status
+         result_details = [
+             (
+                 results[i]
+                 if i in results
+                 else BatchRunDetails(
+                     id=BatchRunDetails.create_id(run_id, i),
+                     status=BatchStatus.Failed,
+                     result=None,
+                     start_time=None,
+                     end_time=None,
+                     tokens=TokenMetrics(0, 0, 0),
+                     error=BatchRunError("The line run is not completed.", None),
+                 )
+             )
+             for i in range(len(batch_inputs))
+         ]
+
+         for line_result in result_details:
+             # Indicate the worst status of the batch run. This works because
+             # canceled and failed have a higher value than completed.
+             status = max(status, line_result.status)
+             if BatchStatus.is_failed(line_result.status):
+                 failed_lines += 1
+             if line_result.tokens:
+                 metrics.prompt_tokens += line_result.tokens.prompt_tokens
+                 metrics.completion_tokens += line_result.tokens.completion_tokens
+                 metrics.total_tokens += line_result.tokens.total_tokens
+
+         if failed_lines and not error:
+             error = BatchEngineRunFailedError(
+                 f"{floor(failed_lines / len(batch_inputs) * 100)}% of the batch run failed."
+             )
+
+         return BatchResult(
+             status=status,
+             total_lines=len(batch_inputs),
+             failed_lines=failed_lines,
+             start_time=start_time,
+             end_time=end_time,
+             tokens=metrics,
+             details=result_details,
+             error=error,
+         )
+
+     async def _exec_batch(
+         self,
+         run_id: str,
+         batch_inputs: Sequence[Mapping[str, Any]],
+         start_time: datetime,
+         results: MutableMapping[int, BatchRunDetails],
+     ) -> None:
+         semaphore: Semaphore = Semaphore(self._max_worker_count)
+
+         # TODO ralphe: This async code needs to be refactored to use e.g. asyncio.gather, or
+         #              asyncio.as_completed.
+         # TODO ralphe: This code needs to handle cancellation better
+         async def create_under_semaphore(index: int, inputs: Mapping[str, Any]):
+             async with semaphore:
+                 return await self._exec_line_async(run_id, inputs, index)
+
+         pending = [
+             asyncio.create_task(create_under_semaphore(index, inputs)) for index, inputs in enumerate(batch_inputs)
+         ]
+
+         total_lines: int = len(batch_inputs)
+         completed_lines: int = 0
+         while completed_lines < total_lines:
+             # TODO ralphe: Fix this code so it doesn't re-order the outputs
+             # wait for any task to complete
+             done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
+             completed_line_results = [task.result() for task in done]
+             # persist node run infos and flow run info in line result to storage
+             self._persist_run_info([result for _, result in completed_line_results])
+             results.update({index: result for index, result in completed_line_results})
+             # update the progress log
+             completed_lines += len(completed_line_results)
+             log_progress(
+                 run_start_time=start_time,
+                 total_count=total_lines,
+                 current_count=completed_lines,
+                 # TODO ralphe: set logger to use here
+             )
+
+     async def _exec_line_async(
+         self,
+         run_id: str,
+         inputs: Mapping[str, Any],
+         index: int,
+     ) -> Tuple[int, BatchRunDetails]:
+         with self._exec_line_context(run_id, index):
+             details: BatchRunDetails = BatchRunDetails(
+                 id=f"{run_id}_{index}",
+                 status=BatchStatus.NotStarted,
+                 result=None,
+                 start_time=datetime.now(timezone.utc),
+                 end_time=None,
+                 tokens=TokenMetrics(0, 0, 0),
+                 error=None,
+             )
+
+             try:
+                 # TODO ralphe: Handle line timeouts here
+                 output: Any = await self._executor(**inputs)
+                 details.status = BatchStatus.Completed
+                 details.result = convert_eager_flow_output_to_dict(output)
+
+                 # TODO figure out how to get the token metrics here
+             except Exception as ex:
+                 details.status = BatchStatus.Failed
+                 details.error = BatchRunError(
+                     f"Error while evaluating single input: {ex.__class__.__name__}: {str(ex)}", ex
+                 )
+             finally:
+                 details.end_time = datetime.now(timezone.utc)
+
+         return index, details
+
+     def _persist_run_info(self, line_results: Sequence[BatchRunDetails]):
+         # TODO ralphe: implement?
+         pass
+
+     def _batch_timeout_expired(self, start_time: datetime) -> bool:
+         if self._batch_timeout_sec is None:
+             return False
+         return (datetime.now(timezone.utc) - start_time).total_seconds() > self._batch_timeout_sec
+
+     @contextmanager
+     def _exec_line_context(self, run_id: str, line_number: int) -> Generator[None, Any, None]:
+         # TODO ralphe: Do proper tracing and logging here
+         log_manager = NodeLogManager()
+         log_manager.set_node_context(run_id, "Flex", line_number)
+         with log_manager, self._update_operation_context(run_id, line_number):
+             yield
+
+     @contextmanager
+     def _update_operation_context(self, run_id: str, line_number: int) -> Generator[None, Any, None]:
+         # operation_context = OperationContext.get_instance()
+         # original_context = operation_context.copy()
+         # original_mode = operation_context.get("run_mode", RunMode.Test.name)
+         # values_for_context = {"flow_id": self._flow_id, "root_run_id": run_id}
+         # if original_mode == RunMode.Batch.name:
+         #     values_for_otel = {
+         #         "batch_run_id": run_id,
+         #         "line_number": line_number,
+         #     }
+         # else:
+         #     values_for_otel = {"line_run_id": run_id}
+         # try:
+         #     append_promptflow_package_ua(operation_context)
+         #     operation_context.set_execution_target(execution_target=self._execution_target)
+         #     operation_context.set_default_tracing_keys(DEFAULT_TRACING_KEYS)
+         #     operation_context.run_mode = original_mode
+         #     operation_context.update(values_for_context)
+         #     for k, v in values_for_otel.items():
+         #         operation_context._add_otel_attributes(k, v)
+         #     # Inject OpenAI API to make sure traces and headers injection works and
+         #     # update OpenAI API configs from environment variables.
+         #     inject_openai_api()
+         yield
+
+         # finally:
+         #     OperationContext.set_instance(original_context)
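
Taken together, a hypothetical end-to-end use of this engine might look like the sketch below. The executor is any async callable, the import path mirrors the module layout shown in this diff, and the ${...} reference follows KEYWORD_PATTERN above; the exact path syntax accepted by get_value_from_path is an assumption here.

from azure.ai.evaluation._legacy._batch_engine._engine import BatchEngine

async def evaluate_line(query: str) -> dict:
    # Stand-in executor: awaited once per mapped input line.
    return {"answer_length": len(query)}

engine = BatchEngine(evaluate_line, batch_timeout_sec=300, line_timeout_sec=60)
result = engine.run(
    data=[{"query": "What is the capital of France?"}],
    column_mapping={"query": "${query}"},  # assumed path syntax, resolved against each input row
)
print(result.status, result.total_lines, result.failed_lines)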