azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic. Click here for more details.

Files changed (100):
  1. azure/ai/evaluation/__init__.py +60 -0
  2. azure/ai/evaluation/_common/__init__.py +16 -0
  3. azure/ai/evaluation/_common/constants.py +65 -0
  4. azure/ai/evaluation/_common/rai_service.py +452 -0
  5. azure/ai/evaluation/_common/utils.py +87 -0
  6. azure/ai/evaluation/_constants.py +50 -0
  7. azure/ai/evaluation/_evaluate/__init__.py +3 -0
  8. azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py +8 -0
  9. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +72 -0
  10. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +150 -0
  11. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
  12. azure/ai/evaluation/_evaluate/_eval_run.py +494 -0
  13. azure/ai/evaluation/_evaluate/_evaluate.py +689 -0
  14. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +174 -0
  15. azure/ai/evaluation/_evaluate/_utils.py +237 -0
  16. azure/ai/evaluation/_evaluators/__init__.py +3 -0
  17. azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
  18. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +73 -0
  19. azure/ai/evaluation/_evaluators/_chat/__init__.py +9 -0
  20. azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
  21. azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +9 -0
  22. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
  23. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  24. azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
  25. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +122 -0
  26. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +62 -0
  27. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +21 -0
  28. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +108 -0
  29. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +66 -0
  30. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
  31. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +78 -0
  32. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +76 -0
  33. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +76 -0
  34. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -0
  35. azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  36. azure/ai/evaluation/_evaluators/_eci/_eci.py +99 -0
  37. azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
  38. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +141 -0
  39. azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +122 -0
  41. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +61 -0
  42. azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
  43. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +71 -0
  44. azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
  45. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +123 -0
  46. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  47. azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
  48. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +96 -0
  49. azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
  50. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -0
  51. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  52. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  53. azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_qa/_qa.py +111 -0
  55. azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
  56. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +131 -0
  57. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +69 -0
  58. azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
  59. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
  60. azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
  61. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +130 -0
  62. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +71 -0
  63. azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
  64. azure/ai/evaluation/_evaluators/_xpia/xpia.py +140 -0
  65. azure/ai/evaluation/_exceptions.py +107 -0
  66. azure/ai/evaluation/_http_utils.py +395 -0
  67. azure/ai/evaluation/_model_configurations.py +27 -0
  68. azure/ai/evaluation/_user_agent.py +6 -0
  69. azure/ai/evaluation/_version.py +5 -0
  70. azure/ai/evaluation/py.typed +0 -0
  71. azure/ai/evaluation/simulator/__init__.py +15 -0
  72. azure/ai/evaluation/simulator/_adversarial_scenario.py +27 -0
  73. azure/ai/evaluation/simulator/_adversarial_simulator.py +450 -0
  74. azure/ai/evaluation/simulator/_constants.py +17 -0
  75. azure/ai/evaluation/simulator/_conversation/__init__.py +315 -0
  76. azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
  77. azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
  78. azure/ai/evaluation/simulator/_direct_attack_simulator.py +252 -0
  79. azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
  80. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
  81. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +93 -0
  82. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +207 -0
  83. azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
  84. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +147 -0
  85. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +228 -0
  86. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +157 -0
  87. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +157 -0
  88. azure/ai/evaluation/simulator/_model_tools/models.py +616 -0
  89. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +69 -0
  90. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +36 -0
  91. azure/ai/evaluation/simulator/_tracing.py +92 -0
  92. azure/ai/evaluation/simulator/_utils.py +111 -0
  93. azure/ai/evaluation/simulator/simulator.py +579 -0
  94. azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
  95. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
  96. {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
  97. azure_ai_evaluation-1.0.0b1.dist-info/top_level.txt +1 -0
  98. azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
  99. azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
  100. azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
@@ -0,0 +1,36 @@
1
+ ---
2
+ name: TaskSimulatorWithPersona
3
+ description: Simulates a user to complete a conversation
4
+ model:
5
+ api: chat
6
+ configuration:
7
+ type: azure_openai
8
+ azure_deployment: ${env:AZURE_DEPLOYMENT}
9
+ azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
10
+ parameters:
11
+ temperature: 0.0
12
+ top_p: 1.0
13
+ presence_penalty: 0
14
+ frequency_penalty: 0
15
+ response_format:
16
+ type: json_object
17
+
18
+ inputs:
19
+ task:
20
+ type: string
21
+ conversation_history:
22
+ type: dict
23
+
24
+ ---
25
+ system:
26
+ You should behave as a user who is planning to accomplish this task: {{ task }} and you continue to interact with a system that responds to your queries.
27
+ Make sure your conversation is engaging and interactive.
28
+ Output must be in JSON format
29
+ Here's a sample output:
30
+ {
31
+ "content": "Here is my follow-up question.",
32
+ "user": "user"
33
+ }
34
+
35
+ Output with a json object that continues the conversation, given the conversation history:
36
+ {{ conversation_history }}
@@ -0,0 +1,92 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+ # pylint: disable=C0103,C0114,C0116,E0401,E0611
5
+
6
+ import functools
7
+
8
+ from promptflow._sdk._telemetry.activity import ActivityType, monitor_operation
9
+
10
+
11
def monitor_adversarial_scenario(activity_name: str = "adversarial.simulator.call"):
    """
    Build a decorator that reports adversarial-simulator calls as telemetry.

    Parameters:
        activity_name (str): The name of the activity to monitor.
    """

    def decorator(func):
        """
        Decorator for monitoring an adversarial scenario.

        Parameters:
            func (function): The function to be decorated.
        """

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            """
            Wrapper for monitoring an adversarial scenario.

            Parameters:
                *args: Variable length argument list.
                **kwargs: Arbitrary keyword arguments.
            """
            # Dimensions come from the call's kwargs, so the telemetry
            # decorator has to be applied per invocation, not once.
            dimensions = {
                "scenario": str(kwargs.get("scenario", None)),
                "max_conversation_turns": kwargs.get("max_conversation_turns", None),
                "max_simulation_results": kwargs.get("max_simulation_results", None),
                "jailbreak": kwargs.get("jailbreak", None),
            }
            monitored = monitor_operation(
                activity_name=activity_name,
                activity_type=ActivityType.PUBLICAPI,
                custom_dimensions=dimensions,
            )(func)
            return monitored(*args, **kwargs)

        return wrapper

    return decorator
56
+
57
+
58
def monitor_task_simulator(func):
    """
    Decorate a task-simulator entry point so each call is reported as telemetry.

    Parameters:
        func (function): The function to be decorated.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        """
        Wrapper for monitoring a task simulator.

        Parameters:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.
        """
        # Dimensions depend on this call's kwargs, so the telemetry
        # decorator is applied per invocation.
        dimensions = {
            "text_length": len(kwargs.get("text", "")),
            "user_persona_length": len(kwargs.get("user_persona", [])),
            "number_of_queries": kwargs.get("num_queries", 0),
            "max_conversation_turns": kwargs.get("max_conversation_turns", 0),
        }
        monitored = monitor_operation(
            activity_name="task.simulator.call",
            activity_type=ActivityType.PUBLICAPI,
            custom_dimensions=dimensions,
        )(func)
        return monitored(*args, **kwargs)

    return wrapper
@@ -0,0 +1,111 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+ """
5
+ This module contains a utility class for managing a list of JSON lines.
6
+ """
7
+ import json
8
+
9
+
10
class JsonLineList(list):
    """
    A list subclass with helpers for serializing its items as JSON lines.
    """

    def to_json_lines(self):
        """
        Serialize every item in the list as one JSON object per line.

        :returns: A string of JSON lines, where each line is a JSON representation of an item in the list.
        :rtype: str
        """
        return "".join(json.dumps(entry) + "\n" for entry in self)

    def to_eval_qa_json_lines(self):
        """
        Convert the list to JSON lines suitable for Q&A evaluation.

        Each item is expected to be a dict with a 'messages' key holding a
        list of {'role', 'content'} dicts, where role is 'user' or
        'assistant'. A 'context' key on an assistant message, if present,
        is carried into the emitted record.

        :returns: A string of JSON lines.
        :rtype: str
        """
        pieces = []
        for entry in self:
            query = None
            response = None
            ctx = None
            for msg in entry["messages"]:
                if msg["role"] == "user":
                    query = msg["content"]
                elif msg["role"] == "assistant":
                    response = msg["content"]
                    # Context is only picked up from assistant messages.
                    if "context" in msg:
                        ctx = msg.get("context", None)
                # Emit a record as soon as a user/assistant pair is complete.
                if query and response:
                    if ctx:
                        pieces.append(
                            json.dumps({"query": query, "response": response, "context": ctx}) + "\n"
                        )
                        query = response = ctx = None
                    else:
                        pieces.append(json.dumps({"query": query, "response": response}) + "\n")
                        query = response = None
        return "".join(pieces)
67
+
68
+
69
class JsonLineChatProtocol(dict):
    """
    A dict subclass for a single JSON object that follows the chat protocol.
    """

    def to_json(self):
        """
        Serialize the object to a JSON string.

        :returns: A JSON representation of the object.
        :rtype: str
        """
        return json.dumps(self)

    def to_eval_qa_json_lines(self):
        """
        Convert the object to JSON lines suitable for Q&A evaluation.

        The object is expected to carry a 'messages' key; an optional
        top-level 'context' key is used as a default context, and a non-None
        'context' on an assistant message overrides it.
        """
        question = None
        answer = None
        # Top-level context, if any, seeds the per-pair context.
        ctx = self.get("context", None)
        pieces = []
        for msg in self["messages"]:
            if msg["role"] == "user":
                question = msg["content"]
            elif msg["role"] == "assistant":
                answer = msg["content"]
                if "context" in msg and msg["context"] is not None:
                    ctx = msg.get("context", ctx)
            # Emit once a user/assistant pair is complete; the top-level
            # context deliberately persists across pairs.
            if question and answer:
                if ctx:
                    pieces.append(
                        json.dumps({"question": question, "answer": answer, "context": ctx}) + "\n"
                    )
                else:
                    pieces.append(json.dumps({"question": question, "answer": answer}) + "\n")
                question = answer = None
        return "".join(pieces)