unique_toolkit 1.45.4__py3-none-any.whl → 1.45.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. unique_toolkit/agentic/evaluation/config.py +25 -6
  2. unique_toolkit/agentic/evaluation/context_relevancy/prompts/__init__.py +13 -0
  3. unique_toolkit/agentic/evaluation/context_relevancy/{prompts.py → prompts/system_prompt.j2} +11 -43
  4. unique_toolkit/agentic/evaluation/context_relevancy/prompts/user_prompt.j2 +15 -0
  5. unique_toolkit/agentic/evaluation/context_relevancy/service.py +24 -56
  6. unique_toolkit/agentic/evaluation/hallucination/constants.py +26 -15
  7. unique_toolkit/agentic/evaluation/hallucination/prompts/__init__.py +13 -0
  8. unique_toolkit/agentic/evaluation/hallucination/prompts/system_prompt.j2 +35 -0
  9. unique_toolkit/agentic/evaluation/hallucination/prompts/user_prompt.j2 +27 -0
  10. unique_toolkit/agentic/evaluation/hallucination/utils.py +153 -102
  11. unique_toolkit/agentic/evaluation/tests/fixtures.py +102 -0
  12. unique_toolkit/agentic/evaluation/tests/test_config.py +247 -0
  13. unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py +141 -121
  14. unique_toolkit/agentic/evaluation/tests/test_hallucination_constants.py +600 -0
  15. unique_toolkit/agentic/evaluation/tests/test_hallucination_utils.py +1009 -0
  16. unique_toolkit/agentic/evaluation/tests/test_output_parser.py +82 -23
  17. unique_toolkit/agentic/evaluation/tests/test_prompt_loaders.py +348 -0
  18. unique_toolkit/agentic/evaluation/utils.py +8 -0
  19. unique_toolkit/agentic/responses_api/postprocessors/generated_files.py +34 -0
  20. {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/METADATA +7 -1
  21. {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/RECORD +23 -13
  22. unique_toolkit/agentic/evaluation/hallucination/prompts.py +0 -79
  23. {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/LICENSE +0 -0
  24. {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/WHEEL +0 -0

unique_toolkit/agentic/evaluation/config.py
@@ -1,16 +1,32 @@
-from typing import Any
+from typing import Annotated, Any
 
-from pydantic import Field
+from pydantic import BaseModel, Field
 from pydantic.json_schema import SkipJsonSchema
 
+from unique_toolkit._common.pydantic.rjsf_tags import RJSFMetaTag
+from unique_toolkit._common.pydantic_helpers import get_configuration_dict
 from unique_toolkit._common.validators import LMI
+from unique_toolkit.agentic.evaluation.schemas import (
+    EvaluationMetricName,
+)
 from unique_toolkit.agentic.tools.schemas import BaseToolConfig
 from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
 from unique_toolkit.language_model.infos import LanguageModelInfo
 
-from .schemas import (
-    EvaluationMetricName,
-)
+PromptType = Annotated[str, RJSFMetaTag.StringWidget.textarea(rows=5)]
+
+
+class EvaluationMetricPromptsConfig(BaseModel):
+    model_config = get_configuration_dict()
+
+    system_prompt_template: PromptType = Field(
+        default="",
+        description="The system prompt for the evaluation metric.",
+    )
+    user_prompt_template: PromptType = Field(
+        default="",
+        description="The user prompt for the evaluation metric.",
+    )
 
 
 class EvaluationMetricConfig(BaseToolConfig):
@@ -23,6 +39,9 @@ class EvaluationMetricConfig(BaseToolConfig):
         default={},
         description="Additional options to pass to the language model.",
     )
-    custom_prompts: dict[str, str] = {}
+    prompts_config: EvaluationMetricPromptsConfig = Field(
+        default_factory=EvaluationMetricPromptsConfig,
+        description="The prompts config for the evaluation metric.",
+    )
     score_to_label: dict[str, str] = {}
     score_to_title: dict[str, str] = {}
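
With this release, the free-form custom_prompts dict on EvaluationMetricConfig is replaced by a typed prompts_config field. A minimal sketch of overriding the templates under the new shape, mirroring the default_config construction that appears later in this diff (the template strings below are placeholders, not the shipped prompts):

    from unique_toolkit.agentic.evaluation.config import (
        EvaluationMetricConfig,
        EvaluationMetricPromptsConfig,
    )
    from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricName
    from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
    from unique_toolkit.language_model.infos import LanguageModelInfo

    # Placeholder Jinja2 templates; real configs would carry the full prompt text.
    config = EvaluationMetricConfig(
        enabled=True,
        name=EvaluationMetricName.CONTEXT_RELEVANCY,
        language_model=LanguageModelInfo.from_name(DEFAULT_GPT_4o),
        prompts_config=EvaluationMetricPromptsConfig(
            system_prompt_template="Rate how relevant the contexts are to the input.",
            user_prompt_template="Input: {{ input_text }}\nContexts: {{ context_texts }}",
        ),
    )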

unique_toolkit/agentic/evaluation/context_relevancy/prompts/__init__.py
@@ -0,0 +1,13 @@
+from pathlib import Path
+
+from unique_toolkit.agentic.evaluation.utils import load_template
+
+CONTEXT_RELEVANCY_PROMPTS_DIR = Path(__file__).parent
+
+
+def system_prompt_loader():
+    return load_template(CONTEXT_RELEVANCY_PROMPTS_DIR, "system_prompt.j2")
+
+
+def user_prompt_loader():
+    return load_template(CONTEXT_RELEVANCY_PROMPTS_DIR, "user_prompt.j2")
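
These loaders delegate to load_template in unique_toolkit/agentic/evaluation/utils.py (+8 lines, body not shown in this diff). Judging from the call sites, a plausible implementation simply reads the .j2 file as text; the sketch below is hypothetical and not the package's actual code:

    from pathlib import Path


    def load_template(prompts_dir: Path, template_name: str) -> str:
        # Hypothetical sketch: return the raw Jinja2 template source as a string.
        return (prompts_dir / template_name).read_text(encoding="utf-8")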

unique_toolkit/agentic/evaluation/context_relevancy/{prompts.py → prompts/system_prompt.j2}
@@ -1,4 +1,13 @@
-CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG = """
+{% if structured_output %}
+You will receive an input and a set of contexts.
+Your task is to evaluate how relevant the contexts are to the input text.
+Further you should extract relevant facts from the contexts.
+
+# Output Format
+- Generate data according to the provided data schema.
+- Ensure the output adheres to the format required by the pydantic object.
+- All necessary fields should be populated as per the data schema guidelines.
+{% else %}
 You will receive an input and a set of contexts.
 Your task is to evaluate how relevant the contexts are to the input text.
 
@@ -12,45 +21,4 @@ Your answer must be in JSON format:
 "reason": Your explanation of your judgement of the evaluation,
 "value": decision, must be one of the following ["low", "medium", "high"]
 }
-"""
-
-CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG_STRUCTURED_OUTPUT = """
-You will receive an input and a set of contexts.
-Your task is to evaluate how relevant the contexts are to the input text.
-Further you should extract relevant facts from the contexts.
-
-# Output Format
-- Generate data according to the provided data schema.
-- Ensure the output adheres to the format required by the pydantic object.
-- All necessary fields should be populated as per the data schema guidelines.
-"""
-
-CONTEXT_RELEVANCY_METRIC_USER_MSG = """
-Here is the data:
-
-Input:
-'''
-$input_text
-'''
-
-Contexts:
-'''
-$context_texts
-'''
-
-Answer as JSON:
-"""
-
-CONTEXT_RELEVANCY_METRIC_USER_MSG_STRUCTURED_OUTPUT = """
-Here is the data:
-
-Input:
-'''
-$input_text
-'''
-
-Contexts:
-'''
-$context_texts
-'''
-"""
+{% endif %}

unique_toolkit/agentic/evaluation/context_relevancy/prompts/user_prompt.j2
@@ -0,0 +1,15 @@
+Here is the data:
+
+Input:
+'''
+{{ input_text }}
+'''
+
+Contexts:
+'''
+{{ context_texts }}
+'''
+{% if not structured_output %}
+
+Answer as JSON:
+{% endif %}
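
Both context-relevancy templates branch on a structured_output flag: the system prompt switches between the schema-driven and plain-JSON instructions, and the user prompt appends "Answer as JSON:" only in the non-structured case. Rendered outside the toolkit with jinja2 directly (the service itself goes through the toolkit's render_template helper; the input values below are made up):

    from jinja2 import Template

    from unique_toolkit.agentic.evaluation.context_relevancy.prompts import (
        system_prompt_loader,
        user_prompt_loader,
    )

    # structured_output=False selects the plain-JSON branch of both templates.
    system_msg = Template(system_prompt_loader()).render(structured_output=False)
    user_msg = Template(user_prompt_loader()).render(
        input_text="What is the refund policy?",
        context_texts="source0: Refunds are accepted within 30 days of purchase.",
        structured_output=False,
    )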

unique_toolkit/agentic/evaluation/context_relevancy/service.py
@@ -4,10 +4,14 @@ from typing import overload
 from pydantic import BaseModel, ValidationError
 from typing_extensions import deprecated
 
+from unique_toolkit._common.utils.jinja.render import render_template
 from unique_toolkit._common.validate_required_values import (
     validate_required_values,
 )
-from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
+from unique_toolkit.agentic.evaluation.config import (
+    EvaluationMetricConfig,
+    EvaluationMetricPromptsConfig,
+)
 from unique_toolkit.agentic.evaluation.context_relevancy.schema import (
     EvaluationSchemaStructuredOutput,
 )
@@ -28,32 +32,25 @@ from unique_toolkit.language_model.infos import (
     LanguageModelInfo,
     ModelCapabilities,
 )
-from unique_toolkit.language_model.prompt import Prompt
 from unique_toolkit.language_model.schemas import (
     LanguageModelMessages,
+    LanguageModelSystemMessage,
+    LanguageModelUserMessage,
 )
 from unique_toolkit.language_model.service import (
     LanguageModelService,
 )
 
-from .prompts import (
-    CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
-    CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG_STRUCTURED_OUTPUT,
-    CONTEXT_RELEVANCY_METRIC_USER_MSG,
-    CONTEXT_RELEVANCY_METRIC_USER_MSG_STRUCTURED_OUTPUT,
-)
-
-SYSTEM_MSG_KEY = "systemPrompt"
-USER_MSG_KEY = "userPrompt"
+from .prompts import system_prompt_loader, user_prompt_loader
 
 default_config = EvaluationMetricConfig(
     enabled=False,
     name=EvaluationMetricName.CONTEXT_RELEVANCY,
     language_model=LanguageModelInfo.from_name(DEFAULT_GPT_4o),
-    custom_prompts={
-        SYSTEM_MSG_KEY: CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
-        USER_MSG_KEY: CONTEXT_RELEVANCY_METRIC_USER_MSG,
-    },
+    prompts_config=EvaluationMetricPromptsConfig(
+        system_prompt_template=system_prompt_loader(),
+        user_prompt_template=user_prompt_loader(),
+    ),
 )
 
 relevancy_required_input_fields = [
@@ -225,49 +222,20 @@ class ContextRelevancyEvaluator:
         """
         Composes the messages for the relevancy metric.
         """
-        system_msg_content = self._get_system_prompt(config, enable_structured_output)
-        system_msg = Prompt(system_msg_content).to_system_msg()
+        # Render system message
+        system_msg_content = render_template(
+            config.prompts_config.system_prompt_template,
+            structured_output=enable_structured_output,
+        )
+        system_msg = LanguageModelSystemMessage(content=system_msg_content)
 
-        user_msg = Prompt(
-            self._get_user_prompt(config, enable_structured_output),
+        # Render user message
+        user_msg_content = render_template(
+            config.prompts_config.user_prompt_template,
            input_text=input.input_text,
            context_texts=input.get_joined_context_texts(),
-        ).to_user_msg()
+            structured_output=enable_structured_output,
+        )
+        user_msg = LanguageModelUserMessage(content=user_msg_content)
 
         return LanguageModelMessages([system_msg, user_msg])
-
-    def _get_system_prompt(
-        self,
-        config: EvaluationMetricConfig,
-        enable_structured_output: bool,
-    ):
-        if (
-            enable_structured_output
-            and ModelCapabilities.STRUCTURED_OUTPUT
-            in config.language_model.capabilities
-        ):
-            return config.custom_prompts.setdefault(
-                SYSTEM_MSG_KEY,
-                CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG_STRUCTURED_OUTPUT,
-            )
-        else:
-            return config.custom_prompts.setdefault(
-                SYSTEM_MSG_KEY,
-                CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
-            )
-
-    def _get_user_prompt(
-        self,
-        config: EvaluationMetricConfig,
-        enable_structured_output: bool,
-    ):
-        if enable_structured_output:
-            return config.custom_prompts.setdefault(
-                USER_MSG_KEY,
-                CONTEXT_RELEVANCY_METRIC_USER_MSG_STRUCTURED_OUTPUT,
-            )
-        else:
-            return config.custom_prompts.setdefault(
-                USER_MSG_KEY,
-                CONTEXT_RELEVANCY_METRIC_USER_MSG,
-            )

unique_toolkit/agentic/evaluation/hallucination/constants.py
@@ -1,15 +1,18 @@
+from enum import StrEnum
 from typing import Any
 
 from pydantic import Field
 from pydantic.json_schema import SkipJsonSchema
 
 from unique_toolkit._common.validators import LMI
-from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
+from unique_toolkit.agentic.evaluation.config import (
+    EvaluationMetricConfig,
+    EvaluationMetricPromptsConfig,
+    PromptType,
+)
 from unique_toolkit.agentic.evaluation.hallucination.prompts import (
-    HALLUCINATION_METRIC_SYSTEM_MSG,
-    HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
-    HALLUCINATION_METRIC_USER_MSG,
-    HALLUCINATION_METRIC_USER_MSG_DEFAULT,
+    system_prompt_loader,
+    user_prompt_loader,
 )
 from unique_toolkit.agentic.evaluation.schemas import (
     EvaluationMetricInputFieldName,
@@ -18,28 +21,36 @@ from unique_toolkit.agentic.evaluation.schemas import (
 from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
 from unique_toolkit.language_model.infos import LanguageModelInfo
 
-SYSTEM_MSG_KEY = "systemPrompt"
-USER_MSG_KEY = "userPrompt"
-SYSTEM_MSG_DEFAULT_KEY = "systemPromptDefault"
-USER_MSG_DEFAULT_KEY = "userPromptDefault"
+
+class SourceSelectionMode(StrEnum):
+    FROM_IDS = "FROM_IDS"
+    FROM_ORDER = "FROM_ORDER"
+    FROM_ORIGINAL_RESPONSE = "FROM_ORIGINAL_RESPONSE"
+
+
+class HallucinationPromptsConfig(EvaluationMetricPromptsConfig):
+    system_prompt_template: PromptType = Field(default_factory=system_prompt_loader)
+    user_prompt_template: PromptType = Field(default_factory=user_prompt_loader)
 
 
 class HallucinationConfig(EvaluationMetricConfig):
+    source_selection_mode: SourceSelectionMode = Field(
+        default=SourceSelectionMode.FROM_ORIGINAL_RESPONSE
+    )
+    reference_pattern: str = Field(default=r"[\[<]?source(\d+)[>\]]?")
     enabled: SkipJsonSchema[bool] = False
     name: SkipJsonSchema[EvaluationMetricName] = EvaluationMetricName.HALLUCINATION
     language_model: LMI = LanguageModelInfo.from_name(
         DEFAULT_GPT_4o,
     )
+    prompts_config: HallucinationPromptsConfig = Field(  # type: ignore[assignment]
+        default_factory=HallucinationPromptsConfig,
+        description="The prompts config for the hallucination metric",
+    )
     additional_llm_options: dict[str, Any] = Field(
         default={},
         description="Additional options to pass to the language model.",
     )
-    custom_prompts: dict = {
-        SYSTEM_MSG_KEY: HALLUCINATION_METRIC_SYSTEM_MSG,
-        USER_MSG_KEY: HALLUCINATION_METRIC_USER_MSG,
-        SYSTEM_MSG_DEFAULT_KEY: HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
-        USER_MSG_DEFAULT_KEY: HALLUCINATION_METRIC_USER_MSG_DEFAULT,
-    }
     score_to_label: dict = {
         "LOW": "GREEN",
         "MEDIUM": "YELLOW",

unique_toolkit/agentic/evaluation/hallucination/prompts/__init__.py
@@ -0,0 +1,13 @@
+from pathlib import Path
+
+from unique_toolkit.agentic.evaluation.utils import load_template
+
+HALLUCINATION_PROMPTS_DIR = Path(__file__).parent
+
+
+def system_prompt_loader():
+    return load_template(HALLUCINATION_PROMPTS_DIR, "system_prompt.j2")
+
+
+def user_prompt_loader():
+    return load_template(HALLUCINATION_PROMPTS_DIR, "user_prompt.j2")

unique_toolkit/agentic/evaluation/hallucination/prompts/system_prompt.j2
@@ -0,0 +1,35 @@
+{% if has_context %}
+You will receive a question, references, a conversation between a user and an agent, and an output.
+The output is the answer to the question.
+Your task is to evaluate whether the FACTS stated in the output are grounded in and supported by the provided references and conversation. Provide your reasoning in 2 sentences.
+
+IMPORTANT: You are NOT evaluating completeness. An incomplete answer that is grounded in the sources is acceptable. You are ONLY checking if the facts that ARE present are supported by the sources.
+
+Use the following hallucination scale:
+[low] - All facts and claims in the output are directly supported by or grounded in the references/conversation. The answer may be incomplete, but what IS stated is accurate to the sources.
+[medium] - Most facts are supported, but there are some claims or details that go beyond what the references/conversation state or make reasonable inferences not directly supported.
+[high] - The output contains facts that contradict the references/conversation, or makes claims that have no basis in the provided sources.
+
+Rules:
+- Do NOT penalize incomplete answers - if the sources have more info but the output doesn't mention it, that's fine
+- Do NOT use external knowledge - only evaluate based on the provided sources
+- DO penalize when the output states facts that are not found in the sources (unsupported claims)
+- DO penalize when the output contradicts what the sources say
+- Ignore references to plots, charts, or visual elements in your evaluation
+{% else %}
+You will receive a question and an output.
+The situation is that NO references were found to answer the question. Your task is to evaluate if the output inappropriately provides factual information despite having no sources to ground it in. Provide your reasoning in 2 sentences, mentioning that no references were provided.
+
+Use the following hallucination scale:
+[low] - The output appropriately indicates it cannot answer due to lack of information, or provides no substantive factual claims.
+[medium] - The output provides some limited factual information or makes cautious claims without strong source support.
+[high] - The output confidently provides detailed factual information despite having no sources to support it.
+
+Rule: Without sources, providing factual answers constitutes hallucination regardless of external correctness.
+{% endif %}
+
+Your answer must be in JSON format:
+{
+"reason": Your explanation of your {{ 'judgement' if has_context else 'reasoning' }} of the evaluation,
+"value": decision, must be one of the following: ["high", "medium", "low"]
+}

unique_toolkit/agentic/evaluation/hallucination/prompts/user_prompt.j2
@@ -0,0 +1,27 @@
+Here is the data:
+
+Input:
+'''
+{{ input_text }}
+'''
+{% if contexts_text %}
+
+References:
+'''
+{{ contexts_text }}
+'''
+{% endif %}
+{% if history_messages_text %}
+
+Conversation:
+'''
+{{ history_messages_text }}
+'''
+{% endif %}
+
+Output:
+'''
+{{ output_text }}
+'''
+
+Answer as JSON:
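
Because of the {% if %} guards, the References and Conversation sections drop out entirely when contexts_text or history_messages_text is empty or not supplied, so the same user template serves both branches of the system prompt above. For example, rendering it with jinja2 directly and made-up values for the no-references case:

    from jinja2 import Template

    from unique_toolkit.agentic.evaluation.hallucination.prompts import user_prompt_loader

    # Omitting contexts_text and history_messages_text skips the optional sections.
    rendered = Template(user_prompt_loader()).render(
        input_text="Who founded the company?",
        output_text="I could not find that information in the provided documents.",
    )
    print(rendered)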