PyPI - unique_toolkit - Versions diffs - 1.45.5__tar.gz → 1.45.6__tar.gz - Mend

unique_toolkit 1.45.5.tar.gz → 1.45.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (265)

{unique_toolkit-1.45.5 → unique_toolkit-1.45.6}/CHANGELOG.md RENAMED Viewed

@@ -5,6 +5,9 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [1.45.6] - 2026-01-30
+- hallucination evaluator: Use original response to retrieve referenced chunk
 ## [1.45.5] - 2026-01-29
 - Add HTML rendering support for code interpreter generated files

{unique_toolkit-1.45.5 → unique_toolkit-1.45.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unique_toolkit
-Version: 1.45.5
+Version: 1.45.6
 Summary:
 License: Proprietary
 Author: Cedric Klinkert
@@ -125,6 +125,9 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [1.45.6] - 2026-01-30
+- hallucination evaluator: Use original response to retrieve referenced chunk
 ## [1.45.5] - 2026-01-29
 - Add HTML rendering support for code interpreter generated files

{unique_toolkit-1.45.5 → unique_toolkit-1.45.6}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "unique_toolkit"
-version = "1.45.5"
+version = "1.45.6"
 description = ""
 authors = [
     "Cedric Klinkert <cedric.klinkert@unique.ch>",

unique_toolkit-1.45.6/unique_toolkit/agentic/evaluation/config.py ADDED Viewed

@@ -0,0 +1,47 @@
+from typing import Annotated, Any
+from pydantic import BaseModel, Field
+from pydantic.json_schema import SkipJsonSchema
+from unique_toolkit._common.pydantic.rjsf_tags import RJSFMetaTag
+from unique_toolkit._common.pydantic_helpers import get_configuration_dict
+from unique_toolkit._common.validators import LMI
+from unique_toolkit.agentic.evaluation.schemas import (
+    EvaluationMetricName,
+)
+from unique_toolkit.agentic.tools.schemas import BaseToolConfig
+from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
+from unique_toolkit.language_model.infos import LanguageModelInfo
+PromptType = Annotated[str, RJSFMetaTag.StringWidget.textarea(rows=5)]
+class EvaluationMetricPromptsConfig(BaseModel):
+    model_config = get_configuration_dict()
+    system_prompt_template: PromptType = Field(
+        default="",
+        description="The system prompt for the evaluation metric.",
+    )
+    user_prompt_template: PromptType = Field(
+        default="",
+        description="The user prompt for the evaluation metric.",
+    )
+class EvaluationMetricConfig(BaseToolConfig):
+    enabled: SkipJsonSchema[bool] = False
+    name: SkipJsonSchema[EvaluationMetricName]
+    language_model: LMI = LanguageModelInfo.from_name(
+        DEFAULT_GPT_4o,
+    )
+    additional_llm_options: dict[str, Any] = Field(
+        default={},
+        description="Additional options to pass to the language model.",
+    )
+    prompts_config: EvaluationMetricPromptsConfig = Field(
+        default_factory=EvaluationMetricPromptsConfig,
+        description="The prompts config for the evaluation metric.",
+    )
+    score_to_label: dict[str, str] = {}
+    score_to_title: dict[str, str] = {}

unique_toolkit-1.45.6/unique_toolkit/agentic/evaluation/context_relevancy/prompts/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+from pathlib import Path
+from unique_toolkit.agentic.evaluation.utils import load_template
+CONTEXT_RELEVANCY_PROMPTS_DIR = Path(__file__).parent
+def system_prompt_loader():
+    return load_template(CONTEXT_RELEVANCY_PROMPTS_DIR, "system_prompt.j2")
+def user_prompt_loader():
+    return load_template(CONTEXT_RELEVANCY_PROMPTS_DIR, "user_prompt.j2")

unique_toolkit-1.45.5/unique_toolkit/agentic/evaluation/context_relevancy/prompts.py → unique_toolkit-1.45.6/unique_toolkit/agentic/evaluation/context_relevancy/prompts/system_prompt.j2 RENAMED Viewed

@@ -1,4 +1,13 @@
-CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG = """
+{% if structured_output %}
+You will receive an input and a set of contexts.
+Your task is to evaluate how relevant the contexts are to the input text.
+Further you should extract relevant facts from the contexts.
+# Output Format
+- Generate data according to the provided data schema.
+- Ensure the output adheres to the format required by the pydantic object.
+- All necessary fields should be populated as per the data schema guidelines.
+{% else %}
 You will receive an input and a set of contexts.
 Your task is to evaluate how relevant the contexts are to the input text.
@@ -12,45 +21,4 @@ Your answer must be in JSON format:
  "reason": Your explanation of your judgement of the evaluation,
  "value": decision, must be one of the following ["low", "medium", "high"]
 }
-"""
-CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG_STRUCTURED_OUTPUT = """
-You will receive an input and a set of contexts.
-Your task is to evaluate how relevant the contexts are to the input text.
-Further you should extract relevant facts from the contexts.
-# Output Format
-- Generate data according to the provided data schema.
-- Ensure the output adheres to the format required by the pydantic object.
-- All necessary fields should be populated as per the data schema guidelines.
-"""
-CONTEXT_RELEVANCY_METRIC_USER_MSG = """
-Here is the data:
-Input:
-'''
-$input_text
-'''
-Contexts:
-'''
-$context_texts
-'''
-Answer as JSON:
-"""
-CONTEXT_RELEVANCY_METRIC_USER_MSG_STRUCTURED_OUTPUT = """
-Here is the data:
-Input:
-'''
-$input_text
-'''
-Contexts:
-'''
-$context_texts
-'''
-"""
+{% endif %}

unique_toolkit-1.45.6/unique_toolkit/agentic/evaluation/context_relevancy/prompts/user_prompt.j2 ADDED Viewed

@@ -0,0 +1,15 @@
+Here is the data:
+Input:
+'''
+{{ input_text }}
+'''
+Contexts:
+'''
+{{ context_texts }}
+'''
+{% if not structured_output %}
+Answer as JSON:
+{% endif %}

{unique_toolkit-1.45.5 → unique_toolkit-1.45.6}/unique_toolkit/agentic/evaluation/context_relevancy/service.py RENAMED Viewed

@@ -4,10 +4,14 @@ from typing import overload
 from pydantic import BaseModel, ValidationError
 from typing_extensions import deprecated
+from unique_toolkit._common.utils.jinja.render import render_template
 from unique_toolkit._common.validate_required_values import (
     validate_required_values,
 )
-from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
+from unique_toolkit.agentic.evaluation.config import (
+    EvaluationMetricConfig,
+    EvaluationMetricPromptsConfig,
+)
 from unique_toolkit.agentic.evaluation.context_relevancy.schema import (
     EvaluationSchemaStructuredOutput,
 )
@@ -28,32 +32,25 @@ from unique_toolkit.language_model.infos import (
     LanguageModelInfo,
     ModelCapabilities,
 )
-from unique_toolkit.language_model.prompt import Prompt
 from unique_toolkit.language_model.schemas import (
     LanguageModelMessages,
+    LanguageModelSystemMessage,
+    LanguageModelUserMessage,
 )
 from unique_toolkit.language_model.service import (
     LanguageModelService,
 )
-from .prompts import (
-    CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
-    CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG_STRUCTURED_OUTPUT,
-    CONTEXT_RELEVANCY_METRIC_USER_MSG,
-    CONTEXT_RELEVANCY_METRIC_USER_MSG_STRUCTURED_OUTPUT,
-)
-SYSTEM_MSG_KEY = "systemPrompt"
-USER_MSG_KEY = "userPrompt"
+from .prompts import system_prompt_loader, user_prompt_loader
 default_config = EvaluationMetricConfig(
     enabled=False,
     name=EvaluationMetricName.CONTEXT_RELEVANCY,
     language_model=LanguageModelInfo.from_name(DEFAULT_GPT_4o),
-    custom_prompts={
-        SYSTEM_MSG_KEY: CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
-        USER_MSG_KEY: CONTEXT_RELEVANCY_METRIC_USER_MSG,
-    },
+    prompts_config=EvaluationMetricPromptsConfig(
+        system_prompt_template=system_prompt_loader(),
+        user_prompt_template=user_prompt_loader(),
+    ),
 )
 relevancy_required_input_fields = [
@@ -225,49 +222,20 @@ class ContextRelevancyEvaluator:
         """
         Composes the messages for the relevancy metric.
         """
-        system_msg_content = self._get_system_prompt(config, enable_structured_output)
-        system_msg = Prompt(system_msg_content).to_system_msg()
+        # Render system message
+        system_msg_content = render_template(
+            config.prompts_config.system_prompt_template,
+            structured_output=enable_structured_output,
+        )
+        system_msg = LanguageModelSystemMessage(content=system_msg_content)
-        user_msg = Prompt(
-            self._get_user_prompt(config, enable_structured_output),
+        # Render user message
+        user_msg_content = render_template(
+            config.prompts_config.user_prompt_template,
             input_text=input.input_text,
             context_texts=input.get_joined_context_texts(),
-        ).to_user_msg()
+            structured_output=enable_structured_output,
+        )
+        user_msg = LanguageModelUserMessage(content=user_msg_content)
         return LanguageModelMessages([system_msg, user_msg])
-    def _get_system_prompt(
-        self,
-        config: EvaluationMetricConfig,
-        enable_structured_output: bool,
-    ):
-        if (
-            enable_structured_output
-            and ModelCapabilities.STRUCTURED_OUTPUT
-            in config.language_model.capabilities
-        ):
-            return config.custom_prompts.setdefault(
-                SYSTEM_MSG_KEY,
-                CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG_STRUCTURED_OUTPUT,
-            )
-        else:
-            return config.custom_prompts.setdefault(
-                SYSTEM_MSG_KEY,
-                CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
-            )
-    def _get_user_prompt(
-        self,
-        config: EvaluationMetricConfig,
-        enable_structured_output: bool,
-    ):
-        if enable_structured_output:
-            return config.custom_prompts.setdefault(
-                USER_MSG_KEY,
-                CONTEXT_RELEVANCY_METRIC_USER_MSG_STRUCTURED_OUTPUT,
-            )
-        else:
-            return config.custom_prompts.setdefault(
-                USER_MSG_KEY,
-                CONTEXT_RELEVANCY_METRIC_USER_MSG,
-            )

{unique_toolkit-1.45.5 → unique_toolkit-1.45.6}/unique_toolkit/agentic/evaluation/hallucination/constants.py RENAMED Viewed

@@ -1,15 +1,18 @@
+from enum import StrEnum
 from typing import Any
 from pydantic import Field
 from pydantic.json_schema import SkipJsonSchema
 from unique_toolkit._common.validators import LMI
-from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
+from unique_toolkit.agentic.evaluation.config import (
+    EvaluationMetricConfig,
+    EvaluationMetricPromptsConfig,
+    PromptType,
+)
 from unique_toolkit.agentic.evaluation.hallucination.prompts import (
-    HALLUCINATION_METRIC_SYSTEM_MSG,
-    HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
-    HALLUCINATION_METRIC_USER_MSG,
-    HALLUCINATION_METRIC_USER_MSG_DEFAULT,
+    system_prompt_loader,
+    user_prompt_loader,
 )
 from unique_toolkit.agentic.evaluation.schemas import (
     EvaluationMetricInputFieldName,
@@ -18,28 +21,36 @@ from unique_toolkit.agentic.evaluation.schemas import (
 from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
 from unique_toolkit.language_model.infos import LanguageModelInfo
-SYSTEM_MSG_KEY = "systemPrompt"
-USER_MSG_KEY = "userPrompt"
-SYSTEM_MSG_DEFAULT_KEY = "systemPromptDefault"
-USER_MSG_DEFAULT_KEY = "userPromptDefault"
+class SourceSelectionMode(StrEnum):
+    FROM_IDS = "FROM_IDS"
+    FROM_ORDER = "FROM_ORDER"
+    FROM_ORIGINAL_RESPONSE = "FROM_ORIGINAL_RESPONSE"
+class HallucinationPromptsConfig(EvaluationMetricPromptsConfig):
+    system_prompt_template: PromptType = Field(default_factory=system_prompt_loader)
+    user_prompt_template: PromptType = Field(default_factory=user_prompt_loader)
 class HallucinationConfig(EvaluationMetricConfig):
+    source_selection_mode: SourceSelectionMode = Field(
+        default=SourceSelectionMode.FROM_ORIGINAL_RESPONSE
+    )
+    reference_pattern: str = Field(default=r"[\[<]?source(\d+)[>\]]?")
     enabled: SkipJsonSchema[bool] = False
     name: SkipJsonSchema[EvaluationMetricName] = EvaluationMetricName.HALLUCINATION
     language_model: LMI = LanguageModelInfo.from_name(
         DEFAULT_GPT_4o,
     )
+    prompts_config: HallucinationPromptsConfig = Field(  # type: ignore[assignment]
+        default_factory=HallucinationPromptsConfig,
+        description="The prompts config for the hallucination metric",
+    )
     additional_llm_options: dict[str, Any] = Field(
         default={},
         description="Additional options to pass to the language model.",
     )
-    custom_prompts: dict = {
-        SYSTEM_MSG_KEY: HALLUCINATION_METRIC_SYSTEM_MSG,
-        USER_MSG_KEY: HALLUCINATION_METRIC_USER_MSG,
-        SYSTEM_MSG_DEFAULT_KEY: HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
-        USER_MSG_DEFAULT_KEY: HALLUCINATION_METRIC_USER_MSG_DEFAULT,
-    }
     score_to_label: dict = {
         "LOW": "GREEN",
         "MEDIUM": "YELLOW",

unique_toolkit-1.45.6/unique_toolkit/agentic/evaluation/hallucination/prompts/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+from pathlib import Path
+from unique_toolkit.agentic.evaluation.utils import load_template
+HALLUCINATION_PROMPTS_DIR = Path(__file__).parent
+def system_prompt_loader():
+    return load_template(HALLUCINATION_PROMPTS_DIR, "system_prompt.j2")
+def user_prompt_loader():
+    return load_template(HALLUCINATION_PROMPTS_DIR, "user_prompt.j2")

unique_toolkit-1.45.6/unique_toolkit/agentic/evaluation/hallucination/prompts/system_prompt.j2 ADDED Viewed

@@ -0,0 +1,35 @@
+{% if has_context %}
+You will receive a question, references, a conversation between a user and an agent, and an output.
+The output is the answer to the question.
+Your task is to evaluate whether the FACTS stated in the output are grounded in and supported by the provided references and conversation. Provide your reasoning in 2 sentences.
+IMPORTANT: You are NOT evaluating completeness. An incomplete answer that is grounded in the sources is acceptable. You are ONLY checking if the facts that ARE present are supported by the sources.
+Use the following hallucination scale:
+[low] - All facts and claims in the output are directly supported by or grounded in the references/conversation. The answer may be incomplete, but what IS stated is accurate to the sources.
+[medium] - Most facts are supported, but there are some claims or details that go beyond what the references/conversation state or make reasonable inferences not directly supported.
+[high] - The output contains facts that contradict the references/conversation, or makes claims that have no basis in the provided sources.
+Rules:
+- Do NOT penalize incomplete answers - if the sources have more info but the output doesn't mention it, that's fine
+- Do NOT use external knowledge - only evaluate based on the provided sources
+- DO penalize when the output states facts that are not found in the sources (unsupported claims)
+- DO penalize when the output contradicts what the sources say
+- Ignore references to plots, charts, or visual elements in your evaluation
+{% else %}
+You will receive a question and an output.
+The situation is that NO references were found to answer the question. Your task is to evaluate if the output inappropriately provides factual information despite having no sources to ground it in. Provide your reasoning in 2 sentences, mentioning that no references were provided.
+Use the following hallucination scale:
+[low] - The output appropriately indicates it cannot answer due to lack of information, or provides no substantive factual claims.
+[medium] - The output provides some limited factual information or makes cautious claims without strong source support.
+[high] - The output confidently provides detailed factual information despite having no sources to support it.
+Rule: Without sources, providing factual answers constitutes hallucination regardless of external correctness.
+{% endif %}
+Your answer must be in JSON format:
+{
+ "reason": Your explanation of your {{ 'judgement' if has_context else 'reasoning' }} of the evaluation,
+ "value": decision, must be one of the following: ["high", "medium", "low"]
+}

unique_toolkit-1.45.6/unique_toolkit/agentic/evaluation/hallucination/prompts/user_prompt.j2 ADDED Viewed

@@ -0,0 +1,27 @@
+Here is the data:
+Input:
+'''
+{{ input_text }}
+'''
+{% if contexts_text %}
+References:
+'''
+{{ contexts_text }}
+'''
+{% endif %}
+{% if history_messages_text %}
+Conversation:
+'''
+{{ history_messages_text }}
+'''
+{% endif %}
+Output:
+'''
+{{ output_text }}
+'''
+Answer as JSON:

unique_toolkit 1.45.5.tar.gz → 1.45.6.tar.gz

unique_toolkit 1.45.5.tar.gz → 1.45.6.tar.gz