PyPI - graphrag-eval - Versions diffs - 6.2.0__tar.gz → 6.4.0__tar.gz - Mend

graphrag-eval 6.2.0__tar.gz → 6.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

graphrag_eval-6.4.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,46 @@
+Metadata-Version: 2.3
+Name: graphrag-eval
+Version: 6.4.0
+Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
+License: Apache-2.0
+Author: Philip Ganchev
+Author-email: philip.ganchev@graphwise.ai
+Requires-Python: >=3.12,<3.13
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Provides-Extra: llm
+Requires-Dist: pydantic (==2.12.5)
+Requires-Dist: python-dateutil (==2.9.0.post0)
+Requires-Dist: ragas (==0.4.3) ; extra == "llm"
+Project-URL: Repository, https://github.com/Ontotext-AD/graphrag-eval
+Description-Content-Type: text/markdown
+<p align="center">
+  <img alt="Graphwise Logo" src="https://github.com/Ontotext-AD/graphrag-eval/blob/main/.github/Graphwise_Logo.jpg">
+</p>
+# QA Evaluation
+This is a Python library for assessing the quality of question-answering systems, such as systems built with LLM-based agents. It is agnostic to the agent implementation and the LLM it uses.
+The evaluation is based on a user-provided reference dataset containing queries, reference responses, and optional reference steps, such as expected tool uses. The evaluator compares these references with the agent's actual responses and executed steps. Reference steps can be grouped to allow some expected steps to occur in any order.
+The library provides built-in evaluation metrics and supports user-defined custom metrics ([§ Metrics](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/metrics.md)).
+## Documentation
+- [Quickstart](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/quickstart.md)
+- [Metrics](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/metrics.md)
+- [Configuration](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/config.md)
+- [Input](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/input.md)
+- [Output](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/output.md)
+## Maintainers
+Developed and maintained by [Graphwise](https://graphwise.ai/). For issues and feature requests, please open a [GitHub issue](https://github.com/Ontotext-AD/graphrag-eval/issues).
+## License
+Apache-2.0 License. See the [LICENSE](https://github.com/Ontotext-AD/graphrag-eval/blob/main/LICENSE) file for details.

graphrag_eval-6.4.0/README.md ADDED Viewed

@@ -0,0 +1,27 @@
+<p align="center">
+  <img alt="Graphwise Logo" src="https://github.com/Ontotext-AD/graphrag-eval/blob/main/.github/Graphwise_Logo.jpg">
+</p>
+# QA Evaluation
+This is a Python library for assessing the quality of question-answering systems, such as systems built with LLM-based agents. It is agnostic to the agent implementation and the LLM it uses.
+The evaluation is based on a user-provided reference dataset containing queries, reference responses, and optional reference steps, such as expected tool uses. The evaluator compares these references with the agent's actual responses and executed steps. Reference steps can be grouped to allow some expected steps to occur in any order.
+The library provides built-in evaluation metrics and supports user-defined custom metrics ([§ Metrics](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/metrics.md)).
+## Documentation
+- [Quickstart](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/quickstart.md)
+- [Metrics](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/metrics.md)
+- [Configuration](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/config.md)
+- [Input](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/input.md)
+- [Output](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/output.md)
+## Maintainers
+Developed and maintained by [Graphwise](https://graphwise.ai/). For issues and feature requests, please open a [GitHub issue](https://github.com/Ontotext-AD/graphrag-eval/issues).
+## License
+Apache-2.0 License. See the [LICENSE](https://github.com/Ontotext-AD/graphrag-eval/blob/main/LICENSE) file for details.

{graphrag_eval-6.2.0 → graphrag_eval-6.4.0}/graphrag_eval/aggregation.py RENAMED Viewed

@@ -1,13 +1,13 @@
 import json
-import yaml
 from collections import defaultdict
 from collections.abc import Sequence
 from pathlib import Path
 from statistics import mean, median
 from typing import Any, Collection, Iterable
-from . import evaluation
+import yaml
+from . import evaluation
 METRICS = [
     "answer_recall",
@@ -155,7 +155,7 @@ def compute_micro_stats(
 ) -> dict:
     if custom_metrics is None:
         custom_metrics = []
     values = number_of_samples_per_template_by_status.values()
     micro_summary = defaultdict(dict, {
         "number_of_error_samples": sum(v["error"] for v in values),
@@ -197,7 +197,7 @@ def compute_macro_stats(
 ) -> dict:
     if custom_metrics is None:
         custom_metrics = []
     macro_summary = defaultdict(dict)
     for metric in METRICS + custom_metrics:
         means = [

graphrag_eval-6.4.0/graphrag_eval/answer_correctness.py ADDED Viewed

@@ -0,0 +1,176 @@
+from __future__ import annotations
+from pathlib import Path
+from typing import Any, Self, TYPE_CHECKING
+from pydantic import BaseModel, Field
+from graphrag_eval.util import compute_f1
+from .evaluator import Evaluator
+if TYPE_CHECKING:
+    from ragas.llms.base import InstructorBaseRagasLLM
+def load_default_prompt() -> str:
+    with open(
+        Path(__file__).parent / "prompts" / "template.md",
+        encoding="utf-8"
+    ) as f:
+        return f.read()
+class AnswerCorrectnessConfig(BaseModel):
+    enabled: bool = Field(default=True)
+    prompt: str = Field(default_factory=load_default_prompt)
+class InvalidPromptException(Exception):
+    def __init__(
+        self,
+        message="The prompt template is invalid and cannot be "
+                "formatted."
+    ):
+        self.message = message
+        super().__init__(self.message)
+class AnswerCorrectnessEvaluator:
+    def __init__(
+        self,
+        ragas_llm: InstructorBaseRagasLLM,
+        config: AnswerCorrectnessConfig | None = None,
+    ):
+        self.config = config or AnswerCorrectnessConfig()
+        self.__validate_prompt_template(self.config.prompt)
+        self.prompt_template = self.config.prompt
+        self.ragas_llm = ragas_llm
+    @classmethod
+    def from_config(
+        cls,
+        ragas_llm: InstructorBaseRagasLLM | None,
+        config: AnswerCorrectnessConfig | None
+    ) -> Self | None:
+        if ragas_llm is None:
+            return None
+        if config is None or not config.enabled:
+            return None
+        return cls(ragas_llm=ragas_llm, config=config)
+    @staticmethod
+    def __validate_prompt_template(prompt_template: str):
+        try:
+            prompt_template.format(
+                question="Q?",
+                reference_answer="R",
+                actual_answer="A",
+            )
+        except Exception as exc:
+            raise InvalidPromptException(
+                "Invalid prompt template. Must only contain placeholders: "
+                "{question}, {reference_answer}, and {actual_answer}. "
+                f"Original error: {exc}"
+            ) from exc
+    async def _agenerate(self, prompt):
+        """Wrapper method for easier testing"""
+        return (await self.ragas_llm.agenerate(prompt, None)).choices[0].message.content
+    async def evaluate_answer(
+        self,
+        question: str,
+        reference_answer: str,
+        actual_answer: str
+    ) -> tuple[int, int, int, str]:
+        if any(
+            not s.strip() for s in [question, reference_answer, actual_answer]
+        ):
+            raise ValueError(
+                "The question of the reference or the actual answer is a blank "
+                "string!"
+            )
+        prompt = self.prompt_template.format(
+            question=question,
+            reference_answer=reference_answer,
+            actual_answer=actual_answer,
+        )
+        response_str = await self._agenerate(prompt)
+        return self.extract_response_values(response_str)
+    async def evaluate(
+        self,
+        reference: dict[str, Any],
+        actual: dict[str, Any]
+    ) -> dict[str, Any]:
+        if "actual_answer" not in actual or "reference_answer" not in reference:
+            return {}
+        result = {}
+        try:
+            num_ref_claims, num_actual_claims, num_matching_claims, reason = \
+                await self.evaluate_answer(
+                    reference["question_text"],
+                    reference["reference_answer"],
+                    actual["actual_answer"],
+                )
+            result.update({
+                "answer_reference_claims_count": num_ref_claims,
+                "answer_actual_claims_count": num_actual_claims,
+                "answer_matching_claims_count": num_matching_claims,
+                "answer_correctness_reason": reason,
+            })
+            recall, precision, f1 = self.compute_recall_precision_f1(
+                num_ref_claims, num_actual_claims, num_matching_claims
+            )
+            if recall is not None:
+                result["answer_recall"] = recall
+            if precision is not None:
+                result["answer_precision"] = precision
+            if f1 is not None:
+                result["answer_f1"] = f1
+        except Exception as exc:
+            result["answer_correctness_error"] = str(exc)
+        return result
+    @staticmethod
+    def compute_recall_precision_f1(
+        n_pos: int,
+        n_pred_pos: int,
+        n_true_pos: int,
+    ) -> tuple[float | None, float | None, float | None]:
+        recall = None
+        precision = None
+        if n_pos:
+            recall = n_true_pos / n_pos
+        if n_pred_pos:
+            precision = n_true_pos / n_pred_pos
+        return recall, precision, compute_f1(recall, precision)
+    @staticmethod
+    def extract_response_values(
+        response: str
+    ) -> tuple[int, int, int, str]:
+        vals = response.split("\t")
+        n = len(vals)
+        if n < 4:
+            raise ValueError(f"Expected 4 tab-separated values: {response}")
+        vals = vals[:4]
+        try:
+            n_ref, n_actual, n_matching = map(int, vals[:3])
+        except ValueError:
+            raise ValueError(f"Claims counts should be ints: {vals}")
+        if any([
+            n_ref < 1,
+            n_actual < 1,
+            n_matching < 0,
+            n_matching > n_ref,
+            n_matching > n_actual
+        ]):
+            raise ValueError(
+                "Invalid claims counts combination: "
+                f"{n_ref}\t{n_actual}\t{n_matching}"
+            )
+        return n_ref, n_actual, n_matching, vals[3]
+_: Evaluator = AnswerCorrectnessEvaluator

graphrag_eval-6.4.0/graphrag_eval/answer_relevance.py ADDED Viewed

@@ -0,0 +1,61 @@
+from __future__ import annotations
+from typing import Any, Self, TYPE_CHECKING
+from pydantic import BaseModel, Field
+from .evaluator import Evaluator
+if TYPE_CHECKING:
+    from ragas.llms.base import InstructorBaseRagasLLM
+    from ragas.embeddings.base import BaseRagasEmbeddings, BaseRagasEmbedding
+class AnswerRelevanceConfig(BaseModel):
+    enabled: bool = Field(default=True)
+class AnswerRelevanceEvaluator:
+    def __init__(
+        self,
+        ragas_llm: InstructorBaseRagasLLM,
+        ragas_embedder: BaseRagasEmbeddings | BaseRagasEmbedding
+    ):
+        from ragas.metrics.collections import AnswerRelevancy
+        self.scorer = AnswerRelevancy(llm=ragas_llm, embeddings=ragas_embedder)
+    @classmethod
+    def from_config(
+        cls,
+        ragas_llm: InstructorBaseRagasLLM | None,
+        ragas_embedder: BaseRagasEmbeddings | BaseRagasEmbedding | None,
+        config: AnswerRelevanceConfig | None
+    ) -> Self | None:
+        if ragas_llm is None or ragas_embedder is None:
+            return None
+        if config is None or not config.enabled:
+            return None
+        return cls(ragas_llm=ragas_llm, ragas_embedder=ragas_embedder)
+    async def evaluate(
+        self,
+        reference: dict[str, Any],
+        actual: dict[str, Any]
+    ) -> dict[str, Any]:
+        if "actual_answer" not in actual:
+            return {}
+        try:
+            result = await self.scorer.ascore(
+                user_input=reference["question_text"],
+                response=actual["actual_answer"]
+            )
+            return {
+                "answer_relevance": result.value
+            }
+        except Exception as e:
+            return {
+                "answer_relevance_error": str(e)
+            }
+_: Evaluator = AnswerRelevanceEvaluator

graphrag_eval-6.4.0/graphrag_eval/cli/answer_correctness.py ADDED Viewed

@@ -0,0 +1,122 @@
+from __future__ import annotations
+import argparse
+import asyncio
+import csv
+from argparse import ArgumentParser
+from pathlib import Path
+from typing import TYPE_CHECKING
+from tqdm import tqdm
+from graphrag_eval import llm_factory
+from graphrag_eval.answer_correctness import AnswerCorrectnessEvaluator
+from graphrag_eval.evaluation import Config
+if TYPE_CHECKING:
+    from ragas.llms.base import InstructorBaseRagasLLM
+def parse_args() -> argparse.Namespace:
+    parser = ArgumentParser(
+        description="Calculates answer correctness over the entries from the "
+                    "input tsv file and stores the output in the output tsv "
+                    "file.",
+    )
+    parser.add_argument(
+        "-i",
+        "--input-tsv-file-path",
+        type=Path,
+        required=True,
+        help="Input tsv file path with columns `Question`, `Reference answer` "
+             "and `Actual answer`",
+    )
+    parser.add_argument(
+        "-o",
+        "--output-tsv-file-path",
+        type=Path,
+        required=True,
+        help="Output tsv file path with columns `#Reference`, `#PTarget`, "
+             "`#Matching`, `Reasoning`, `Error`",
+    )
+    parser.add_argument(
+        "-c",
+        "--config-yaml-file-path",
+        type=Path,
+        required=True,
+        help="Config yaml file path with definition of the LLM to use and "
+             "optionally a custom prompt.",
+    )
+    return parser.parse_args()
+async def evaluate_and_write(
+    input_tsv_file_path: Path,
+    output_tsv_file_path: Path,
+    evaluator: AnswerCorrectnessEvaluator,
+) -> None:
+    with open(input_tsv_file_path, encoding="utf-8") as f:
+        reader = csv.DictReader(f, delimiter="\t")
+        rows = [row for row in reader]
+    print(f"Writing results to {output_tsv_file_path}")
+    output_tsv_file_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_tsv_file_path, "w", encoding="utf-8") as f:
+        writer = csv.writer(f, delimiter="\t")
+        writer.writerow(
+            ["#Reference", "#PTarget", "#Matching", "Reasoning", "Error"]
+        )
+        for row in tqdm(rows):
+            if "Question" not in row or \
+                "Reference answer" not in row or \
+                "Actual answer" not in row:
+                raise ValueError("Unexpected input format!")
+            try:
+                vals = await evaluator.evaluate_answer(
+                    row["Question"],
+                    row["Reference answer"],
+                    row["Actual answer"]
+                )
+                vals = vals + ("",)
+                writer.writerow(vals)
+            except Exception as exc:
+                writer.writerow(["", "", "", "", str(exc)])
+            f.flush()
+def run(
+    config_yaml_file_path: Path,
+    input_tsv_file_path: Path,
+    output_tsv_file_path: Path,
+):
+    config = Config.parse(config_yaml_file_path)
+    ragas_llm: InstructorBaseRagasLLM | None = llm_factory.create_llm(
+        config.llm
+    )
+    if ragas_llm is None:
+        raise ValueError(
+            "LLM must be configured to calculate the answer correctness!"
+        )
+    if config.answer_correctness and not config.answer_correctness.enabled:
+        raise ValueError(
+            "Can't disable answer correctness, when running this script!"
+        )
+    evaluator = AnswerCorrectnessEvaluator(
+        ragas_llm=ragas_llm,
+        config=config.answer_correctness,
+    )
+    asyncio.run(evaluate_and_write(
+        input_tsv_file_path,
+        output_tsv_file_path,
+        evaluator,
+    ))
+def main():
+    args = parse_args()
+    run(
+        args.config_yaml_file_path,
+        args.input_tsv_file_path,
+        args.output_tsv_file_path,
+    )

{graphrag_eval-6.2.0 → graphrag_eval-6.4.0}/graphrag_eval/custom_evaluation.py RENAMED Viewed

@@ -1,9 +1,14 @@
+from __future__ import annotations
 import json
-from typing import Literal
+from typing import Literal, Self, TYPE_CHECKING, Any
 from pydantic import BaseModel, ConfigDict, Field, model_validator
-from graphrag_eval.llm_factory import create_llm
+from .evaluator import Evaluator
+if TYPE_CHECKING:
+    from ragas.llms.base import InstructorBaseRagasLLM
 RESERVED_KEYS = {
     "template_id",
@@ -43,7 +48,7 @@ Inputs = Literal[
 StepsKey = Literal["args", "output"]
-class Config(BaseModel):
+class EvaluatorConfig(BaseModel):
     model_config = ConfigDict(extra='forbid')
     name: str
     inputs: list[Inputs] = Field(..., min_length=1)
@@ -53,7 +58,7 @@ class Config(BaseModel):
     steps_keys: set[StepsKey] | None = Field(default=None, min_length=1)
     @model_validator(mode='after')
-    def validate_step_dependencies(self) -> 'Config':
+    def validate_step_dependencies(self) -> Self:
         if set(self.inputs) & {"reference_steps", "actual_steps"}:
             suffix = "is required when steps are in inputs"
             for var_name in ["steps_name", "steps_keys"]:
@@ -62,7 +67,7 @@ class Config(BaseModel):
         return self
     @model_validator(mode='after')
-    def validate_name_and_outputs(self) -> 'Config':
+    def validate_name_and_outputs(self) -> Self:
         if self.name + "_error" in RESERVED_KEYS:
             raise ValueError(f"Name {self.name} is reserved")
         conflicting_keys = set(self.outputs.keys()) & RESERVED_KEYS
@@ -76,7 +81,7 @@ def create_input_template(input_key: str) -> str:
     return f"# {header}\n{{{input_key}}}"
-def create_prompt_template(config: Config, output_variables: list[str]) -> str:
+def create_prompt_template(config: EvaluatorConfig, output_variables: list[str]) -> str:
     """
     Return a template for the LLM prompt, with placeholders for the inputs,
     instructions, outputs etc. We use this template at evaluation time to
@@ -99,8 +104,8 @@ def create_prompt_template(config: Config, output_variables: list[str]) -> str:
 class CustomEvaluator:
     def __init__(
         self,
-        config: Config,
-        eval_config: "evaluation.Config",
+        ragas_llm: InstructorBaseRagasLLM,
+        config: EvaluatorConfig,
     ):
         self.name = config.name
         self.input_variables = config.inputs
@@ -111,11 +116,24 @@ class CustomEvaluator:
             config,
             self.output_variables
         )
-        self.llm = create_llm(eval_config)
+        self.ragas_llm = ragas_llm
+    @classmethod
+    def from_config(
+        cls,
+        ragas_llm: InstructorBaseRagasLLM | None,
+        evaluation_configs: list[EvaluatorConfig] | None
+    ) -> list[Self]:
+        if ragas_llm and evaluation_configs:
+            return [
+                cls(ragas_llm, evaluation_config)
+                for evaluation_config in evaluation_configs
+            ]
+        return []
     async def _agenerate(self, prompt: str) -> str:
         """Wrapper method for easier testing"""
-        return (await self.llm.agenerate(prompt, None)).choices[0].message.content
+        return (await self.ragas_llm.agenerate(prompt, None)).choices[0].message.content
     def format_steps(self, steps: list) -> str:
         steps_formatted = []
@@ -157,7 +175,11 @@ class CustomEvaluator:
             return result
         return self.error(f"Expected {n_exp} tab-separated values, got: {response}")
-    async def evaluate(self, reference: dict, actual: dict) -> dict[str, str | None]:
+    async def evaluate(
+        self,
+        reference: dict[str, Any],
+        actual: dict[str, Any]
+    ) -> dict[str, Any]:
         inputs = {}
         if "question" in self.input_variables:
             if "question_text" not in reference:
@@ -195,10 +217,4 @@ class CustomEvaluator:
         return self.parse_outputs(response)
-def create_evaluators(config: "evaluation.Config") -> list[CustomEvaluator]:
-    if config.custom_evaluations and config.llm:
-        return [
-            CustomEvaluator(custom_evaluation_config, config)
-            for custom_evaluation_config in config.custom_evaluations
-        ]
-    return []
+_: Evaluator = CustomEvaluator

graphrag-eval 6.2.0__tar.gz → 6.4.0__tar.gz

graphrag-eval 6.2.0__tar.gz → 6.4.0__tar.gz