PyPI - graphrag-eval - Version diff - 6.0.0.tar.gz → 6.2.0.tar.gz - Mend

graphrag-eval 6.0.0.tar.gz → 6.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

{graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: graphrag-eval
-Version: 6.0.0
+Version: 6.2.0
 Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
 License: Apache-2.0
 Author: Philip Ganchev
@@ -1167,8 +1167,9 @@ the steps are matching.
 - if both are named "retrieve_data_points", then we check if the arguments of
 the steps are matching.
 - if the reference step is named "iri_discovery" and the actual step name is
-"autocomplete_search", тhen check if the IRI specified as "output" of the
-"iri_discovery" step is present in the "output" of the "autocomplete_search".
+"autocomplete_search" or "sparql_query", then check if the IRI specified as
+"output" of the "iri_discovery" step is present in the "output" of the
+  actual step.
 - if the reference and actual step names are the same and the
 "output_media_type" of the reference step is "application/json", then the steps
 match, if the json outputs are the same.

{graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/README.md RENAMED Viewed

@@ -1149,8 +1149,9 @@ the steps are matching.
 - if both are named "retrieve_data_points", then we check if the arguments of
 the steps are matching.
 - if the reference step is named "iri_discovery" and the actual step name is
-"autocomplete_search", тhen check if the IRI specified as "output" of the
-"iri_discovery" step is present in the "output" of the "autocomplete_search".
+"autocomplete_search" or "sparql_query", then check if the IRI specified as
+"output" of the "iri_discovery" step is present in the "output" of the
+  actual step.
 - if the reference and actual step names are the same and the
 "output_media_type" of the reference step is "application/json", then the steps
 match, if the json outputs are the same.

{graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/answer_correctness.py RENAMED Viewed

@@ -1,13 +1,13 @@
+import asyncio
 import csv
 from pathlib import Path
 from tqdm import tqdm
-from graphrag_eval import llm
+from graphrag_eval import llm_factory
 from graphrag_eval.evaluation import Config
 from graphrag_eval.util import compute_f1, singleton
 IN_FILE_PATH = "../data/data-1.tsv"
 PROMPT_FILE_PATH = Path(__file__).parent / "prompts" / "template.md"
 OUT_FILE_PATH = "results/data-1.tsv"
@@ -26,17 +26,17 @@ def parse_args() -> "argparse.Namespace":
             f = float(value)
         except ValueError:
             raise ArgumentTypeError(f"Invalid float value: {value}")
         if f <= 0.0 or f >= 2.0:
             raise ArgumentTypeError(f"Value must be between 0.0 and 2.0, got {f}")
         return f
-    parser = ArgumentParser()
+    parser = ArgumentParser()
     parser.add_argument("-i", "--in-file", type=str, default=IN_FILE_PATH)
     parser.add_argument("-o", "--out-file", type=str, default=OUT_FILE_PATH)
     parser.add_argument("-p", "--provider", type=str, default=LLM_PROVIDER)
     parser.add_argument("-l", "--llm", type=str, default=LLM_MODEL)
-    parser.add_argument("-m", "--max-tokens", type=int, default=MAX_TOKENS)
+    parser.add_argument("-m", "--max-tokens", type=int, default=MAX_TOKENS)
     parser.add_argument(
         "-t",
         "--temperature",
@@ -97,11 +97,11 @@ class AnswerCorrectnessEvaluator:
             self.prompt_template = f.read()
         self.llm = llm
-    def _generate(self, prompt):
+    async def _agenerate(self, prompt):
         """Wrapper method for easier testing"""
-        return self.llm.generate(prompt, None).choices[0].message.content
+        return (await self.llm.agenerate(prompt, None)).choices[0].message.content
-    def evaluate_answer(
+    async def evaluate_answer(
         self,
         question: str,
         reference_answer: str,
@@ -112,21 +112,21 @@ class AnswerCorrectnessEvaluator:
             reference_answer=reference_answer,
             candidate_answer=actual_answer,
         )
-        response_str = self._generate(prompt)
+        response_str = await self._agenerate(prompt)
         return extract_response_values(response_str)
-    def get_correctness_dict(
+    async def get_correctness_dict(
         self,
         reference: dict,
         actual: dict,
     ):
         result = {"reference_answer": reference["reference_answer"]}
         num_ref_claims, num_actual_claims, num_matching_claims, reason, error = \
-        self.evaluate_answer(
-            reference["question_text"],
-            reference["reference_answer"],
-            actual["actual_answer"],
-        )
+            await self.evaluate_answer(
+                reference["question_text"],
+                reference["reference_answer"],
+                actual["actual_answer"],
+            )
         if error:
             result["answer_eval_error"] = error
         else:
@@ -148,12 +148,12 @@ class AnswerCorrectnessEvaluator:
         return result
-def evaluate_and_write(
+async def evaluate_and_write(
     in_file_path: str | Path,
     out_file_path: str | Path,
     config: "evaluation.Config",
 ) -> None:
-    ragas_llm = llm.create_llm(config)
+    ragas_llm = llm_factory.create_llm(config)
     evaluator = AnswerCorrectnessEvaluator(llm=ragas_llm)
     with open(in_file_path, encoding="utf-8") as f:
         reader = csv.DictReader(f, delimiter="\t")
@@ -164,7 +164,7 @@ def evaluate_and_write(
         writer = csv.writer(f, delimiter="\t")
         writer.writerow(OUT_FIELDS)
         for row in tqdm(rows):
-            vals = evaluator.evaluate_answer(
+            vals = await evaluator.evaluate_answer(
                 row["Question"],
                 row["Reference answer"],
                 row["Actual answer"]
@@ -176,8 +176,8 @@ def evaluate_and_write(
 def main():
     args = parse_args()
     config = Config(
-        llm=llm.Config(
-            generation=llm.GenerationConfig(
+        llm=llm_factory.Config(
+            generation=llm_factory.GenerationConfig(
                 provider=args.provider,
                 model=args.llm,
                 temperature=args.temperature,
@@ -185,8 +185,8 @@ def main():
             )
         )
     )
-    evaluate_and_write(
+    asyncio.run(evaluate_and_write(
         args.in_file,
         args.out_file,
         config,
-    )
+    ))

{graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/custom_evaluation.py RENAMED Viewed

@@ -3,6 +3,7 @@ from typing import Literal
 from pydantic import BaseModel, ConfigDict, Field, model_validator
+from graphrag_eval.llm_factory import create_llm
 RESERVED_KEYS = {
     "template_id",
@@ -31,7 +32,6 @@ RESERVED_KEYS = {
     "elapsed_sec",
 }
 Inputs = Literal[
     "question",
     "reference_answer",
@@ -84,8 +84,8 @@ def create_prompt_template(config: Config, output_variables: list[str]) -> str:
     output_variables specifies the order of the outputs.
     """
-    output_instructions = "Output the following values separated by tabs:"\
-        + "".join(f"\n- {k}: {config.outputs[k]}" for k in output_variables)
+    output_instructions = "Output the following values separated by tabs:" \
+                          + "".join(f"\n- {k}: {config.outputs[k]}" for k in output_variables)
     inputs_template = "\n\n".join(
         create_input_template(k) for k in config.inputs
     )
@@ -98,9 +98,9 @@ def create_prompt_template(config: Config, output_variables: list[str]) -> str:
 class CustomEvaluator:
     def __init__(
-        self,
+        self,
         config: Config,
-        llm: "InstructorBaseRagasLLM",
+        eval_config: "evaluation.Config",
     ):
         self.name = config.name
         self.input_variables = config.inputs
@@ -111,11 +111,11 @@ class CustomEvaluator:
             config,
             self.output_variables
         )
-        self.llm = llm
+        self.llm = create_llm(eval_config)
-    def _generate(self, prompt: str) -> str:
+    async def _agenerate(self, prompt: str) -> str:
         """Wrapper method for easier testing"""
-        return self.llm.generate(prompt, None).choices[0].message.content
+        return (await self.llm.agenerate(prompt, None)).choices[0].message.content
     def format_steps(self, steps: list) -> str:
         steps_formatted = []
@@ -134,9 +134,9 @@ class CustomEvaluator:
                         step_out[k] = val
                 else:
                     step_out[k] = val
-            steps_formatted.append(step_out)
+            steps_formatted.append(step_out)
         return json.dumps(steps_formatted, indent=2)
     def error(self, msg: str) -> dict:
         result = {k: None for k in self.output_variables}
         result[self.name + '_error'] = msg
@@ -157,7 +157,7 @@ class CustomEvaluator:
             return result
         return self.error(f"Expected {n_exp} tab-separated values, got: {response}")
-    def evaluate(self, reference: dict, actual: dict) -> dict[str, str | None]:
+    async def evaluate(self, reference: dict, actual: dict) -> dict[str, str | None]:
         inputs = {}
         if "question" in self.input_variables:
             if "question_text" not in reference:
@@ -176,7 +176,7 @@ class CustomEvaluator:
                 return self.error("Reference missing key 'reference_steps'")
             try:
                 formatted_steps_lists = [
-                    self.format_steps(group)
+                    self.format_steps(group)
                     for group in reference["reference_steps"]
                 ]
             except json.JSONDecodeError:
@@ -191,14 +191,14 @@ class CustomEvaluator:
                 return self.error("Malformed actual step JSON")
             inputs["actual_steps"] = formatted_steps_lists
         prompt = self.prompt_template.format(**inputs)
-        response = self._generate(prompt)
+        response = await self._agenerate(prompt)
         return self.parse_outputs(response)
 def create_evaluators(config: "evaluation.Config") -> list[CustomEvaluator]:
     if config.custom_evaluations and config.llm:
         return [
-            CustomEvaluator(c, config.llm.generation)
-            for c in config.custom_evaluations
+            CustomEvaluator(custom_evaluation_config, config)
+            for custom_evaluation_config in config.custom_evaluations
         ]
     return []

{graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/evaluation.py RENAMED Viewed

@@ -4,7 +4,7 @@ import yaml
 from pydantic import BaseModel, Field, model_validator
 from . import custom_evaluation
-from .llm import Config as LLMConfig, create_llm, create_embedder
+from .llm_factory import Config as LLMConfig, create_llm, create_embedder
 from .steps.evaluation import evaluate_steps
@@ -19,7 +19,7 @@ class Config(BaseModel):
             msg = "llm config is required if custom_evaluations are provided"
             raise ValueError(msg)
         return self
     @classmethod
     def parse(cls, config_file_path: str | Path | None) -> "Config":
         if config_file_path:
@@ -78,7 +78,7 @@ async def run_evaluation(
                         llm=ragas_llm
                     )
                     eval_result.update(
-                        answer_correctness_evaluator.get_correctness_dict(
+                        await answer_correctness_evaluator.get_correctness_dict(
                             question,
                             actual_result,
                         )
@@ -90,8 +90,8 @@ async def run_evaluation(
                     ragas_llm,
                 )
             )
-            for relevance_evaluator in custom_evaluators:
-                custom_metrics = relevance_evaluator.evaluate(question, actual_result)
+            for custom_evaluator in custom_evaluators:
+                custom_metrics = await custom_evaluator.evaluate(question, actual_result)
                 eval_result.update(**custom_metrics)
             for key in "input_tokens", "output_tokens", "total_tokens", "elapsed_sec":
                 if key in actual_result:

graphrag_eval-6.0.0/graphrag_eval/llm.py → graphrag_eval-6.2.0/graphrag_eval/llm_factory.py RENAMED Viewed

@@ -37,6 +37,7 @@ def create_llm(config: "evaluation.Config") -> Optional["InstructorBaseRagasLLM"
         )
         ragas_llm.is_async = True
         return ragas_llm
+    return None
 def create_embedder(config: "evaluation.Config") -> Optional["BaseRagasEmbedding"]:
@@ -53,3 +54,4 @@ def create_embedder(config: "evaluation.Config") -> Optional["BaseRagasEmbedding
             **params,
         )
         return ragas_embedder
+    return None

graphrag_eval-6.2.0/graphrag_eval/steps/iri_discovery.py ADDED Viewed

@@ -0,0 +1,21 @@
+import json
+from typing import Any
+def do_iri_discovery_steps_equal(
+    reference_step: dict[str, Any],
+    actual_step: dict[str, Any],
+) -> bool:
+    if actual_step["name"] == "autocomplete_search":
+        reference_iri = reference_step["output"]
+        actual_output = json.loads(actual_step["output"])
+        for binding in actual_output["results"]["bindings"]:
+            for _, type_value in binding.items():
+                if type_value["type"] == "uri" and type_value["value"] == reference_iri:
+                    return True
+    elif actual_step["name"] == "sparql_query":
+        reference_iri = reference_step["output"]
+        if reference_iri in actual_step["output"]:
+            return True
+    return False

{graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "graphrag-eval"
-version = "6.0.0"
+version = "6.2.0"
 description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
 authors = [
     { name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
@@ -24,7 +24,7 @@ llm = ["ragas", "litellm", "pyyaml"]
 [tool.poetry.group.llm.dependencies]
 ragas = "0.4.3"
-litellm = "1.83.14"
+litellm = "1.85.1"
 pyyaml = "6.0.3"
 [tool.poetry.group.llm]

graphrag_eval-6.0.0/graphrag_eval/steps/iri_discovery.py DELETED Viewed

@@ -1,20 +0,0 @@
-import json
-from typing import Any
-def do_iri_discovery_steps_equal(
-    reference_step: dict[str, Any],
-    actual_step: dict[str, Any],
-) -> bool:
-    if actual_step["name"] != "autocomplete_search":
-        return False
-    reference_iri = reference_step["output"]
-    actual_output = json.loads(actual_step["output"])
-    for binding in actual_output["results"]["bindings"]:
-        for _, type_value in binding.items():
-            if type_value["type"] == "uri" and type_value["value"] == reference_iri:
-                return True
-    return False