graphrag-eval 6.3.0__tar.gz → 6.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/PKG-INFO +1 -1
  2. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/aggregation.py +4 -4
  3. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/answer_correctness.py +52 -15
  4. graphrag_eval-6.4.0/graphrag_eval/answer_relevance.py +61 -0
  5. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/cli/answer_correctness.py +36 -19
  6. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/custom_evaluation.py +34 -18
  7. graphrag_eval-6.4.0/graphrag_eval/evaluation.py +154 -0
  8. graphrag_eval-6.4.0/graphrag_eval/evaluator.py +14 -0
  9. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/llm_factory.py +18 -8
  10. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/steps/evaluation.py +11 -3
  11. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/pyproject.toml +1 -1
  12. graphrag_eval-6.3.0/graphrag_eval/answer_relevance.py +0 -29
  13. graphrag_eval-6.3.0/graphrag_eval/evaluation.py +0 -104
  14. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/LICENSE +0 -0
  15. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/README.md +0 -0
  16. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/__init__.py +0 -0
  17. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/cli/__init__.py +0 -0
  18. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/prompts/template.md +0 -0
  19. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/steps/__init__.py +0 -0
  20. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/steps/iri_discovery.py +0 -0
  21. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/steps/retrieval_answer.py +0 -0
  22. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/steps/retrieval_context_ids.py +0 -0
  23. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/steps/retrieval_context_texts.py +0 -0
  24. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/steps/sparql.py +0 -0
  25. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/steps/timeseries.py +0 -0
  26. {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/util.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: graphrag-eval
3
- Version: 6.3.0
3
+ Version: 6.4.0
4
4
  Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
5
5
  License: Apache-2.0
6
6
  Author: Philip Ganchev
@@ -1,13 +1,13 @@
1
1
  import json
2
- import yaml
3
2
  from collections import defaultdict
4
3
  from collections.abc import Sequence
5
4
  from pathlib import Path
6
5
  from statistics import mean, median
7
6
  from typing import Any, Collection, Iterable
8
7
 
9
- from . import evaluation
8
+ import yaml
10
9
 
10
+ from . import evaluation
11
11
 
12
12
  METRICS = [
13
13
  "answer_recall",
@@ -155,7 +155,7 @@ def compute_micro_stats(
155
155
  ) -> dict:
156
156
  if custom_metrics is None:
157
157
  custom_metrics = []
158
-
158
+
159
159
  values = number_of_samples_per_template_by_status.values()
160
160
  micro_summary = defaultdict(dict, {
161
161
  "number_of_error_samples": sum(v["error"] for v in values),
@@ -197,7 +197,7 @@ def compute_macro_stats(
197
197
  ) -> dict:
198
198
  if custom_metrics is None:
199
199
  custom_metrics = []
200
-
200
+
201
201
  macro_summary = defaultdict(dict)
202
202
  for metric in METRICS + custom_metrics:
203
203
  means = [
@@ -1,21 +1,36 @@
1
+ from __future__ import annotations
2
+
1
3
  from pathlib import Path
4
+ from typing import Any, Self, TYPE_CHECKING
2
5
 
3
6
  from pydantic import BaseModel, Field
4
7
 
5
8
  from graphrag_eval.util import compute_f1
9
+ from .evaluator import Evaluator
10
+
11
+ if TYPE_CHECKING:
12
+ from ragas.llms.base import InstructorBaseRagasLLM
6
13
 
7
14
 
8
15
  def load_default_prompt() -> str:
9
- with open(Path(__file__).parent / "prompts" / "template.md", "r", encoding="utf-8") as f:
16
+ with open(
17
+ Path(__file__).parent / "prompts" / "template.md",
18
+ encoding="utf-8"
19
+ ) as f:
10
20
  return f.read()
11
21
 
12
22
 
13
23
  class AnswerCorrectnessConfig(BaseModel):
24
+ enabled: bool = Field(default=True)
14
25
  prompt: str = Field(default_factory=load_default_prompt)
15
26
 
16
27
 
17
28
  class InvalidPromptException(Exception):
18
- def __init__(self, message="The prompt template is invalid and cannot be formatted."):
29
+ def __init__(
30
+ self,
31
+ message="The prompt template is invalid and cannot be "
32
+ "formatted."
33
+ ):
19
34
  self.message = message
20
35
  super().__init__(self.message)
21
36
 
@@ -23,13 +38,25 @@ class InvalidPromptException(Exception):
23
38
  class AnswerCorrectnessEvaluator:
24
39
  def __init__(
25
40
  self,
26
- llm: "InstructorBaseRagasLLM",
41
+ ragas_llm: InstructorBaseRagasLLM,
27
42
  config: AnswerCorrectnessConfig | None = None,
28
43
  ):
29
44
  self.config = config or AnswerCorrectnessConfig()
30
45
  self.__validate_prompt_template(self.config.prompt)
31
46
  self.prompt_template = self.config.prompt
32
- self.llm = llm
47
+ self.ragas_llm = ragas_llm
48
+
49
+ @classmethod
50
+ def from_config(
51
+ cls,
52
+ ragas_llm: InstructorBaseRagasLLM | None,
53
+ config: AnswerCorrectnessConfig | None
54
+ ) -> Self | None:
55
+ if ragas_llm is None:
56
+ return None
57
+ if config is None or not config.enabled:
58
+ return None
59
+ return cls(ragas_llm=ragas_llm, config=config)
33
60
 
34
61
  @staticmethod
35
62
  def __validate_prompt_template(prompt_template: str):
@@ -48,7 +75,7 @@ class AnswerCorrectnessEvaluator:
48
75
 
49
76
  async def _agenerate(self, prompt):
50
77
  """Wrapper method for easier testing"""
51
- return (await self.llm.agenerate(prompt, None)).choices[0].message.content
78
+ return (await self.ragas_llm.agenerate(prompt, None)).choices[0].message.content
52
79
 
53
80
  async def evaluate_answer(
54
81
  self,
@@ -56,9 +83,13 @@ class AnswerCorrectnessEvaluator:
56
83
  reference_answer: str,
57
84
  actual_answer: str
58
85
  ) -> tuple[int, int, int, str]:
59
- if any(not s.strip() for s in [question, reference_answer, actual_answer]):
60
- raise ValueError("The question of the reference or the actual answer is a blank "
61
- "string!")
86
+ if any(
87
+ not s.strip() for s in [question, reference_answer, actual_answer]
88
+ ):
89
+ raise ValueError(
90
+ "The question of the reference or the actual answer is a blank "
91
+ "string!"
92
+ )
62
93
  prompt = self.prompt_template.format(
63
94
  question=question,
64
95
  reference_answer=reference_answer,
@@ -67,12 +98,14 @@ class AnswerCorrectnessEvaluator:
67
98
  response_str = await self._agenerate(prompt)
68
99
  return self.extract_response_values(response_str)
69
100
 
70
- async def get_correctness_dict(
101
+ async def evaluate(
71
102
  self,
72
- reference: dict,
73
- actual: dict,
74
- ):
75
- result = {"reference_answer": reference["reference_answer"]}
103
+ reference: dict[str, Any],
104
+ actual: dict[str, Any]
105
+ ) -> dict[str, Any]:
106
+ if "actual_answer" not in actual or "reference_answer" not in reference:
107
+ return {}
108
+ result = {}
76
109
  try:
77
110
  num_ref_claims, num_actual_claims, num_matching_claims, reason = \
78
111
  await self.evaluate_answer(
@@ -96,7 +129,7 @@ class AnswerCorrectnessEvaluator:
96
129
  if f1 is not None:
97
130
  result["answer_f1"] = f1
98
131
  except Exception as exc:
99
- result["answer_eval_error"] = str(exc)
132
+ result["answer_correctness_error"] = str(exc)
100
133
  return result
101
134
 
102
135
  @staticmethod
@@ -134,6 +167,10 @@ class AnswerCorrectnessEvaluator:
134
167
  n_matching > n_actual
135
168
  ]):
136
169
  raise ValueError(
137
- f"Invalid claims counts combination: {n_ref}\t{n_actual}\t{n_matching}"
170
+ "Invalid claims counts combination: "
171
+ f"{n_ref}\t{n_actual}\t{n_matching}"
138
172
  )
139
173
  return n_ref, n_actual, n_matching, vals[3]
174
+
175
+
176
+ _: Evaluator = AnswerCorrectnessEvaluator
@@ -0,0 +1,61 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Self, TYPE_CHECKING
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+ from .evaluator import Evaluator
8
+
9
+ if TYPE_CHECKING:
10
+ from ragas.llms.base import InstructorBaseRagasLLM
11
+ from ragas.embeddings.base import BaseRagasEmbeddings, BaseRagasEmbedding
12
+
13
+
14
+ class AnswerRelevanceConfig(BaseModel):
15
+ enabled: bool = Field(default=True)
16
+
17
+
18
+ class AnswerRelevanceEvaluator:
19
+ def __init__(
20
+ self,
21
+ ragas_llm: InstructorBaseRagasLLM,
22
+ ragas_embedder: BaseRagasEmbeddings | BaseRagasEmbedding
23
+ ):
24
+ from ragas.metrics.collections import AnswerRelevancy
25
+ self.scorer = AnswerRelevancy(llm=ragas_llm, embeddings=ragas_embedder)
26
+
27
+ @classmethod
28
+ def from_config(
29
+ cls,
30
+ ragas_llm: InstructorBaseRagasLLM | None,
31
+ ragas_embedder: BaseRagasEmbeddings | BaseRagasEmbedding | None,
32
+ config: AnswerRelevanceConfig | None
33
+ ) -> Self | None:
34
+ if ragas_llm is None or ragas_embedder is None:
35
+ return None
36
+ if config is None or not config.enabled:
37
+ return None
38
+ return cls(ragas_llm=ragas_llm, ragas_embedder=ragas_embedder)
39
+
40
+ async def evaluate(
41
+ self,
42
+ reference: dict[str, Any],
43
+ actual: dict[str, Any]
44
+ ) -> dict[str, Any]:
45
+ if "actual_answer" not in actual:
46
+ return {}
47
+ try:
48
+ result = await self.scorer.ascore(
49
+ user_input=reference["question_text"],
50
+ response=actual["actual_answer"]
51
+ )
52
+ return {
53
+ "answer_relevance": result.value
54
+ }
55
+ except Exception as e:
56
+ return {
57
+ "answer_relevance_error": str(e)
58
+ }
59
+
60
+
61
+ _: Evaluator = AnswerRelevanceEvaluator
@@ -1,8 +1,11 @@
1
+ from __future__ import annotations
2
+
1
3
  import argparse
2
4
  import asyncio
3
5
  import csv
4
6
  from argparse import ArgumentParser
5
7
  from pathlib import Path
8
+ from typing import TYPE_CHECKING
6
9
 
7
10
  from tqdm import tqdm
8
11
 
@@ -10,34 +13,39 @@ from graphrag_eval import llm_factory
10
13
  from graphrag_eval.answer_correctness import AnswerCorrectnessEvaluator
11
14
  from graphrag_eval.evaluation import Config
12
15
 
16
+ if TYPE_CHECKING:
17
+ from ragas.llms.base import InstructorBaseRagasLLM
18
+
13
19
 
14
20
  def parse_args() -> argparse.Namespace:
15
21
  parser = ArgumentParser(
16
- description="Calculates answer correctness over the entries from the input tsv file and "
17
- "stores the output in the output tsv file.",
22
+ description="Calculates answer correctness over the entries from the "
23
+ "input tsv file and stores the output in the output tsv "
24
+ "file.",
18
25
  )
19
26
  parser.add_argument(
20
27
  "-i",
21
28
  "--input-tsv-file-path",
22
29
  type=Path,
23
30
  required=True,
24
- help="Input tsv file path with columns `Question`, `Reference answer` and `Actual answer`",
31
+ help="Input tsv file path with columns `Question`, `Reference answer` "
32
+ "and `Actual answer`",
25
33
  )
26
34
  parser.add_argument(
27
35
  "-o",
28
36
  "--output-tsv-file-path",
29
37
  type=Path,
30
38
  required=True,
31
- help="Output tsv file path with columns `#Reference`, `#PTarget`, `#Matching`, "
32
- "`Reasoning`, `Error`",
39
+ help="Output tsv file path with columns `#Reference`, `#PTarget`, "
40
+ "`#Matching`, `Reasoning`, `Error`",
33
41
  )
34
42
  parser.add_argument(
35
43
  "-c",
36
44
  "--config-yaml-file-path",
37
45
  type=Path,
38
46
  required=True,
39
- help="Config yaml file path with definition of the LLM to use and optionally a custom "
40
- "prompt.",
47
+ help="Config yaml file path with definition of the LLM to use and "
48
+ "optionally a custom prompt.",
41
49
  )
42
50
  return parser.parse_args()
43
51
 
@@ -54,7 +62,9 @@ async def evaluate_and_write(
54
62
  output_tsv_file_path.parent.mkdir(parents=True, exist_ok=True)
55
63
  with open(output_tsv_file_path, "w", encoding="utf-8") as f:
56
64
  writer = csv.writer(f, delimiter="\t")
57
- writer.writerow(["#Reference", "#PTarget", "#Matching", "Reasoning", "Error"])
65
+ writer.writerow(
66
+ ["#Reference", "#PTarget", "#Matching", "Reasoning", "Error"]
67
+ )
58
68
 
59
69
  for row in tqdm(rows):
60
70
  if "Question" not in row or \
@@ -81,19 +91,26 @@ def run(
81
91
  output_tsv_file_path: Path,
82
92
  ):
83
93
  config = Config.parse(config_yaml_file_path)
84
- ragas_llm = llm_factory.create_llm(config)
94
+ ragas_llm: InstructorBaseRagasLLM | None = llm_factory.create_llm(
95
+ config.llm
96
+ )
85
97
  if ragas_llm is None:
86
- raise ValueError("LLM must be configured to calculate the answer correctness!")
87
- else:
88
- evaluator = AnswerCorrectnessEvaluator(
89
- llm=ragas_llm,
90
- config=config.answer_correctness,
98
+ raise ValueError(
99
+ "LLM must be configured to calculate the answer correctness!"
100
+ )
101
+ if config.answer_correctness and not config.answer_correctness.enabled:
102
+ raise ValueError(
103
+ "Can't disable answer correctness, when running this script!"
91
104
  )
92
- asyncio.run(evaluate_and_write(
93
- input_tsv_file_path,
94
- output_tsv_file_path,
95
- evaluator,
96
- ))
105
+ evaluator = AnswerCorrectnessEvaluator(
106
+ ragas_llm=ragas_llm,
107
+ config=config.answer_correctness,
108
+ )
109
+ asyncio.run(evaluate_and_write(
110
+ input_tsv_file_path,
111
+ output_tsv_file_path,
112
+ evaluator,
113
+ ))
97
114
 
98
115
 
99
116
  def main():
@@ -1,9 +1,14 @@
1
+ from __future__ import annotations
2
+
1
3
  import json
2
- from typing import Literal
4
+ from typing import Literal, Self, TYPE_CHECKING, Any
3
5
 
4
6
  from pydantic import BaseModel, ConfigDict, Field, model_validator
5
7
 
6
- from graphrag_eval.llm_factory import create_llm
8
+ from .evaluator import Evaluator
9
+
10
+ if TYPE_CHECKING:
11
+ from ragas.llms.base import InstructorBaseRagasLLM
7
12
 
8
13
  RESERVED_KEYS = {
9
14
  "template_id",
@@ -43,7 +48,7 @@ Inputs = Literal[
43
48
  StepsKey = Literal["args", "output"]
44
49
 
45
50
 
46
- class Config(BaseModel):
51
+ class EvaluatorConfig(BaseModel):
47
52
  model_config = ConfigDict(extra='forbid')
48
53
  name: str
49
54
  inputs: list[Inputs] = Field(..., min_length=1)
@@ -53,7 +58,7 @@ class Config(BaseModel):
53
58
  steps_keys: set[StepsKey] | None = Field(default=None, min_length=1)
54
59
 
55
60
  @model_validator(mode='after')
56
- def validate_step_dependencies(self) -> 'Config':
61
+ def validate_step_dependencies(self) -> Self:
57
62
  if set(self.inputs) & {"reference_steps", "actual_steps"}:
58
63
  suffix = "is required when steps are in inputs"
59
64
  for var_name in ["steps_name", "steps_keys"]:
@@ -62,7 +67,7 @@ class Config(BaseModel):
62
67
  return self
63
68
 
64
69
  @model_validator(mode='after')
65
- def validate_name_and_outputs(self) -> 'Config':
70
+ def validate_name_and_outputs(self) -> Self:
66
71
  if self.name + "_error" in RESERVED_KEYS:
67
72
  raise ValueError(f"Name {self.name} is reserved")
68
73
  conflicting_keys = set(self.outputs.keys()) & RESERVED_KEYS
@@ -76,7 +81,7 @@ def create_input_template(input_key: str) -> str:
76
81
  return f"# {header}\n{{{input_key}}}"
77
82
 
78
83
 
79
- def create_prompt_template(config: Config, output_variables: list[str]) -> str:
84
+ def create_prompt_template(config: EvaluatorConfig, output_variables: list[str]) -> str:
80
85
  """
81
86
  Return a template for the LLM prompt, with placeholders for the inputs,
82
87
  instructions, outputs etc. We use this template at evaluation time to
@@ -99,8 +104,8 @@ def create_prompt_template(config: Config, output_variables: list[str]) -> str:
99
104
  class CustomEvaluator:
100
105
  def __init__(
101
106
  self,
102
- config: Config,
103
- eval_config: "evaluation.Config",
107
+ ragas_llm: InstructorBaseRagasLLM,
108
+ config: EvaluatorConfig,
104
109
  ):
105
110
  self.name = config.name
106
111
  self.input_variables = config.inputs
@@ -111,11 +116,24 @@ class CustomEvaluator:
111
116
  config,
112
117
  self.output_variables
113
118
  )
114
- self.llm = create_llm(eval_config)
119
+ self.ragas_llm = ragas_llm
120
+
121
+ @classmethod
122
+ def from_config(
123
+ cls,
124
+ ragas_llm: InstructorBaseRagasLLM | None,
125
+ evaluation_configs: list[EvaluatorConfig] | None
126
+ ) -> list[Self]:
127
+ if ragas_llm and evaluation_configs:
128
+ return [
129
+ cls(ragas_llm, evaluation_config)
130
+ for evaluation_config in evaluation_configs
131
+ ]
132
+ return []
115
133
 
116
134
  async def _agenerate(self, prompt: str) -> str:
117
135
  """Wrapper method for easier testing"""
118
- return (await self.llm.agenerate(prompt, None)).choices[0].message.content
136
+ return (await self.ragas_llm.agenerate(prompt, None)).choices[0].message.content
119
137
 
120
138
  def format_steps(self, steps: list) -> str:
121
139
  steps_formatted = []
@@ -157,7 +175,11 @@ class CustomEvaluator:
157
175
  return result
158
176
  return self.error(f"Expected {n_exp} tab-separated values, got: {response}")
159
177
 
160
- async def evaluate(self, reference: dict, actual: dict) -> dict[str, str | None]:
178
+ async def evaluate(
179
+ self,
180
+ reference: dict[str, Any],
181
+ actual: dict[str, Any]
182
+ ) -> dict[str, Any]:
161
183
  inputs = {}
162
184
  if "question" in self.input_variables:
163
185
  if "question_text" not in reference:
@@ -195,10 +217,4 @@ class CustomEvaluator:
195
217
  return self.parse_outputs(response)
196
218
 
197
219
 
198
- def create_evaluators(config: "evaluation.Config") -> list[CustomEvaluator]:
199
- if config.custom_evaluations and config.llm:
200
- return [
201
- CustomEvaluator(custom_evaluation_config, config)
202
- for custom_evaluation_config in config.custom_evaluations
203
- ]
204
- return []
220
+ _: Evaluator = CustomEvaluator
@@ -0,0 +1,154 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Self, TYPE_CHECKING
5
+
6
+ import yaml
7
+ from pydantic import BaseModel, Field, model_validator
8
+
9
+ from .answer_correctness import (
10
+ AnswerCorrectnessConfig,
11
+ AnswerCorrectnessEvaluator,
12
+ )
13
+ from .answer_relevance import AnswerRelevanceConfig, AnswerRelevanceEvaluator
14
+ from .custom_evaluation import EvaluatorConfig, CustomEvaluator
15
+ from .evaluator import Evaluator
16
+ from .llm_factory import LLMConfig, create_llm, create_embedder
17
+ from .steps.evaluation import evaluate_steps
18
+
19
+ if TYPE_CHECKING:
20
+ from ragas.llms.base import InstructorBaseRagasLLM
21
+ from ragas.embeddings.base import BaseRagasEmbeddings, BaseRagasEmbedding
22
+
23
+
24
+ class Config(BaseModel):
25
+ llm: LLMConfig | None = None
26
+ custom_evaluations: list[EvaluatorConfig] | None = Field(
27
+ default=None,
28
+ min_length=1
29
+ )
30
+ answer_correctness: AnswerCorrectnessConfig | None = None
31
+ answer_relevance: AnswerRelevanceConfig | None = None
32
+
33
+ @model_validator(mode="after")
34
+ def validate_config_and_set_defaults(self) -> Self:
35
+ has_llm = self.llm is not None
36
+ has_embedding = has_llm and self.llm.embedding is not None
37
+
38
+ if self.answer_correctness is None and has_llm:
39
+ self.answer_correctness = AnswerCorrectnessConfig()
40
+
41
+ if self.answer_relevance is None and has_embedding:
42
+ self.answer_relevance = AnswerRelevanceConfig()
43
+
44
+ if self.custom_evaluations and not has_llm:
45
+ raise ValueError(
46
+ "llm config is required if custom_evaluations are provided"
47
+ )
48
+ if (
49
+ self.answer_correctness
50
+ and self.answer_correctness.enabled
51
+ and not has_llm
52
+ ):
53
+ raise ValueError(
54
+ "llm config is required if answer correctness is enabled"
55
+ )
56
+ if (
57
+ self.answer_relevance
58
+ and self.answer_relevance.enabled
59
+ and not has_embedding
60
+ ):
61
+ raise ValueError(
62
+ "llm config including embedding is required if answer "
63
+ "relevance is enabled"
64
+ )
65
+ return self
66
+
67
+ @classmethod
68
+ def parse(cls, config_file_path: str | Path | None) -> Self:
69
+ if config_file_path:
70
+ with open(config_file_path, encoding="utf-8") as f:
71
+ config_dict = yaml.safe_load(f)
72
+ return cls(**config_dict)
73
+ return cls()
74
+
75
+
76
+ async def run_evaluation(
77
+ qa_dataset: list[dict],
78
+ responses_dict: dict,
79
+ config_file_path: str | Path | None = None,
80
+ ) -> list[dict]:
81
+ evaluators, ragas_llm = parse_config_and_init_evaluators(config_file_path)
82
+
83
+ # Output metrics are not nested, for simpler aggregation
84
+ evaluation_results = []
85
+ for template in qa_dataset:
86
+ template_id = template["template_id"]
87
+ for question in template["questions"]:
88
+ actual_result = responses_dict[question["id"]]
89
+ eval_result = {
90
+ "template_id": template_id,
91
+ "question_id": actual_result["question_id"],
92
+ "question_text": question["question_text"]
93
+ }
94
+ for key in ("input_tokens", "output_tokens", "total_tokens",
95
+ "elapsed_sec"):
96
+ if key in actual_result:
97
+ eval_result[key] = actual_result[key]
98
+ if "actual_answer" in actual_result:
99
+ eval_result["actual_answer"] = actual_result["actual_answer"]
100
+ if "reference_answer" in question:
101
+ eval_result["reference_answer"] = question["reference_answer"]
102
+ if "reference_steps" in question:
103
+ eval_result["reference_steps"] = question["reference_steps"]
104
+ if "error" in actual_result:
105
+ eval_result.update({
106
+ "status": "error",
107
+ "error": actual_result["error"],
108
+ })
109
+ else:
110
+ eval_result["status"] = "success"
111
+
112
+ eval_result.update(
113
+ await evaluate_steps(question, actual_result, ragas_llm)
114
+ )
115
+ for evaluator in evaluators:
116
+ eval_result.update(
117
+ await evaluator.evaluate(question, actual_result)
118
+ )
119
+
120
+ evaluation_results.append(eval_result)
121
+ return evaluation_results
122
+
123
+
124
+ def parse_config_and_init_evaluators(
125
+ config_file_path: str | Path | None
126
+ ) -> tuple[
127
+ list[Evaluator],
128
+ InstructorBaseRagasLLM | None,
129
+ ]:
130
+ config = Config.parse(config_file_path)
131
+ ragas_llm: InstructorBaseRagasLLM | None = create_llm(config.llm)
132
+ ragas_embedder: BaseRagasEmbeddings | BaseRagasEmbedding | None = (
133
+ create_embedder(config.llm)
134
+ )
135
+
136
+ evaluators: list[Evaluator] = []
137
+
138
+ answer_relevance_evaluator = AnswerRelevanceEvaluator.from_config(
139
+ ragas_llm, ragas_embedder, config.answer_relevance
140
+ )
141
+ if answer_relevance_evaluator:
142
+ evaluators.append(answer_relevance_evaluator)
143
+
144
+ answer_correctness_evaluator = AnswerCorrectnessEvaluator.from_config(
145
+ ragas_llm, config.answer_correctness
146
+ )
147
+ if answer_correctness_evaluator:
148
+ evaluators.append(answer_correctness_evaluator)
149
+
150
+ evaluators.extend(
151
+ CustomEvaluator.from_config(ragas_llm, config.custom_evaluations)
152
+ )
153
+
154
+ return evaluators, ragas_llm
@@ -0,0 +1,14 @@
1
+ from typing import Protocol, Any
2
+
3
+
4
+ class Evaluator(Protocol):
5
+ async def evaluate(
6
+ self,
7
+ reference: dict[str, Any],
8
+ actual: dict[str, Any]
9
+ ) -> dict[str, Any]:
10
+ """
11
+ Evaluate the actual output against the reference.
12
+ Returns a flat dictionary containing scores or error tracking logs.
13
+ """
14
+ ...
@@ -1,7 +1,13 @@
1
- from typing import Optional
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
2
4
 
3
5
  from pydantic import BaseModel, ConfigDict, Field
4
6
 
7
+ if TYPE_CHECKING:
8
+ from ragas.llms.base import InstructorBaseRagasLLM
9
+ from ragas.embeddings.base import BaseRagasEmbeddings, BaseRagasEmbedding
10
+
5
11
 
6
12
  class GenerationConfig(BaseModel):
7
13
  provider: str
@@ -17,18 +23,20 @@ class EmbeddingConfig(BaseModel):
17
23
  model_config = ConfigDict(extra='allow')
18
24
 
19
25
 
20
- class Config(BaseModel):
26
+ class LLMConfig(BaseModel):
21
27
  generation: GenerationConfig
22
28
  embedding: EmbeddingConfig | None = None
23
29
 
24
30
 
25
- def create_llm(config: "evaluation.Config") -> Optional["InstructorBaseRagasLLM"]:
26
- if config.llm:
31
+ def create_llm(
32
+ config: LLMConfig | None
33
+ ) -> InstructorBaseRagasLLM | None:
34
+ if config:
27
35
  import litellm
28
36
  from ragas.llms import llm_factory
29
37
 
30
38
  litellm.drop_params = True # Remove unsupported params from requests
31
- params = config.llm.generation.model_dump()
39
+ params = config.generation.model_dump()
32
40
  ragas_llm = llm_factory(
33
41
  provider="litellm",
34
42
  model=f"{params.pop('provider')}/{params.pop('model')}",
@@ -40,13 +48,15 @@ def create_llm(config: "evaluation.Config") -> Optional["InstructorBaseRagasLLM"
40
48
  return None
41
49
 
42
50
 
43
- def create_embedder(config: "evaluation.Config") -> Optional["BaseRagasEmbedding"]:
44
- if config.llm and config.llm.embedding:
51
+ def create_embedder(
52
+ config: LLMConfig | None
53
+ ) -> BaseRagasEmbeddings | BaseRagasEmbedding | None:
54
+ if config and config.embedding:
45
55
  import litellm
46
56
  from ragas.embeddings.base import embedding_factory
47
57
 
48
58
  litellm.drop_params = True # Remove unsupported params from requests
49
- params = config.llm.embedding.model_dump()
59
+ params = config.embedding.model_dump()
50
60
  ragas_embedder = embedding_factory(
51
61
  provider="litellm",
52
62
  model=f"{params.pop('provider')}/{params.pop('model')}",
@@ -1,13 +1,21 @@
1
+ from __future__ import annotations
2
+
1
3
  import json
2
4
  import logging
3
5
  from collections import defaultdict
4
6
  from collections.abc import Sequence
5
- from typing import Any
7
+ from typing import Any, TYPE_CHECKING
6
8
 
7
9
  from .iri_discovery import do_iri_discovery_steps_equal
8
10
  from .retrieval_context_ids import recall_at_k
9
11
  from .sparql import compare_sparql_results
10
- from .timeseries import do_retrieve_time_series_steps_equal, do_retrieve_data_points_steps_equal
12
+ from .timeseries import (
13
+ do_retrieve_time_series_steps_equal,
14
+ do_retrieve_data_points_steps_equal,
15
+ )
16
+
17
+ if TYPE_CHECKING:
18
+ from ragas.llms.base import InstructorBaseRagasLLM
11
19
 
12
20
  logger = logging.getLogger(__name__)
13
21
 
@@ -140,7 +148,7 @@ def calculate_steps_score(
140
148
  async def evaluate_steps(
141
149
  reference: dict,
142
150
  actual: dict,
143
- ragas_llm: "InstructorBaseRagasLLM",
151
+ ragas_llm: InstructorBaseRagasLLM | None,
144
152
  ) -> dict:
145
153
  eval_result = {}
146
154
  actual_steps = actual.get("actual_steps", [])
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "graphrag-eval"
3
- version = "6.3.0"
3
+ version = "6.4.0"
4
4
  description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
5
5
  authors = [
6
6
  { name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
@@ -1,29 +0,0 @@
1
- from ragas.embeddings.base import BaseRagasEmbedding
2
- from ragas.llms.base import InstructorBaseRagasLLM
3
- from ragas.metrics.collections import AnswerRelevancy
4
-
5
- from graphrag_eval.util import singleton
6
-
7
-
8
- @singleton
9
- class Evaluator:
10
- def __init__(self, ragas_llm: InstructorBaseRagasLLM, ragas_embedder: BaseRagasEmbedding):
11
- self.scorer = AnswerRelevancy(llm=ragas_llm, embeddings=ragas_embedder)
12
-
13
- async def get_relevance_dict(
14
- self,
15
- question_text: str,
16
- actual_answer: str,
17
- ) -> dict:
18
- try:
19
- result = await self.scorer.ascore(
20
- user_input=question_text,
21
- response=actual_answer
22
- )
23
- return {
24
- "answer_relevance": result.value
25
- }
26
- except Exception as e:
27
- return {
28
- "answer_relevance_error": str(e)
29
- }
@@ -1,104 +0,0 @@
1
- from pathlib import Path
2
-
3
- import yaml
4
- from pydantic import BaseModel, Field, model_validator
5
-
6
- from . import custom_evaluation
7
- from .answer_correctness import AnswerCorrectnessConfig
8
- from .llm_factory import Config as LLMConfig, create_llm, create_embedder
9
- from .steps.evaluation import evaluate_steps
10
-
11
-
12
- class Config(BaseModel):
13
- llm: LLMConfig | None = None
14
- custom_evaluations: list[custom_evaluation.Config] | None \
15
- = Field(default=None, min_length=1)
16
- answer_correctness: AnswerCorrectnessConfig | None = None
17
-
18
- @model_validator(mode="after")
19
- def validate_config(self) -> "Config":
20
- if self.custom_evaluations and not self.llm:
21
- msg = "llm config is required if custom_evaluations are provided"
22
- raise ValueError(msg)
23
- return self
24
-
25
- @classmethod
26
- def parse(cls, config_file_path: str | Path | None) -> "Config":
27
- if config_file_path:
28
- with open(config_file_path, encoding="utf-8") as f:
29
- config_dict = yaml.safe_load(f)
30
- return cls(**config_dict)
31
- return cls()
32
-
33
-
34
- async def run_evaluation(
35
- qa_dataset: list[dict],
36
- responses_dict: dict,
37
- config_file_path: str | Path | None = None,
38
- ) -> list[dict]:
39
- # Output metrics are not nested, for simpler aggregation
40
- evaluation_results = []
41
- config = Config.parse(config_file_path)
42
- ragas_llm = create_llm(config)
43
- ragas_embedder = create_embedder(config)
44
- custom_evaluators = custom_evaluation.create_evaluators(config)
45
- for template in qa_dataset:
46
- template_id = template["template_id"]
47
- for question in template["questions"]:
48
- actual_result = responses_dict[question["id"]]
49
- eval_result = {
50
- "template_id": template_id,
51
- "question_id": actual_result["question_id"],
52
- "question_text": question["question_text"]
53
- }
54
- if "reference_answer" in question:
55
- eval_result["reference_answer"] = question["reference_answer"]
56
- if "reference_steps" in question:
57
- eval_result["reference_steps"] = question["reference_steps"]
58
- if "error" in actual_result:
59
- eval_result.update({
60
- "status": "error",
61
- "error": actual_result["error"],
62
- })
63
- else:
64
- eval_result["status"] = "success"
65
-
66
- if "actual_answer" in actual_result:
67
- eval_result["actual_answer"] = actual_result["actual_answer"]
68
- if ragas_llm:
69
- from graphrag_eval.answer_relevance import Evaluator
70
- relevance_evaluator = Evaluator(ragas_llm, ragas_embedder)
71
- eval_result.update(
72
- await relevance_evaluator.get_relevance_dict(
73
- question["question_text"],
74
- actual_result["actual_answer"],
75
- )
76
- )
77
- if "reference_answer" in question and ragas_llm:
78
- from graphrag_eval.answer_correctness import AnswerCorrectnessEvaluator
79
- answer_correctness_evaluator = AnswerCorrectnessEvaluator(
80
- llm=ragas_llm,
81
- config=config.answer_correctness,
82
- )
83
- eval_result.update(
84
- await answer_correctness_evaluator.get_correctness_dict(
85
- question,
86
- actual_result,
87
- )
88
- )
89
- eval_result.update(
90
- await evaluate_steps(
91
- question,
92
- actual_result,
93
- ragas_llm,
94
- )
95
- )
96
- for custom_evaluator in custom_evaluators:
97
- custom_metrics = await custom_evaluator.evaluate(question, actual_result)
98
- eval_result.update(**custom_metrics)
99
- for key in "input_tokens", "output_tokens", "total_tokens", "elapsed_sec":
100
- if key in actual_result:
101
- eval_result[key] = actual_result[key]
102
-
103
- evaluation_results.append(eval_result)
104
- return evaluation_results
File without changes
File without changes