graphrag-eval 6.1.0__tar.gz → 6.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. graphrag_eval-6.3.0/PKG-INFO +46 -0
  2. graphrag_eval-6.3.0/README.md +27 -0
  3. graphrag_eval-6.3.0/graphrag_eval/answer_correctness.py +139 -0
  4. graphrag_eval-6.3.0/graphrag_eval/cli/answer_correctness.py +105 -0
  5. {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/custom_evaluation.py +15 -15
  6. {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/evaluation.py +9 -6
  7. graphrag_eval-6.1.0/graphrag_eval/llm.py → graphrag_eval-6.3.0/graphrag_eval/llm_factory.py +4 -2
  8. {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/prompts/template.md +1 -1
  9. graphrag_eval-6.3.0/graphrag_eval/steps/__init__.py +0 -0
  10. {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/pyproject.toml +3 -3
  11. graphrag_eval-6.1.0/PKG-INFO +0 -1310
  12. graphrag_eval-6.1.0/README.md +0 -1291
  13. graphrag_eval-6.1.0/graphrag_eval/answer_correctness.py +0 -192
  14. {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/LICENSE +0 -0
  15. {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/__init__.py +0 -0
  16. {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/aggregation.py +0 -0
  17. {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/answer_relevance.py +0 -0
  18. {graphrag_eval-6.1.0/graphrag_eval/steps → graphrag_eval-6.3.0/graphrag_eval/cli}/__init__.py +0 -0
  19. {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/steps/evaluation.py +0 -0
  20. {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/steps/iri_discovery.py +0 -0
  21. {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/steps/retrieval_answer.py +0 -0
  22. {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/steps/retrieval_context_ids.py +0 -0
  23. {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/steps/retrieval_context_texts.py +0 -0
  24. {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/steps/sparql.py +0 -0
  25. {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/steps/timeseries.py +0 -0
  26. {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/util.py +0 -0
@@ -0,0 +1,46 @@
1
+ Metadata-Version: 2.3
2
+ Name: graphrag-eval
3
+ Version: 6.3.0
4
+ Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
5
+ License: Apache-2.0
6
+ Author: Philip Ganchev
7
+ Author-email: philip.ganchev@graphwise.ai
8
+ Requires-Python: >=3.12,<3.13
9
+ Classifier: License :: OSI Approved :: Apache Software License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Provides-Extra: llm
13
+ Requires-Dist: pydantic (==2.12.5)
14
+ Requires-Dist: python-dateutil (==2.9.0.post0)
15
+ Requires-Dist: ragas (==0.4.3) ; extra == "llm"
16
+ Project-URL: Repository, https://github.com/Ontotext-AD/graphrag-eval
17
+ Description-Content-Type: text/markdown
18
+
19
+ <p align="center">
20
+ <img alt="Graphwise Logo" src="https://github.com/Ontotext-AD/graphrag-eval/blob/main/.github/Graphwise_Logo.jpg?raw=true">
21
+ </p>
22
+
23
+ # QA Evaluation
24
+
25
+ This is a Python library for assessing the quality of question-answering systems, such as systems built with LLM-based agents. It is agnostic to the agent implementation and the LLM it uses.
26
+
27
+ The evaluation is based on a user-provided reference dataset containing queries, reference responses, and optional reference steps, such as expected tool uses. The evaluator compares these references with the agent's actual responses and executed steps. Reference steps can be grouped to allow some expected steps to occur in any order.
28
+
29
+ The library provides built-in evaluation metrics and supports user-defined custom metrics ([§ Metrics](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/metrics.md)).
30
+
31
+ ## Documentation
32
+
33
+ - [Quickstart](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/quickstart.md)
34
+ - [Metrics](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/metrics.md)
35
+ - [Configuration](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/config.md)
36
+ - [Input](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/input.md)
37
+ - [Output](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/output.md)
38
+
39
+ ## Maintainers
40
+
41
+ Developed and maintained by [Graphwise](https://graphwise.ai/). For issues and feature requests, please open a [GitHub issue](https://github.com/Ontotext-AD/graphrag-eval/issues).
42
+
43
+ ## License
44
+
45
+ Apache-2.0 License. See the [LICENSE](https://github.com/Ontotext-AD/graphrag-eval/blob/main/LICENSE) file for details.
46
+
@@ -0,0 +1,27 @@
1
+ <p align="center">
2
+ <img alt="Graphwise Logo" src="https://github.com/Ontotext-AD/graphrag-eval/blob/main/.github/Graphwise_Logo.jpg?raw=true">
3
+ </p>
4
+
5
+ # QA Evaluation
6
+
7
+ This is a Python library for assessing the quality of question-answering systems, such as systems built with LLM-based agents. It is agnostic to the agent implementation and the LLM it uses.
8
+
9
+ The evaluation is based on a user-provided reference dataset containing queries, reference responses, and optional reference steps, such as expected tool uses. The evaluator compares these references with the agent's actual responses and executed steps. Reference steps can be grouped to allow some expected steps to occur in any order.
10
+
11
+ The library provides built-in evaluation metrics and supports user-defined custom metrics ([§ Metrics](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/metrics.md)).
12
+
13
+ ## Documentation
14
+
15
+ - [Quickstart](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/quickstart.md)
16
+ - [Metrics](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/metrics.md)
17
+ - [Configuration](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/config.md)
18
+ - [Input](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/input.md)
19
+ - [Output](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/output.md)
20
+
21
+ ## Maintainers
22
+
23
+ Developed and maintained by [Graphwise](https://graphwise.ai/). For issues and feature requests, please open a [GitHub issue](https://github.com/Ontotext-AD/graphrag-eval/issues).
24
+
25
+ ## License
26
+
27
+ Apache-2.0 License. See the [LICENSE](https://github.com/Ontotext-AD/graphrag-eval/blob/main/LICENSE) file for details.
@@ -0,0 +1,139 @@
1
+ from pathlib import Path
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from graphrag_eval.util import compute_f1
6
+
7
+
8
def load_default_prompt() -> str:
    """Return the text of ``prompts/template.md`` located next to this module."""
    template_path = Path(__file__).parent / "prompts" / "template.md"
    return template_path.read_text(encoding="utf-8")
11
+
12
+
13
class AnswerCorrectnessConfig(BaseModel):
    """Settings for the answer-correctness evaluation."""

    # Prompt template text; load_default_prompt() supplies it when not given.
    prompt: str = Field(
        default_factory=load_default_prompt,
    )
15
+
16
+
17
class InvalidPromptException(Exception):
    """Signals that a prompt template could not be formatted.

    The text is available both as ``str(exc)`` and as ``exc.message``.
    """

    def __init__(self, message="The prompt template is invalid and cannot be formatted."):
        super().__init__(message)
        self.message = message
21
+
22
+
23
+ class AnswerCorrectnessEvaluator:
24
+ def __init__(
25
+ self,
26
+ llm: "InstructorBaseRagasLLM",
27
+ config: AnswerCorrectnessConfig | None = None,
28
+ ):
29
+ self.config = config or AnswerCorrectnessConfig()
30
+ self.__validate_prompt_template(self.config.prompt)
31
+ self.prompt_template = self.config.prompt
32
+ self.llm = llm
33
+
34
+ @staticmethod
35
+ def __validate_prompt_template(prompt_template: str):
36
+ try:
37
+ prompt_template.format(
38
+ question="Q?",
39
+ reference_answer="R",
40
+ actual_answer="A",
41
+ )
42
+ except Exception as exc:
43
+ raise InvalidPromptException(
44
+ "Invalid prompt template. Must only contain placeholders: "
45
+ "{question}, {reference_answer}, and {actual_answer}. "
46
+ f"Original error: {exc}"
47
+ ) from exc
48
+
49
+ async def _agenerate(self, prompt):
50
+ """Wrapper method for easier testing"""
51
+ return (await self.llm.agenerate(prompt, None)).choices[0].message.content
52
+
53
+ async def evaluate_answer(
54
+ self,
55
+ question: str,
56
+ reference_answer: str,
57
+ actual_answer: str
58
+ ) -> tuple[int, int, int, str]:
59
+ if any(not s.strip() for s in [question, reference_answer, actual_answer]):
60
+ raise ValueError("The question of the reference or the actual answer is a blank "
61
+ "string!")
62
+ prompt = self.prompt_template.format(
63
+ question=question,
64
+ reference_answer=reference_answer,
65
+ actual_answer=actual_answer,
66
+ )
67
+ response_str = await self._agenerate(prompt)
68
+ return self.extract_response_values(response_str)
69
+
70
+ async def get_correctness_dict(
71
+ self,
72
+ reference: dict,
73
+ actual: dict,
74
+ ):
75
+ result = {"reference_answer": reference["reference_answer"]}
76
+ try:
77
+ num_ref_claims, num_actual_claims, num_matching_claims, reason = \
78
+ await self.evaluate_answer(
79
+ reference["question_text"],
80
+ reference["reference_answer"],
81
+ actual["actual_answer"],
82
+ )
83
+ result.update({
84
+ "answer_reference_claims_count": num_ref_claims,
85
+ "answer_actual_claims_count": num_actual_claims,
86
+ "answer_matching_claims_count": num_matching_claims,
87
+ "answer_correctness_reason": reason,
88
+ })
89
+ recall, precision, f1 = self.compute_recall_precision_f1(
90
+ num_ref_claims, num_actual_claims, num_matching_claims
91
+ )
92
+ if recall is not None:
93
+ result["answer_recall"] = recall
94
+ if precision is not None:
95
+ result["answer_precision"] = precision
96
+ if f1 is not None:
97
+ result["answer_f1"] = f1
98
+ except Exception as exc:
99
+ result["answer_eval_error"] = str(exc)
100
+ return result
101
+
102
+ @staticmethod
103
+ def compute_recall_precision_f1(
104
+ n_pos: int,
105
+ n_pred_pos: int,
106
+ n_true_pos: int,
107
+ ) -> tuple[float | None, float | None, float | None]:
108
+ recall = None
109
+ precision = None
110
+ if n_pos:
111
+ recall = n_true_pos / n_pos
112
+ if n_pred_pos:
113
+ precision = n_true_pos / n_pred_pos
114
+ return recall, precision, compute_f1(recall, precision)
115
+
116
+ @staticmethod
117
+ def extract_response_values(
118
+ response: str
119
+ ) -> tuple[int, int, int, str]:
120
+ vals = response.split("\t")
121
+ n = len(vals)
122
+ if n < 4:
123
+ raise ValueError(f"Expected 4 tab-separated values: {response}")
124
+ vals = vals[:4]
125
+ try:
126
+ n_ref, n_actual, n_matching = map(int, vals[:3])
127
+ except ValueError:
128
+ raise ValueError(f"Claims counts should be ints: {vals}")
129
+ if any([
130
+ n_ref < 1,
131
+ n_actual < 1,
132
+ n_matching < 0,
133
+ n_matching > n_ref,
134
+ n_matching > n_actual
135
+ ]):
136
+ raise ValueError(
137
+ f"Invalid claims counts combination: {n_ref}\t{n_actual}\t{n_matching}"
138
+ )
139
+ return n_ref, n_actual, n_matching, vals[3]
@@ -0,0 +1,105 @@
1
+ import argparse
2
+ import asyncio
3
+ import csv
4
+ from argparse import ArgumentParser
5
+ from pathlib import Path
6
+
7
+ from tqdm import tqdm
8
+
9
+ from graphrag_eval import llm_factory
10
+ from graphrag_eval.answer_correctness import AnswerCorrectnessEvaluator
11
+ from graphrag_eval.evaluation import Config
12
+
13
+
14
+ def parse_args() -> argparse.Namespace:
15
+ parser = ArgumentParser(
16
+ description="Calculates answer correctness over the entries from the input tsv file and "
17
+ "stores the output in the output tsv file.",
18
+ )
19
+ parser.add_argument(
20
+ "-i",
21
+ "--input-tsv-file-path",
22
+ type=Path,
23
+ required=True,
24
+ help="Input tsv file path with columns `Question`, `Reference answer` and `Actual answer`",
25
+ )
26
+ parser.add_argument(
27
+ "-o",
28
+ "--output-tsv-file-path",
29
+ type=Path,
30
+ required=True,
31
+ help="Output tsv file path with columns `#Reference`, `#PTarget`, `#Matching`, "
32
+ "`Reasoning`, `Error`",
33
+ )
34
+ parser.add_argument(
35
+ "-c",
36
+ "--config-yaml-file-path",
37
+ type=Path,
38
+ required=True,
39
+ help="Config yaml file path with definition of the LLM to use and optionally a custom "
40
+ "prompt.",
41
+ )
42
+ return parser.parse_args()
43
+
44
+
45
async def evaluate_and_write(
    input_tsv_file_path: Path,
    output_tsv_file_path: Path,
    evaluator: AnswerCorrectnessEvaluator,
) -> None:
    """Evaluate every row of the input TSV and write one result row per input row.

    Args:
        input_tsv_file_path: TSV file with the columns ``Question``,
            ``Reference answer`` and ``Actual answer``.
        output_tsv_file_path: Destination TSV file; missing parent directories
            are created and an existing file is overwritten.
        evaluator: Evaluator whose ``evaluate_answer`` coroutine is awaited
            once per row.

    Raises:
        ValueError: If the input has data rows but lacks a required column.
            This is checked before the output file is created.
    """
    # newline="" is what the csv module requires so that it can handle line
    # endings (including newlines inside quoted fields) itself.
    with open(input_tsv_file_path, encoding="utf-8", newline="") as f:
        rows = list(csv.DictReader(f, delimiter="\t"))

    # DictReader gives every row a key for each header column, so checking the
    # first row covers all of them.
    required_columns = ("Question", "Reference answer", "Actual answer")
    if rows and any(column not in rows[0] for column in required_columns):
        raise ValueError("Unexpected input format!")

    print(f"Writing results to {output_tsv_file_path}")
    output_tsv_file_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_tsv_file_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, delimiter="\t")
        writer.writerow(["#Reference", "#PTarget", "#Matching", "Reasoning", "Error"])

        for row in tqdm(rows):
            try:
                vals = await evaluator.evaluate_answer(
                    row["Question"],
                    row["Reference answer"],
                    row["Actual answer"]
                )
                # Successful rows leave the trailing `Error` column empty.
                writer.writerow((*vals, ""))
            except Exception as exc:
                # Best effort: a failing row is reported in the `Error` column
                # and the remaining rows are still evaluated.
                writer.writerow(["", "", "", "", str(exc)])
            # Flush per row so that already evaluated rows reach the file even
            # if a later row aborts the run.
            f.flush()
76
+
77
+
78
def run(
    config_yaml_file_path: Path,
    input_tsv_file_path: Path,
    output_tsv_file_path: Path,
):
    """Load the config, create the LLM and evaluate the input TSV file.

    Raises:
        ValueError: If ``llm_factory.create_llm`` returns ``None`` for the
            parsed configuration.
    """
    config = Config.parse(config_yaml_file_path)
    ragas_llm = llm_factory.create_llm(config)
    if ragas_llm is None:
        raise ValueError("LLM must be configured to calculate the answer correctness!")

    evaluator = AnswerCorrectnessEvaluator(
        llm=ragas_llm,
        config=config.answer_correctness,
    )
    evaluation = evaluate_and_write(
        input_tsv_file_path,
        output_tsv_file_path,
        evaluator,
    )
    asyncio.run(evaluation)
97
+
98
+
99
def main():
    """Command-line entry point: parse the arguments and run the evaluation."""
    cli_args = parse_args()
    run(
        config_yaml_file_path=cli_args.config_yaml_file_path,
        input_tsv_file_path=cli_args.input_tsv_file_path,
        output_tsv_file_path=cli_args.output_tsv_file_path,
    )
@@ -3,6 +3,7 @@ from typing import Literal
3
3
 
4
4
  from pydantic import BaseModel, ConfigDict, Field, model_validator
5
5
 
6
+ from graphrag_eval.llm_factory import create_llm
6
7
 
7
8
  RESERVED_KEYS = {
8
9
  "template_id",
@@ -31,7 +32,6 @@ RESERVED_KEYS = {
31
32
  "elapsed_sec",
32
33
  }
33
34
 
34
-
35
35
  Inputs = Literal[
36
36
  "question",
37
37
  "reference_answer",
@@ -84,8 +84,8 @@ def create_prompt_template(config: Config, output_variables: list[str]) -> str:
84
84
 
85
85
  output_variables specifies the order of the outputs.
86
86
  """
87
- output_instructions = "Output the following values separated by tabs:"\
88
- + "".join(f"\n- {k}: {config.outputs[k]}" for k in output_variables)
87
+ output_instructions = "Output the following values separated by tabs:" \
88
+ + "".join(f"\n- {k}: {config.outputs[k]}" for k in output_variables)
89
89
  inputs_template = "\n\n".join(
90
90
  create_input_template(k) for k in config.inputs
91
91
  )
@@ -98,9 +98,9 @@ def create_prompt_template(config: Config, output_variables: list[str]) -> str:
98
98
 
99
99
  class CustomEvaluator:
100
100
  def __init__(
101
- self,
101
+ self,
102
102
  config: Config,
103
- llm: "InstructorBaseRagasLLM",
103
+ eval_config: "evaluation.Config",
104
104
  ):
105
105
  self.name = config.name
106
106
  self.input_variables = config.inputs
@@ -111,11 +111,11 @@ class CustomEvaluator:
111
111
  config,
112
112
  self.output_variables
113
113
  )
114
- self.llm = llm
114
+ self.llm = create_llm(eval_config)
115
115
 
116
- def _generate(self, prompt: str) -> str:
116
+ async def _agenerate(self, prompt: str) -> str:
117
117
  """Wrapper method for easier testing"""
118
- return self.llm.generate(prompt, None).choices[0].message.content
118
+ return (await self.llm.agenerate(prompt, None)).choices[0].message.content
119
119
 
120
120
  def format_steps(self, steps: list) -> str:
121
121
  steps_formatted = []
@@ -134,9 +134,9 @@ class CustomEvaluator:
134
134
  step_out[k] = val
135
135
  else:
136
136
  step_out[k] = val
137
- steps_formatted.append(step_out)
137
+ steps_formatted.append(step_out)
138
138
  return json.dumps(steps_formatted, indent=2)
139
-
139
+
140
140
  def error(self, msg: str) -> dict:
141
141
  result = {k: None for k in self.output_variables}
142
142
  result[self.name + '_error'] = msg
@@ -157,7 +157,7 @@ class CustomEvaluator:
157
157
  return result
158
158
  return self.error(f"Expected {n_exp} tab-separated values, got: {response}")
159
159
 
160
- def evaluate(self, reference: dict, actual: dict) -> dict[str, str | None]:
160
+ async def evaluate(self, reference: dict, actual: dict) -> dict[str, str | None]:
161
161
  inputs = {}
162
162
  if "question" in self.input_variables:
163
163
  if "question_text" not in reference:
@@ -176,7 +176,7 @@ class CustomEvaluator:
176
176
  return self.error("Reference missing key 'reference_steps'")
177
177
  try:
178
178
  formatted_steps_lists = [
179
- self.format_steps(group)
179
+ self.format_steps(group)
180
180
  for group in reference["reference_steps"]
181
181
  ]
182
182
  except json.JSONDecodeError:
@@ -191,14 +191,14 @@ class CustomEvaluator:
191
191
  return self.error("Malformed actual step JSON")
192
192
  inputs["actual_steps"] = formatted_steps_lists
193
193
  prompt = self.prompt_template.format(**inputs)
194
- response = self._generate(prompt)
194
+ response = await self._agenerate(prompt)
195
195
  return self.parse_outputs(response)
196
196
 
197
197
 
198
198
  def create_evaluators(config: "evaluation.Config") -> list[CustomEvaluator]:
199
199
  if config.custom_evaluations and config.llm:
200
200
  return [
201
- CustomEvaluator(c, config.llm.generation)
202
- for c in config.custom_evaluations
201
+ CustomEvaluator(custom_evaluation_config, config)
202
+ for custom_evaluation_config in config.custom_evaluations
203
203
  ]
204
204
  return []
@@ -4,7 +4,8 @@ import yaml
4
4
  from pydantic import BaseModel, Field, model_validator
5
5
 
6
6
  from . import custom_evaluation
7
- from .llm import Config as LLMConfig, create_llm, create_embedder
7
+ from .answer_correctness import AnswerCorrectnessConfig
8
+ from .llm_factory import Config as LLMConfig, create_llm, create_embedder
8
9
  from .steps.evaluation import evaluate_steps
9
10
 
10
11
 
@@ -12,6 +13,7 @@ class Config(BaseModel):
12
13
  llm: LLMConfig | None = None
13
14
  custom_evaluations: list[custom_evaluation.Config] | None \
14
15
  = Field(default=None, min_length=1)
16
+ answer_correctness: AnswerCorrectnessConfig | None = None
15
17
 
16
18
  @model_validator(mode="after")
17
19
  def validate_config(self) -> "Config":
@@ -19,7 +21,7 @@ class Config(BaseModel):
19
21
  msg = "llm config is required if custom_evaluations are provided"
20
22
  raise ValueError(msg)
21
23
  return self
22
-
24
+
23
25
  @classmethod
24
26
  def parse(cls, config_file_path: str | Path | None) -> "Config":
25
27
  if config_file_path:
@@ -75,10 +77,11 @@ async def run_evaluation(
75
77
  if "reference_answer" in question and ragas_llm:
76
78
  from graphrag_eval.answer_correctness import AnswerCorrectnessEvaluator
77
79
  answer_correctness_evaluator = AnswerCorrectnessEvaluator(
78
- llm=ragas_llm
80
+ llm=ragas_llm,
81
+ config=config.answer_correctness,
79
82
  )
80
83
  eval_result.update(
81
- answer_correctness_evaluator.get_correctness_dict(
84
+ await answer_correctness_evaluator.get_correctness_dict(
82
85
  question,
83
86
  actual_result,
84
87
  )
@@ -90,8 +93,8 @@ async def run_evaluation(
90
93
  ragas_llm,
91
94
  )
92
95
  )
93
- for relevance_evaluator in custom_evaluators:
94
- custom_metrics = relevance_evaluator.evaluate(question, actual_result)
96
+ for custom_evaluator in custom_evaluators:
97
+ custom_metrics = await custom_evaluator.evaluate(question, actual_result)
95
98
  eval_result.update(**custom_metrics)
96
99
  for key in "input_tokens", "output_tokens", "total_tokens", "elapsed_sec":
97
100
  if key in actual_result:
@@ -6,8 +6,8 @@ from pydantic import BaseModel, ConfigDict, Field
6
6
  class GenerationConfig(BaseModel):
7
7
  provider: str
8
8
  model: str
9
- temperature: float = Field(ge=0.0, le=2.0)
10
- max_tokens: int = Field(ge=1)
9
+ temperature: float = Field(default=0.0, ge=0.0, le=2.0)
10
+ max_tokens: int | None = Field(default=None, ge=1)
11
11
  model_config = ConfigDict(extra='allow')
12
12
 
13
13
 
@@ -37,6 +37,7 @@ def create_llm(config: "evaluation.Config") -> Optional["InstructorBaseRagasLLM"
37
37
  )
38
38
  ragas_llm.is_async = True
39
39
  return ragas_llm
40
+ return None
40
41
 
41
42
 
42
43
  def create_embedder(config: "evaluation.Config") -> Optional["BaseRagasEmbedding"]:
@@ -53,3 +54,4 @@ def create_embedder(config: "evaluation.Config") -> Optional["BaseRagasEmbedding
53
54
  **params,
54
55
  )
55
56
  return ragas_embedder
57
+ return None
@@ -10,7 +10,7 @@ Below are a query, a reference response and a candidate response to it.
10
10
  {reference_answer}
11
11
 
12
12
  # Candidate response
13
- {candidate_answer}
13
+ {actual_answer}
14
14
 
15
15
  # Output values
16
16
  * v1: Count of reference response claims
File without changes
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "graphrag-eval"
3
- version = "6.1.0"
3
+ version = "6.3.0"
4
4
  description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
5
5
  authors = [
6
6
  { name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
@@ -24,7 +24,7 @@ llm = ["ragas", "litellm", "pyyaml"]
24
24
 
25
25
  [tool.poetry.group.llm.dependencies]
26
26
  ragas = "0.4.3"
27
- litellm = "1.83.14"
27
+ litellm = "1.85.1"
28
28
  pyyaml = "6.0.3"
29
29
 
30
30
  [tool.poetry.group.llm]
@@ -41,7 +41,7 @@ pyyaml = "6.0.3"
41
41
  optional = true
42
42
 
43
43
  [project.scripts]
44
- answer-correctness = "graphrag_eval.answer_correctness:main"
44
+ answer-correctness = "graphrag_eval.cli.answer_correctness:main"
45
45
 
46
46
  [build-system]
47
47
  requires = ["poetry-core>=2.0.0"]