graphrag-eval 6.0.0__tar.gz → 6.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/PKG-INFO +4 -3
  2. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/README.md +3 -2
  3. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/answer_correctness.py +22 -22
  4. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/custom_evaluation.py +15 -15
  5. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/evaluation.py +5 -5
  6. graphrag_eval-6.0.0/graphrag_eval/llm.py → graphrag_eval-6.2.0/graphrag_eval/llm_factory.py +2 -0
  7. graphrag_eval-6.2.0/graphrag_eval/steps/iri_discovery.py +21 -0
  8. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/pyproject.toml +2 -2
  9. graphrag_eval-6.0.0/graphrag_eval/steps/iri_discovery.py +0 -20
  10. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/LICENSE +0 -0
  11. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/__init__.py +0 -0
  12. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/aggregation.py +0 -0
  13. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/answer_relevance.py +0 -0
  14. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/prompts/template.md +0 -0
  15. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/steps/__init__.py +0 -0
  16. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/steps/evaluation.py +0 -0
  17. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/steps/retrieval_answer.py +0 -0
  18. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/steps/retrieval_context_ids.py +0 -0
  19. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/steps/retrieval_context_texts.py +0 -0
  20. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/steps/sparql.py +0 -0
  21. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/steps/timeseries.py +0 -0
  22. {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/util.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: graphrag-eval
3
- Version: 6.0.0
3
+ Version: 6.2.0
4
4
  Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
5
5
  License: Apache-2.0
6
6
  Author: Philip Ganchev
@@ -1167,8 +1167,9 @@ the steps are matching.
1167
1167
  - if both are named "retrieve_data_points", then we check if the arguments of
1168
1168
  the steps are matching.
1169
1169
  - if the reference step is named "iri_discovery" and the actual step name is
1170
- "autocomplete_search", тhen check if the IRI specified as "output" of the
1171
- "iri_discovery" step is present in the "output" of the "autocomplete_search".
1170
+ "autocomplete_search" or "sparql_query", then check if the IRI specified as
1171
+ "output" of the "iri_discovery" step is present in the "output" of the
1172
+ actual step.
1172
1173
  - if the reference and actual step names are the same and the
1173
1174
  "output_media_type" of the reference step is "application/json", then the steps
1174
1175
  match, if the json outputs are the same.
@@ -1149,8 +1149,9 @@ the steps are matching.
1149
1149
  - if both are named "retrieve_data_points", then we check if the arguments of
1150
1150
  the steps are matching.
1151
1151
  - if the reference step is named "iri_discovery" and the actual step name is
1152
- "autocomplete_search", тhen check if the IRI specified as "output" of the
1153
- "iri_discovery" step is present in the "output" of the "autocomplete_search".
1152
+ "autocomplete_search" or "sparql_query", then check if the IRI specified as
1153
+ "output" of the "iri_discovery" step is present in the "output" of the
1154
+ actual step.
1154
1155
  - if the reference and actual step names are the same and the
1155
1156
  "output_media_type" of the reference step is "application/json", then the steps
1156
1157
  match, if the json outputs are the same.
@@ -1,13 +1,13 @@
1
+ import asyncio
1
2
  import csv
2
3
  from pathlib import Path
3
4
 
4
5
  from tqdm import tqdm
5
6
 
6
- from graphrag_eval import llm
7
+ from graphrag_eval import llm_factory
7
8
  from graphrag_eval.evaluation import Config
8
9
  from graphrag_eval.util import compute_f1, singleton
9
10
 
10
-
11
11
  IN_FILE_PATH = "../data/data-1.tsv"
12
12
  PROMPT_FILE_PATH = Path(__file__).parent / "prompts" / "template.md"
13
13
  OUT_FILE_PATH = "results/data-1.tsv"
@@ -26,17 +26,17 @@ def parse_args() -> "argparse.Namespace":
26
26
  f = float(value)
27
27
  except ValueError:
28
28
  raise ArgumentTypeError(f"Invalid float value: {value}")
29
-
29
+
30
30
  if f <= 0.0 or f >= 2.0:
31
31
  raise ArgumentTypeError(f"Value must be between 0.0 and 2.0, got {f}")
32
32
  return f
33
33
 
34
- parser = ArgumentParser()
34
+ parser = ArgumentParser()
35
35
  parser.add_argument("-i", "--in-file", type=str, default=IN_FILE_PATH)
36
36
  parser.add_argument("-o", "--out-file", type=str, default=OUT_FILE_PATH)
37
37
  parser.add_argument("-p", "--provider", type=str, default=LLM_PROVIDER)
38
38
  parser.add_argument("-l", "--llm", type=str, default=LLM_MODEL)
39
- parser.add_argument("-m", "--max-tokens", type=int, default=MAX_TOKENS)
39
+ parser.add_argument("-m", "--max-tokens", type=int, default=MAX_TOKENS)
40
40
  parser.add_argument(
41
41
  "-t",
42
42
  "--temperature",
@@ -97,11 +97,11 @@ class AnswerCorrectnessEvaluator:
97
97
  self.prompt_template = f.read()
98
98
  self.llm = llm
99
99
 
100
- def _generate(self, prompt):
100
+ async def _agenerate(self, prompt):
101
101
  """Wrapper method for easier testing"""
102
- return self.llm.generate(prompt, None).choices[0].message.content
102
+ return (await self.llm.agenerate(prompt, None)).choices[0].message.content
103
103
 
104
- def evaluate_answer(
104
+ async def evaluate_answer(
105
105
  self,
106
106
  question: str,
107
107
  reference_answer: str,
@@ -112,21 +112,21 @@ class AnswerCorrectnessEvaluator:
112
112
  reference_answer=reference_answer,
113
113
  candidate_answer=actual_answer,
114
114
  )
115
- response_str = self._generate(prompt)
115
+ response_str = await self._agenerate(prompt)
116
116
  return extract_response_values(response_str)
117
117
 
118
- def get_correctness_dict(
118
+ async def get_correctness_dict(
119
119
  self,
120
120
  reference: dict,
121
121
  actual: dict,
122
122
  ):
123
123
  result = {"reference_answer": reference["reference_answer"]}
124
124
  num_ref_claims, num_actual_claims, num_matching_claims, reason, error = \
125
- self.evaluate_answer(
126
- reference["question_text"],
127
- reference["reference_answer"],
128
- actual["actual_answer"],
129
- )
125
+ await self.evaluate_answer(
126
+ reference["question_text"],
127
+ reference["reference_answer"],
128
+ actual["actual_answer"],
129
+ )
130
130
  if error:
131
131
  result["answer_eval_error"] = error
132
132
  else:
@@ -148,12 +148,12 @@ class AnswerCorrectnessEvaluator:
148
148
  return result
149
149
 
150
150
 
151
- def evaluate_and_write(
151
+ async def evaluate_and_write(
152
152
  in_file_path: str | Path,
153
153
  out_file_path: str | Path,
154
154
  config: "evaluation.Config",
155
155
  ) -> None:
156
- ragas_llm = llm.create_llm(config)
156
+ ragas_llm = llm_factory.create_llm(config)
157
157
  evaluator = AnswerCorrectnessEvaluator(llm=ragas_llm)
158
158
  with open(in_file_path, encoding="utf-8") as f:
159
159
  reader = csv.DictReader(f, delimiter="\t")
@@ -164,7 +164,7 @@ def evaluate_and_write(
164
164
  writer = csv.writer(f, delimiter="\t")
165
165
  writer.writerow(OUT_FIELDS)
166
166
  for row in tqdm(rows):
167
- vals = evaluator.evaluate_answer(
167
+ vals = await evaluator.evaluate_answer(
168
168
  row["Question"],
169
169
  row["Reference answer"],
170
170
  row["Actual answer"]
@@ -176,8 +176,8 @@ def evaluate_and_write(
176
176
  def main():
177
177
  args = parse_args()
178
178
  config = Config(
179
- llm=llm.Config(
180
- generation=llm.GenerationConfig(
179
+ llm=llm_factory.Config(
180
+ generation=llm_factory.GenerationConfig(
181
181
  provider=args.provider,
182
182
  model=args.llm,
183
183
  temperature=args.temperature,
@@ -185,8 +185,8 @@ def main():
185
185
  )
186
186
  )
187
187
  )
188
- evaluate_and_write(
188
+ asyncio.run(evaluate_and_write(
189
189
  args.in_file,
190
190
  args.out_file,
191
191
  config,
192
- )
192
+ ))
@@ -3,6 +3,7 @@ from typing import Literal
3
3
 
4
4
  from pydantic import BaseModel, ConfigDict, Field, model_validator
5
5
 
6
+ from graphrag_eval.llm_factory import create_llm
6
7
 
7
8
  RESERVED_KEYS = {
8
9
  "template_id",
@@ -31,7 +32,6 @@ RESERVED_KEYS = {
31
32
  "elapsed_sec",
32
33
  }
33
34
 
34
-
35
35
  Inputs = Literal[
36
36
  "question",
37
37
  "reference_answer",
@@ -84,8 +84,8 @@ def create_prompt_template(config: Config, output_variables: list[str]) -> str:
84
84
 
85
85
  output_variables specifies the order of the outputs.
86
86
  """
87
- output_instructions = "Output the following values separated by tabs:"\
88
- + "".join(f"\n- {k}: {config.outputs[k]}" for k in output_variables)
87
+ output_instructions = "Output the following values separated by tabs:" \
88
+ + "".join(f"\n- {k}: {config.outputs[k]}" for k in output_variables)
89
89
  inputs_template = "\n\n".join(
90
90
  create_input_template(k) for k in config.inputs
91
91
  )
@@ -98,9 +98,9 @@ def create_prompt_template(config: Config, output_variables: list[str]) -> str:
98
98
 
99
99
  class CustomEvaluator:
100
100
  def __init__(
101
- self,
101
+ self,
102
102
  config: Config,
103
- llm: "InstructorBaseRagasLLM",
103
+ eval_config: "evaluation.Config",
104
104
  ):
105
105
  self.name = config.name
106
106
  self.input_variables = config.inputs
@@ -111,11 +111,11 @@ class CustomEvaluator:
111
111
  config,
112
112
  self.output_variables
113
113
  )
114
- self.llm = llm
114
+ self.llm = create_llm(eval_config)
115
115
 
116
- def _generate(self, prompt: str) -> str:
116
+ async def _agenerate(self, prompt: str) -> str:
117
117
  """Wrapper method for easier testing"""
118
- return self.llm.generate(prompt, None).choices[0].message.content
118
+ return (await self.llm.agenerate(prompt, None)).choices[0].message.content
119
119
 
120
120
  def format_steps(self, steps: list) -> str:
121
121
  steps_formatted = []
@@ -134,9 +134,9 @@ class CustomEvaluator:
134
134
  step_out[k] = val
135
135
  else:
136
136
  step_out[k] = val
137
- steps_formatted.append(step_out)
137
+ steps_formatted.append(step_out)
138
138
  return json.dumps(steps_formatted, indent=2)
139
-
139
+
140
140
  def error(self, msg: str) -> dict:
141
141
  result = {k: None for k in self.output_variables}
142
142
  result[self.name + '_error'] = msg
@@ -157,7 +157,7 @@ class CustomEvaluator:
157
157
  return result
158
158
  return self.error(f"Expected {n_exp} tab-separated values, got: {response}")
159
159
 
160
- def evaluate(self, reference: dict, actual: dict) -> dict[str, str | None]:
160
+ async def evaluate(self, reference: dict, actual: dict) -> dict[str, str | None]:
161
161
  inputs = {}
162
162
  if "question" in self.input_variables:
163
163
  if "question_text" not in reference:
@@ -176,7 +176,7 @@ class CustomEvaluator:
176
176
  return self.error("Reference missing key 'reference_steps'")
177
177
  try:
178
178
  formatted_steps_lists = [
179
- self.format_steps(group)
179
+ self.format_steps(group)
180
180
  for group in reference["reference_steps"]
181
181
  ]
182
182
  except json.JSONDecodeError:
@@ -191,14 +191,14 @@ class CustomEvaluator:
191
191
  return self.error("Malformed actual step JSON")
192
192
  inputs["actual_steps"] = formatted_steps_lists
193
193
  prompt = self.prompt_template.format(**inputs)
194
- response = self._generate(prompt)
194
+ response = await self._agenerate(prompt)
195
195
  return self.parse_outputs(response)
196
196
 
197
197
 
198
198
  def create_evaluators(config: "evaluation.Config") -> list[CustomEvaluator]:
199
199
  if config.custom_evaluations and config.llm:
200
200
  return [
201
- CustomEvaluator(c, config.llm.generation)
202
- for c in config.custom_evaluations
201
+ CustomEvaluator(custom_evaluation_config, config)
202
+ for custom_evaluation_config in config.custom_evaluations
203
203
  ]
204
204
  return []
@@ -4,7 +4,7 @@ import yaml
4
4
  from pydantic import BaseModel, Field, model_validator
5
5
 
6
6
  from . import custom_evaluation
7
- from .llm import Config as LLMConfig, create_llm, create_embedder
7
+ from .llm_factory import Config as LLMConfig, create_llm, create_embedder
8
8
  from .steps.evaluation import evaluate_steps
9
9
 
10
10
 
@@ -19,7 +19,7 @@ class Config(BaseModel):
19
19
  msg = "llm config is required if custom_evaluations are provided"
20
20
  raise ValueError(msg)
21
21
  return self
22
-
22
+
23
23
  @classmethod
24
24
  def parse(cls, config_file_path: str | Path | None) -> "Config":
25
25
  if config_file_path:
@@ -78,7 +78,7 @@ async def run_evaluation(
78
78
  llm=ragas_llm
79
79
  )
80
80
  eval_result.update(
81
- answer_correctness_evaluator.get_correctness_dict(
81
+ await answer_correctness_evaluator.get_correctness_dict(
82
82
  question,
83
83
  actual_result,
84
84
  )
@@ -90,8 +90,8 @@ async def run_evaluation(
90
90
  ragas_llm,
91
91
  )
92
92
  )
93
- for relevance_evaluator in custom_evaluators:
94
- custom_metrics = relevance_evaluator.evaluate(question, actual_result)
93
+ for custom_evaluator in custom_evaluators:
94
+ custom_metrics = await custom_evaluator.evaluate(question, actual_result)
95
95
  eval_result.update(**custom_metrics)
96
96
  for key in "input_tokens", "output_tokens", "total_tokens", "elapsed_sec":
97
97
  if key in actual_result:
@@ -37,6 +37,7 @@ def create_llm(config: "evaluation.Config") -> Optional["InstructorBaseRagasLLM"
37
37
  )
38
38
  ragas_llm.is_async = True
39
39
  return ragas_llm
40
+ return None
40
41
 
41
42
 
42
43
  def create_embedder(config: "evaluation.Config") -> Optional["BaseRagasEmbedding"]:
@@ -53,3 +54,4 @@ def create_embedder(config: "evaluation.Config") -> Optional["BaseRagasEmbedding
53
54
  **params,
54
55
  )
55
56
  return ragas_embedder
57
+ return None
@@ -0,0 +1,21 @@
1
+ import json
2
+ from typing import Any
3
+
4
+
5
+ def do_iri_discovery_steps_equal(
6
+ reference_step: dict[str, Any],
7
+ actual_step: dict[str, Any],
8
+ ) -> bool:
9
+ if actual_step["name"] == "autocomplete_search":
10
+ reference_iri = reference_step["output"]
11
+ actual_output = json.loads(actual_step["output"])
12
+
13
+ for binding in actual_output["results"]["bindings"]:
14
+ for _, type_value in binding.items():
15
+ if type_value["type"] == "uri" and type_value["value"] == reference_iri:
16
+ return True
17
+ elif actual_step["name"] == "sparql_query":
18
+ reference_iri = reference_step["output"]
19
+ if reference_iri in actual_step["output"]:
20
+ return True
21
+ return False
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "graphrag-eval"
3
- version = "6.0.0"
3
+ version = "6.2.0"
4
4
  description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
5
5
  authors = [
6
6
  { name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
@@ -24,7 +24,7 @@ llm = ["ragas", "litellm", "pyyaml"]
24
24
 
25
25
  [tool.poetry.group.llm.dependencies]
26
26
  ragas = "0.4.3"
27
- litellm = "1.83.14"
27
+ litellm = "1.85.1"
28
28
  pyyaml = "6.0.3"
29
29
 
30
30
  [tool.poetry.group.llm]
@@ -1,20 +0,0 @@
1
- import json
2
- from typing import Any
3
-
4
-
5
- def do_iri_discovery_steps_equal(
6
- reference_step: dict[str, Any],
7
- actual_step: dict[str, Any],
8
- ) -> bool:
9
- if actual_step["name"] != "autocomplete_search":
10
- return False
11
-
12
- reference_iri = reference_step["output"]
13
- actual_output = json.loads(actual_step["output"])
14
-
15
- for binding in actual_output["results"]["bindings"]:
16
- for _, type_value in binding.items():
17
- if type_value["type"] == "uri" and type_value["value"] == reference_iri:
18
- return True
19
-
20
- return False
File without changes