graphrag-eval 5.0.2__tar.gz → 5.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: graphrag-eval
3
- Version: 5.0.2
3
+ Version: 5.1.0
4
4
  Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
5
5
  License: Apache-2.0
6
6
  Author: Philip Ganchev
@@ -9,10 +9,12 @@ Requires-Python: >=3.12,<3.13
9
9
  Classifier: License :: OSI Approved :: Apache Software License
10
10
  Classifier: Programming Language :: Python :: 3
11
11
  Classifier: Programming Language :: Python :: 3.12
12
- Provides-Extra: openai
13
- Requires-Dist: langevals (==0.1.*) ; extra == "openai"
14
- Requires-Dist: langevals-ragas (>=0.1.12,<0.2.0) ; extra == "openai"
15
- Requires-Dist: openai (>=1.97.0,<2.0.0) ; extra == "openai"
12
+ Provides-Extra: ragas
13
+ Requires-Dist: langchain-openai (==0.3.7) ; extra == "ragas"
14
+ Requires-Dist: langchain_community (==0.3.18) ; extra == "ragas"
15
+ Requires-Dist: langevals[ragas] (==0.1.8) ; extra == "ragas"
16
+ Requires-Dist: litellm (==1.61.20) ; extra == "ragas"
17
+ Requires-Dist: ragas (==0.2.9) ; extra == "ragas"
16
18
  Project-URL: Repository, https://github.com/Ontotext-AD/graphrag-eval
17
19
  Description-Content-Type: text/markdown
18
20
 
@@ -43,12 +45,12 @@ graphrag-eval = "*"
43
45
  To evaluate answer relevance and answer correctness:
44
46
 
45
47
  ```bash
46
- pip install 'graphrag-eval[openai]'
48
+ pip install 'graphrag-eval[ragas]'
47
49
  ```
48
50
 
49
51
  or add the following dependency in your `pyproject.toml` file:
50
52
  ```toml
51
- graphrag-eval = {version = "*", extras = ["openai"]}
53
+ graphrag-eval = {version = "*", extras = ["ragas"]}
52
54
  ```
53
55
 
54
56
  ## Maintainers
@@ -61,7 +63,7 @@ For issues or feature requests, please open [a GitHub issue](https://github.com/
61
63
  To evaluate only correctness of final answers (system responses), you can clone this repository and run the code on the command line:
62
64
 
63
65
  1. Prepare an input TSV file with columns `Question`, `Reference answer` and `Actual answer`
64
- 1. Execute `poetry install --with openai`
66
+ 1. Execute `poetry install --with ragas`
65
67
  1. Execute `OPENAI_API_KEY=<your_api_key> poetry run answer-correctness -i <input_file.tsv> -o <output_file.tsv>`
66
68
 
67
69
  We plan to improve CLI support in future releases.
@@ -445,7 +447,6 @@ The output is a list of statistics for each question from the reference Q&A data
445
447
  retrieval_answer_recall_reason: The context contains all the transformers listed in the reference answer
446
448
  retrieval_answer_recall_cost: 0.0007
447
449
  retrieval_answer_precision: 1.0
448
- retrieval_answer_precision_reason: The context contains only transformers listed in the reference answer
449
450
  retrieval_answer_precision_cost: 0.0003
450
451
  retrieval_answer_f1: 1.0
451
452
  retrieval_answer_f1_cost: 0.001
@@ -570,7 +571,6 @@ All `actual_steps` with `name` "retrieval" contain:
570
571
  - `retrieval_answer_recall_error`: (optional) error message if `retrieval_answer_recall` evaluation fails
571
572
  - `retrieval_answer_recall_cost`: cost of evaluating `retrieval_answer_recall`, in US dollars
572
573
  - `retrieval_answer_precision`: (optional) precision of the retrieved context with respect to the reference answer, if evaluation succeeds
573
- - `retrieval_answer_precision_reason`: (optional) LLM reasoning in evaluating `retrieval_answer_precision`
574
574
  - `retrieval_answer_precision_error`: (optional) error message if `retrieval_answer_precision` evaluation fails
575
575
  - `retrieval_answer_precision_cost`: cost of evaluating `retrieval_answer_precision`, in US dollars
576
576
  - `retrieval_answer_f1`: (optional) F1 score of the retrieved context with respect to the reference answer, if `retrieval_answer_recall` and `retrieval_answer_precision` succeed
@@ -605,6 +605,9 @@ Aggregates are:
605
605
  - `once_per_sample`: how many times each step was executed, counted only once per question
606
606
  - `empty_results`: how many times the step was executed and returned empty results
607
607
  - `errors`: how many times the step was executed and resulted in error
608
+ - `retrieval_answer_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_recall` for all successful questions in this template
609
+ - `retrieval_answer_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_precision` for all successful questions in this template
610
+ - `retrieval_answer_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_f1` for all successful questions in this template
608
611
  - `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` for all successful questions in this template
609
612
  - `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` for all successful questions in this template
610
613
  - `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` for all successful questions in this template
@@ -620,6 +623,9 @@ Aggregates are:
620
623
  - `answer_f1`: `sum`, `mean`, `median`, `min` and `max` for `answer_f1` of all successful questions
621
624
  - `answer_relevance`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance` of all successful questions
622
625
  - `answer_relevance_cost`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance_cost` of all successful questions
626
+ - `retrieval_answer_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_recall` of all successful questions
627
+ - `retrieval_answer_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_precision` of all successful questions
628
+ - `retrieval_answer_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_f1` of all successful questions
623
629
  - `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` of all successful questions
624
630
  - `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` of all successful questions
625
631
  - `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` of all successful questions
@@ -634,6 +640,9 @@ Aggregates are:
634
640
  - `answer_f1`: `mean` for `answer_f1`
635
641
  - `answer_relevance`: `mean` for `answer_relevance`
636
642
  - `answer_relevance_cost`: `mean` for `answer_relevance_cost`
643
+ - `retrieval_answer_recall`: `mean` for `retrieval_answer_recall`
644
+ - `retrieval_answer_precision`: `mean` for `retrieval_answer_precision`
645
+ - `retrieval_answer_f1`: `mean` for `retrieval_answer_f1`
637
646
  - `retrieval_context_recall`: `mean` for `retrieval_context_recall`
638
647
  - `retrieval_context_precision`: `mean` for `retrieval_context_precision`
639
648
  - `retrieval_context_f1`: `mean` for `retrieval_context_f1`
@@ -1031,7 +1040,7 @@ The following metrics are based on the content of retrieved documents.
1031
1040
 
1032
1041
  #### Context Recall@k
1033
1042
 
1034
- The fraction of relevant items among the top *k* recommendations. It answers the question: "Of all items the user cares about, how many did we inclide in the first k spots?"
1043
+ The fraction of all relevant items that appear among the top *k* recommendations. It answers the question: "Of all items the user cares about, how many did we include in the first *k* spots?"
1035
1044
  * **Formula**:
1036
1045
  $`
1037
1046
  \frac{\text{Number of relevant items in top k}}{\text{Number of relevant items}}
@@ -25,12 +25,12 @@ graphrag-eval = "*"
25
25
  To evaluate answer relevance and answer correctness:
26
26
 
27
27
  ```bash
28
- pip install 'graphrag-eval[openai]'
28
+ pip install 'graphrag-eval[ragas]'
29
29
  ```
30
30
 
31
31
  or add the following dependency in your `pyproject.toml` file:
32
32
  ```toml
33
- graphrag-eval = {version = "*", extras = ["openai"]}
33
+ graphrag-eval = {version = "*", extras = ["ragas"]}
34
34
  ```
35
35
 
36
36
  ## Maintainers
@@ -43,7 +43,7 @@ For issues or feature requests, please open [a GitHub issue](https://github.com/
43
43
  To evaluate only correctness of final answers (system responses), you can clone this repository and run the code on the command line:
44
44
 
45
45
  1. Prepare an input TSV file with columns `Question`, `Reference answer` and `Actual answer`
46
- 1. Execute `poetry install --with openai`
46
+ 1. Execute `poetry install --with ragas`
47
47
  1. Execute `OPENAI_API_KEY=<your_api_key> poetry run answer-correctness -i <input_file.tsv> -o <output_file.tsv>`
48
48
 
49
49
  We plan to improve CLI support in future releases.
@@ -427,7 +427,6 @@ The output is a list of statistics for each question from the reference Q&A data
427
427
  retrieval_answer_recall_reason: The context contains all the transformers listed in the reference answer
428
428
  retrieval_answer_recall_cost: 0.0007
429
429
  retrieval_answer_precision: 1.0
430
- retrieval_answer_precision_reason: The context contains only transformers listed in the reference answer
431
430
  retrieval_answer_precision_cost: 0.0003
432
431
  retrieval_answer_f1: 1.0
433
432
  retrieval_answer_f1_cost: 0.001
@@ -552,7 +551,6 @@ All `actual_steps` with `name` "retrieval" contain:
552
551
  - `retrieval_answer_recall_error`: (optional) error message if `retrieval_answer_recall` evaluation fails
553
552
  - `retrieval_answer_recall_cost`: cost of evaluating `retrieval_answer_recall`, in US dollars
554
553
  - `retrieval_answer_precision`: (optional) precision of the retrieved context with respect to the reference answer, if evaluation succeeds
555
- - `retrieval_answer_precision_reason`: (optional) LLM reasoning in evaluating `retrieval_answer_precision`
556
554
  - `retrieval_answer_precision_error`: (optional) error message if `retrieval_answer_precision` evaluation fails
557
555
  - `retrieval_answer_precision_cost`: cost of evaluating `retrieval_answer_precision`, in US dollars
558
556
  - `retrieval_answer_f1`: (optional) F1 score of the retrieved context with respect to the reference answer, if `retrieval_answer_recall` and `retrieval_answer_precision` succeed
@@ -587,6 +585,9 @@ Aggregates are:
587
585
  - `once_per_sample`: how many times each step was executed, counted only once per question
588
586
  - `empty_results`: how many times the step was executed and returned empty results
589
587
  - `errors`: how many times the step was executed and resulted in error
588
+ - `retrieval_answer_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_recall` for all successful questions in this template
589
+ - `retrieval_answer_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_precision` for all successful questions in this template
590
+ - `retrieval_answer_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_f1` for all successful questions in this template
590
591
  - `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` for all successful questions in this template
591
592
  - `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` for all successful questions in this template
592
593
  - `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` for all successful questions in this template
@@ -602,6 +603,9 @@ Aggregates are:
602
603
  - `answer_f1`: `sum`, `mean`, `median`, `min` and `max` for `answer_f1` of all successful questions
603
604
  - `answer_relevance`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance` of all successful questions
604
605
  - `answer_relevance_cost`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance_cost` of all successful questions
606
+ - `retrieval_answer_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_recall` of all successful questions
607
+ - `retrieval_answer_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_precision` of all successful questions
608
+ - `retrieval_answer_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_f1` of all successful questions
605
609
  - `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` of all successful questions
606
610
  - `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` of all successful questions
607
611
  - `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` of all successful questions
@@ -616,6 +620,9 @@ Aggregates are:
616
620
  - `answer_f1`: `mean` for `answer_f1`
617
621
  - `answer_relevance`: `mean` for `answer_relevance`
618
622
  - `answer_relevance_cost`: `mean` for `answer_relevance_cost`
623
+ - `retrieval_answer_recall`: `mean` for `retrieval_answer_recall`
624
+ - `retrieval_answer_precision`: `mean` for `retrieval_answer_precision`
625
+ - `retrieval_answer_f1`: `mean` for `retrieval_answer_f1`
619
626
  - `retrieval_context_recall`: `mean` for `retrieval_context_recall`
620
627
  - `retrieval_context_precision`: `mean` for `retrieval_context_precision`
621
628
  - `retrieval_context_f1`: `mean` for `retrieval_context_f1`
@@ -1013,7 +1020,7 @@ The following metrics are based on the content of retrieved documents.
1013
1020
 
1014
1021
  #### Context Recall@k
1015
1022
 
1016
- The fraction of relevant items among the top *k* recommendations. It answers the question: "Of all items the user cares about, how many did we inclide in the first k spots?"
1023
+ The fraction of all relevant items that appear among the top *k* recommendations. It answers the question: "Of all items the user cares about, how many did we include in the first *k* spots?"
1017
1024
  * **Formula**:
1018
1025
  $`
1019
1026
  \frac{\text{Number of relevant items in top k}}{\text{Number of relevant items}}
@@ -7,7 +7,7 @@ from langevals_ragas.response_relevancy import (
7
7
  def get_relevance_dict(
8
8
  question_text: str,
9
9
  actual_answer: str,
10
- model_name : str = 'openai/gpt-4o-mini',
10
+ model_name: str = 'openai/gpt-4o-mini',
11
11
  max_tokens: int = 65_536
12
12
  ) -> dict:
13
13
  settings_dict = {
@@ -16,16 +16,19 @@ def _evaluate(
16
16
  metric: str
17
17
  ) -> dict[str, float | str]:
18
18
  try:
19
- result = evaluator.evaluate(entry)
20
- if result.status == "processed":
21
- return {
22
- f"retrieval_answer_{metric}": result.score,
23
- f"retrieval_answer_{metric}_cost": result.cost.amount,
24
- f"retrieval_answer_{metric}_reason": result.details
19
+ le_result = evaluator.evaluate(entry)
20
+ if le_result.status == "processed":
21
+ result = {
22
+ f"retrieval_answer_{metric}": le_result.score,
25
23
  }
24
+ if le_result.cost:
25
+ result[f"retrieval_answer_{metric}_cost"] = le_result.cost.amount
26
+ if le_result.details:
27
+ result[f"retrieval_answer_{metric}_reason"] = le_result.details
28
+ return result
26
29
  else:
27
30
  return {
28
- f"retrieval_answer_{metric}_error": result.details
31
+ f"retrieval_answer_{metric}_error": le_result.details
29
32
  }
30
33
  except Exception as e:
31
34
  return {
@@ -12,14 +12,14 @@ from graphrag_eval.util import get_f1_dict
12
12
 
13
13
  def _evaluate(
14
14
  entry: RagasContextRecallEntry | RagasContextPrecisionEntry,
15
- evauator: RagasContextRecallEvaluator | RagasContextPrecisionEvaluator,
15
+ evaluator: RagasContextRecallEvaluator | RagasContextPrecisionEvaluator,
16
16
  metric: str
17
17
  ) -> dict:
18
18
  try:
19
- result = evauator.evaluate(entry)
19
+ result = evaluator.evaluate(entry)
20
20
  if result.status == "processed":
21
21
  result_dict = {
22
- f"retrieval_context_{metric}": result.score,
22
+ f"retrieval_context_{metric}": result.score,
23
23
  }
24
24
  if result.details:
25
25
  result_dict[f"retrieval_context_{metric}_reason"] = result.details
@@ -1,5 +1,4 @@
1
1
  from collections import Counter
2
- import re
3
2
  from typing import Union
4
3
  import itertools
5
4
  import math
@@ -0,0 +1,51 @@
1
+ [project]
2
+ name = "graphrag-eval"
3
+ version = "5.1.0"
4
+ description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
5
+ authors = [
6
+ { name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
7
+ { name = "Aleksis Datseris", email = "aleksis.datseris@graphwise.ai" },
8
+ { name = "Neli Hateva", email = "neli.hateva@graphwise.ai" },
9
+ ]
10
+ readme = "README.md"
11
+ license = "Apache-2.0"
12
+ requires-python = ">=3.12,<3.13"
13
+
14
+ [project.urls]
15
+ repository = "https://github.com/Ontotext-AD/graphrag-eval"
16
+
17
+ [tool.poetry.dependencies]
18
+ langevals = { version = "0.1.8", optional = true, extras = ["ragas"] }
19
+ ragas = { version = "0.2.9", optional = true }
20
+ langchain-openai = { version = "0.3.7", optional = true }
21
+ langchain_community = { version = "0.3.18", optional = true }
22
+ litellm = { version = "1.61.20", optional = true }
23
+
24
+ [tool.poetry.extras]
25
+ ragas = ["langevals", "ragas", "langchain-openai", "langchain_community", "litellm"]
26
+
27
+ [tool.poetry.group.ragas.dependencies]
28
+ langevals = { version = "0.1.8", extras = ["ragas"] }
29
+ ragas = "0.2.9"
30
+ langchain-openai = "0.3.7"
31
+ langchain_community = "0.3.18"
32
+ litellm = "1.61.20"
33
+
34
+ [tool.poetry.group.ragas]
35
+ optional = true
36
+
37
+ [tool.poetry.group.test.dependencies]
38
+ pytest = "<9,>=8"
39
+ pytest-cov = "<8,>=7"
40
+ jsonlines = "4.0.0"
41
+ pyyaml = "6.0.3"
42
+
43
+ [tool.poetry.group.test]
44
+ optional = true
45
+
46
+ [project.scripts]
47
+ answer-correctness = "graphrag_eval.answer_correctness:main"
48
+
49
+ [build-system]
50
+ requires = ["poetry-core>=2.0.0"]
51
+ build-backend = "poetry.core.masonry.api"
@@ -1,47 +0,0 @@
1
- [project]
2
- name = "graphrag-eval"
3
- version = "5.0.2"
4
- description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
5
- authors = [
6
- { name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
7
- { name = "Aleksis Datseris", email = "aleksis.datseris@graphwise.ai" },
8
- { name = "Neli Hateva", email = "neli.hateva@graphwise.ai" },
9
- ]
10
- readme = "README.md"
11
- license = "Apache-2.0"
12
- requires-python = ">=3.12,<3.13"
13
-
14
- [project.urls]
15
- repository = "https://github.com/Ontotext-AD/graphrag-eval"
16
-
17
- [tool.poetry.dependencies]
18
- openai = { version = "^1.97.0", optional = true }
19
- langevals = { version = "0.1.*", optional = true }
20
- langevals-ragas = { version = "^0.1.12", optional = true }
21
-
22
- [tool.poetry.extras]
23
- openai = ["openai", "langevals", "langevals-ragas"]
24
-
25
- [tool.poetry.group.openai.dependencies]
26
- openai = "^1.97.0"
27
- langevals = "0.1.*"
28
- langevals-ragas = "^0.1.12"
29
-
30
- [tool.poetry.group.openai]
31
- optional = true
32
-
33
- [tool.poetry.group.test.dependencies]
34
- pytest = "<9,>=8"
35
- pytest-cov = "<7,>=6"
36
- jsonlines = "4.0.0"
37
- pyyaml = "^6.0.2"
38
-
39
- [tool.poetry.group.test]
40
- optional = true
41
-
42
- [project.scripts]
43
- answer-correctness = "graphrag_eval.answer_correctness:main"
44
-
45
- [build-system]
46
- requires = ["poetry-core>=2.0.0"]
47
- build-backend = "poetry.core.masonry.api"
File without changes