PyPI - graphrag-eval - Versions diffs - 5.0.2__tar.gz → 5.1.0__tar.gz - Mend

graphrag-eval 5.0.2.tar.gz → 5.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

{graphrag_eval-5.0.2 → graphrag_eval-5.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: graphrag-eval
-Version: 5.0.2
+Version: 5.1.0
 Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
 License: Apache-2.0
 Author: Philip Ganchev
@@ -9,10 +9,12 @@ Requires-Python: >=3.12,<3.13
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.12
-Provides-Extra: openai
-Requires-Dist: langevals (==0.1.*) ; extra == "openai"
-Requires-Dist: langevals-ragas (>=0.1.12,<0.2.0) ; extra == "openai"
-Requires-Dist: openai (>=1.97.0,<2.0.0) ; extra == "openai"
+Provides-Extra: ragas
+Requires-Dist: langchain-openai (==0.3.7) ; extra == "ragas"
+Requires-Dist: langchain_community (==0.3.18) ; extra == "ragas"
+Requires-Dist: langevals[ragas] (==0.1.8) ; extra == "ragas"
+Requires-Dist: litellm (==1.61.20) ; extra == "ragas"
+Requires-Dist: ragas (==0.2.9) ; extra == "ragas"
 Project-URL: Repository, https://github.com/Ontotext-AD/graphrag-eval
 Description-Content-Type: text/markdown
@@ -43,12 +45,12 @@ graphrag-eval = "*"
 To evaluate answer relevance and answer correctness:
 ```bash
-pip install 'graphrag-eval[openai]'
+pip install 'graphrag-eval[ragas]'
 ```
 or add the following dependency in your `pyproject.toml` file:
 ```toml
-graphrag-eval = {version = "*", extras = ["openai"]}
+graphrag-eval = {version = "*", extras = ["ragas"]}
 ```
 ## Maintainers
@@ -61,7 +63,7 @@ For issues or feature requests, please open [a GitHub issue](https://github.com/
 To evaluate only correctness of final answers (system responses), you can clone this repository and run the code on the command line:
 1. Prepare an input TSV file with columns `Question`, `Reference answer` and `Actual answer`
-1. Execute `poetry install --with openai`
+1. Execute `poetry install --with ragas`
 1. Execute `OPENAI_API_KEY=<your_api_key> poetry run answer-correctness -i <input_file.tsv> -o <output_file.tsv>`
 We plan to improve CLI support in future releases.
@@ -445,7 +447,6 @@ The output is a list of statistics for each question from the reference Q&A data
     retrieval_answer_recall_reason: The context contains all the transformers listed in the reference answer
     retrieval_answer_recall_cost: 0.0007
     retrieval_answer_precision: 1.0
-    retrieval_answer_precision_reason: The context contains only transformers listed in the reference answer
     retrieval_answer_precision_cost: 0.0003
     retrieval_answer_f1: 1.0
     retrieval_answer_f1_cost: 0.001
@@ -570,7 +571,6 @@ All `actual_steps` with `name` "retrieval" contain:
 - `retrieval_answer_recall_error`: (optional) error message if `retrieval_answer_recall` evaluation fails
 - `retrieval_answer_recall_cost`: cost of evaluating `retrieval_answer_recall`, in US dollars
 - `retrieval_answer_precision`: (optional) precision of the retrieved context with respect to the reference answer, if evaluation succeeds
-- `retrieval_answer_precision_reason`: (optional) LLM reasoning in evaluating `retrieval_answer_precision`
 - `retrieval_answer_precision_error`: (optional) error message if `retrieval_answer_precision` evaluation fails
 - `retrieval_answer_precision_cost`: cost of evaluating `retrieval_answer_precision`, in US dollars
 - `retrieval_answer_f1`: (optional) F1 score of the retrieved context with respect to the reference answer, if `retrieval_answer_recall` and `retrieval_answer_precision` succeed
@@ -605,6 +605,9 @@ Aggregates are:
     - `once_per_sample`: how many times each step was executed, counted only once per question
     - `empty_results`: how many times the step was executed and returned empty results
     - `errors`: how many times the step was executed and resulted in error
+  - `retrieval_answer_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_recall` for all successful questions in this template
+  - `retrieval_answer_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_precision` for all successful questions in this template
+  - `retrieval_answer_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_f1` for all successful questions in this template
   - `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` for all successful questions in this template
   - `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` for all successful questions in this template
   - `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` for all successful questions in this template
@@ -620,6 +623,9 @@ Aggregates are:
   - `answer_f1`: `sum`, `mean`, `median`, `min` and `max` for `answer_f1` of all successful questions
   - `answer_relevance`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance` of all successful questions
   - `answer_relevance_cost`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance_cost` of all successful questions
+  - `retrieval_answer_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_recall` of all successful questions
+  - `retrieval_answer_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_precision` of all successful questions
+  - `retrieval_answer_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_f1` of all successful questions
   - `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` of all successful questions
   - `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` of all successful questions
   - `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` of all successful questions
@@ -634,6 +640,9 @@ Aggregates are:
   - `answer_f1`: `mean` for `answer_f1`
   - `answer_relevance`: `mean` for `answer_relevance`
   - `answer_relevance_cost`: `mean` for `answer_relevance_cost`
+  - `retrieval_answer_recall`: `mean` for `retrieval_answer_recall`
+  - `retrieval_answer_precision`: `mean` for `retrieval_answer_precision`
+  - `retrieval_answer_f1`: `mean` for `retrieval_answer_f1`
   - `retrieval_context_recall`: `mean` for `retrieval_context_recall`
   - `retrieval_context_precision`: `mean` for `retrieval_context_precision`
   - `retrieval_context_f1`: `mean` for `retrieval_context_f1`
@@ -1031,7 +1040,7 @@ The following metrics are based on the content of retrieved documents.
 #### Context Recall@k
-The fraction of relevant items among the top *k* recommendations. It answers the question: "Of all items the user cares about, how many did we inclide in the first k spots?"
+The fraction of relevant items among the top *k* recommendations. It answers the question: "Of all items the user cares about, how many did we include in the first k spots?"
 * **Formula**:
     $`
     \frac{\text{Number of relevant items in top k}}{\text{Number of relevant items}}

{graphrag_eval-5.0.2 → graphrag_eval-5.1.0}/README.md RENAMED Viewed

@@ -25,12 +25,12 @@ graphrag-eval = "*"
 To evaluate answer relevance and answer correctness:
 ```bash
-pip install 'graphrag-eval[openai]'
+pip install 'graphrag-eval[ragas]'
 ```
 or add the following dependency in your `pyproject.toml` file:
 ```toml
-graphrag-eval = {version = "*", extras = ["openai"]}
+graphrag-eval = {version = "*", extras = ["ragas"]}
 ```
 ## Maintainers
@@ -43,7 +43,7 @@ For issues or feature requests, please open [a GitHub issue](https://github.com/
 To evaluate only correctness of final answers (system responses), you can clone this repository and run the code on the command line:
 1. Prepare an input TSV file with columns `Question`, `Reference answer` and `Actual answer`
-1. Execute `poetry install --with openai`
+1. Execute `poetry install --with ragas`
 1. Execute `OPENAI_API_KEY=<your_api_key> poetry run answer-correctness -i <input_file.tsv> -o <output_file.tsv>`
 We plan to improve CLI support in future releases.
@@ -427,7 +427,6 @@ The output is a list of statistics for each question from the reference Q&A data
     retrieval_answer_recall_reason: The context contains all the transformers listed in the reference answer
     retrieval_answer_recall_cost: 0.0007
     retrieval_answer_precision: 1.0
-    retrieval_answer_precision_reason: The context contains only transformers listed in the reference answer
     retrieval_answer_precision_cost: 0.0003
     retrieval_answer_f1: 1.0
     retrieval_answer_f1_cost: 0.001
@@ -552,7 +551,6 @@ All `actual_steps` with `name` "retrieval" contain:
 - `retrieval_answer_recall_error`: (optional) error message if `retrieval_answer_recall` evaluation fails
 - `retrieval_answer_recall_cost`: cost of evaluating `retrieval_answer_recall`, in US dollars
 - `retrieval_answer_precision`: (optional) precision of the retrieved context with respect to the reference answer, if evaluation succeeds
-- `retrieval_answer_precision_reason`: (optional) LLM reasoning in evaluating `retrieval_answer_precision`
 - `retrieval_answer_precision_error`: (optional) error message if `retrieval_answer_precision` evaluation fails
 - `retrieval_answer_precision_cost`: cost of evaluating `retrieval_answer_precision`, in US dollars
 - `retrieval_answer_f1`: (optional) F1 score of the retrieved context with respect to the reference answer, if `retrieval_answer_recall` and `retrieval_answer_precision` succeed
@@ -587,6 +585,9 @@ Aggregates are:
     - `once_per_sample`: how many times each step was executed, counted only once per question
     - `empty_results`: how many times the step was executed and returned empty results
     - `errors`: how many times the step was executed and resulted in error
+  - `retrieval_answer_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_recall` for all successful questions in this template
+  - `retrieval_answer_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_precision` for all successful questions in this template
+  - `retrieval_answer_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_f1` for all successful questions in this template
   - `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` for all successful questions in this template
   - `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` for all successful questions in this template
   - `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` for all successful questions in this template
@@ -602,6 +603,9 @@ Aggregates are:
   - `answer_f1`: `sum`, `mean`, `median`, `min` and `max` for `answer_f1` of all successful questions
   - `answer_relevance`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance` of all successful questions
   - `answer_relevance_cost`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance_cost` of all successful questions
+  - `retrieval_answer_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_recall` of all successful questions
+  - `retrieval_answer_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_precision` of all successful questions
+  - `retrieval_answer_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_answer_f1` of all successful questions
   - `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` of all successful questions
   - `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` of all successful questions
   - `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` of all successful questions
@@ -616,6 +620,9 @@ Aggregates are:
   - `answer_f1`: `mean` for `answer_f1`
   - `answer_relevance`: `mean` for `answer_relevance`
   - `answer_relevance_cost`: `mean` for `answer_relevance_cost`
+  - `retrieval_answer_recall`: `mean` for `retrieval_answer_recall`
+  - `retrieval_answer_precision`: `mean` for `retrieval_answer_precision`
+  - `retrieval_answer_f1`: `mean` for `retrieval_answer_f1`
   - `retrieval_context_recall`: `mean` for `retrieval_context_recall`
   - `retrieval_context_precision`: `mean` for `retrieval_context_precision`
   - `retrieval_context_f1`: `mean` for `retrieval_context_f1`
@@ -1013,7 +1020,7 @@ The following metrics are based on the content of retrieved documents.
 #### Context Recall@k
-The fraction of relevant items among the top *k* recommendations. It answers the question: "Of all items the user cares about, how many did we inclide in the first k spots?"
+The fraction of relevant items among the top *k* recommendations. It answers the question: "Of all items the user cares about, how many did we include in the first k spots?"
 * **Formula**:
     $`
     \frac{\text{Number of relevant items in top k}}{\text{Number of relevant items}}

{graphrag_eval-5.0.2 → graphrag_eval-5.1.0}/graphrag_eval/answer_relevance.py RENAMED Viewed

@@ -7,7 +7,7 @@ from langevals_ragas.response_relevancy import (
 def get_relevance_dict(
     question_text: str,
     actual_answer: str,
-    model_name : str = 'openai/gpt-4o-mini',
+    model_name: str = 'openai/gpt-4o-mini',
     max_tokens: int = 65_536
 ) -> dict:
     settings_dict = {

{graphrag_eval-5.0.2 → graphrag_eval-5.1.0}/graphrag_eval/steps/retrieval_answer.py RENAMED Viewed

@@ -16,16 +16,19 @@ def _evaluate(
     metric: str
 ) -> dict[str, float | str]:
     try:
-        result = evaluator.evaluate(entry)
-        if result.status == "processed":
-            return {
-                f"retrieval_answer_{metric}": result.score,
-                f"retrieval_answer_{metric}_cost": result.cost.amount,
-                f"retrieval_answer_{metric}_reason": result.details
+        le_result = evaluator.evaluate(entry)
+        if le_result.status == "processed":
+            result = {
+                f"retrieval_answer_{metric}": le_result.score,
             }
+            if le_result.cost:
+                result[f"retrieval_answer_{metric}_cost"] = le_result.cost.amount
+            if le_result.details:
+                result[f"retrieval_answer_{metric}_reason"] = le_result.details
+            return result
         else:
             return {
-                f"retrieval_answer_{metric}_error": result.details
+                f"retrieval_answer_{metric}_error": le_result.details
             }
     except Exception as e:
         return {

{graphrag_eval-5.0.2 → graphrag_eval-5.1.0}/graphrag_eval/steps/retrieval_context_texts.py RENAMED Viewed

@@ -12,14 +12,14 @@ from graphrag_eval.util import get_f1_dict
 def _evaluate(
     entry: RagasContextRecallEntry | RagasContextPrecisionEntry,
-    evauator: RagasContextRecallEvaluator | RagasContextPrecisionEvaluator,
+    evaluator: RagasContextRecallEvaluator | RagasContextPrecisionEvaluator,
     metric: str
 ) -> dict:
     try:
-        result = evauator.evaluate(entry)
+        result = evaluator.evaluate(entry)
         if result.status == "processed":
             result_dict = {
-                f"retrieval_context_{metric}": result.score,
+                f"retrieval_context_{metric}": result.score,
             }
             if result.details:
                 result_dict[f"retrieval_context_{metric}_reason"] = result.details

{graphrag_eval-5.0.2 → graphrag_eval-5.1.0}/graphrag_eval/steps/sparql.py RENAMED Viewed

@@ -1,5 +1,4 @@
 from collections import Counter
-import re
 from typing import Union
 import itertools
 import math

graphrag_eval-5.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,51 @@
+[project]
+name = "graphrag-eval"
+version = "5.1.0"
+description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
+authors = [
+    { name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
+    { name = "Aleksis Datseris", email = "aleksis.datseris@graphwise.ai" },
+    { name = "Neli Hateva", email = "neli.hateva@graphwise.ai" },
+]
+readme = "README.md"
+license = "Apache-2.0"
+requires-python = ">=3.12,<3.13"
+[project.urls]
+repository = "https://github.com/Ontotext-AD/graphrag-eval"
+[tool.poetry.dependencies]
+langevals = { version = "0.1.8", optional = true, extras = ["ragas"] }
+ragas = { version = "0.2.9", optional = true }
+langchain-openai = { version = "0.3.7", optional = true }
+langchain_community = { version = "0.3.18", optional = true }
+litellm = { version = "1.61.20", optional = true }
+[tool.poetry.extras]
+ragas = ["langevals", "ragas", "langchain-openai", "langchain_community", "litellm"]
+[tool.poetry.group.ragas.dependencies]
+langevals = { version = "0.1.8", extras = ["ragas"] }
+ragas = "0.2.9"
+langchain-openai = "0.3.7"
+langchain_community = "0.3.18"
+litellm = "1.61.20"
+[tool.poetry.group.ragas]
+optional = true
+[tool.poetry.group.test.dependencies]
+pytest = "<9,>=8"
+pytest-cov = "<8,>=7"
+jsonlines = "4.0.0"
+pyyaml = "6.0.3"
+[tool.poetry.group.test]
+optional = true
+[project.scripts]
+answer-correctness = "graphrag_eval.answer_correctness:main"
+[build-system]
+requires = ["poetry-core>=2.0.0"]
+build-backend = "poetry.core.masonry.api"

graphrag_eval-5.0.2/pyproject.toml DELETED Viewed

@@ -1,47 +0,0 @@
-[project]
-name = "graphrag-eval"
-version = "5.0.2"
-description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
-authors = [
-  { name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
-  { name = "Aleksis Datseris", email = "aleksis.datseris@graphwise.ai" },
-  { name = "Neli Hateva", email = "neli.hateva@graphwise.ai" },
-]
-readme = "README.md"
-license = "Apache-2.0"
-requires-python = ">=3.12,<3.13"
-[project.urls]
-repository = "https://github.com/Ontotext-AD/graphrag-eval"
-[tool.poetry.dependencies]
-openai = { version = "^1.97.0", optional = true }
-langevals = { version = "0.1.*", optional = true }
-langevals-ragas = { version = "^0.1.12", optional = true }
-[tool.poetry.extras]
-openai = ["openai", "langevals", "langevals-ragas"]
-[tool.poetry.group.openai.dependencies]
-openai = "^1.97.0"
-langevals = "0.1.*"
-langevals-ragas = "^0.1.12"
-[tool.poetry.group.openai]
-optional = true
-[tool.poetry.group.test.dependencies]
-pytest = "<9,>=8"
-pytest-cov = "<7,>=6"
-jsonlines = "4.0.0"
-pyyaml = "^6.0.2"
-[tool.poetry.group.test]
-optional = true
-[project.scripts]
-answer-correctness = "graphrag_eval.answer_correctness:main"
-[build-system]
-requires = ["poetry-core>=2.0.0"]
-build-backend = "poetry.core.masonry.api"