PyPI - graphrag-eval - Versions diffs - 5.1.1__tar.gz → 5.2.0__tar.gz - Mend

graphrag-eval 5.1.1.tar.gz → 5.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

{graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: graphrag-eval
-Version: 5.1.1
+Version: 5.2.0
 Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
 License: Apache-2.0
 Author: Philip Ganchev
@@ -107,6 +107,7 @@ Each step includes:
 - `output_media_type`: (optional, missing or one of `application/sparql-results+json`, `application/json`) Indicates how the output of a step must be processed
 - `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
 - `required_columns`: (optional) - required only for SPARQL query results; list of binding names, which are required for SPARQL query results to match
+- `ignore_duplicates`: (optional, defaults to `true`) For SPARQL query results, whether duplicate binding values in the expected or in the actual results should be ignored for the comparison.
 #### Reference Data

{graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/README.md RENAMED Viewed

@@ -87,6 +87,7 @@ Each step includes:
 - `output_media_type`: (optional, missing or one of `application/sparql-results+json`, `application/json`) Indicates how the output of a step must be processed
 - `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
 - `required_columns`: (optional) - required only for SPARQL query results; list of binding names, which are required for SPARQL query results to match
+- `ignore_duplicates`: (optional, defaults to `true`) For SPARQL query results, whether duplicate binding values in the expected or in the actual results should be ignored for the comparison.
 #### Reference Data

{graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/graphrag_eval/aggregation.py RENAMED Viewed

@@ -4,7 +4,6 @@ from collections.abc import Sequence
 from statistics import mean, median
 from typing import Any, Collection, Iterable
 METRICS = [
     "answer_recall",
     "answer_precision",
@@ -135,7 +134,7 @@ def compute_micro_stats(
     number_of_samples_per_template_by_status,
     stats_per_template,
     step_metrics_per_template
-):
+) -> dict:
     values = number_of_samples_per_template_by_status.values()
     micro_summary = defaultdict(dict, {
         "number_of_error_samples": sum(v["error"] for v in values),
@@ -158,7 +157,7 @@ def compute_micro_stats(
             micro_step_metrics[metric].extend(values)
     for metric, values in micro_step_metrics.items():
         micro_summary[metric] = stats_for_series(values)
-    return micro_summary
+    return dict(micro_summary)
 def compute_macro_stats(
@@ -182,7 +181,7 @@ def compute_macro_stats(
                 macro_step_metrics[metric].append(stats["mean"])
     for metric, values in macro_step_metrics.items():
         macro_summary[metric]["mean"] = mean(values or [0])
-    return macro_summary
+    return dict(macro_summary)
 def compute_aggregates(samples: list[dict]) -> dict:
@@ -205,18 +204,19 @@ def compute_aggregates(samples: list[dict]) -> dict:
         update_steps_summary(sample, steps_summary_per_template[template_id])
         update_step_metrics(sample, step_metrics_per_template[template_id])
-    summary = {}
-    summary["per_template"] = compute_per_template_stats(
-        templates_ids,
-        number_of_samples_per_template_by_status,
-        stats_per_template,
-        steps_summary_per_template,
-        step_metrics_per_template,
-    )
-    summary["micro"] = compute_micro_stats(
-        number_of_samples_per_template_by_status,
-        stats_per_template,
-        step_metrics_per_template
-    )
+    summary = {
+        "per_template": compute_per_template_stats(
+            templates_ids,
+            number_of_samples_per_template_by_status,
+            stats_per_template,
+            steps_summary_per_template,
+            step_metrics_per_template,
+        ),
+        "micro": compute_micro_stats(
+            number_of_samples_per_template_by_status,
+            stats_per_template,
+            step_metrics_per_template
+        )
+    }
     summary["macro"] = compute_macro_stats(summary["per_template"])
     return summary

{graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/graphrag_eval/steps/evaluation.py RENAMED Viewed

@@ -1,12 +1,11 @@
 import json
 from collections import defaultdict
-from typing import Any
 from collections.abc import Sequence
+from typing import Any
 from .retrieval_context_ids import recall_at_k
 from .sparql import compare_sparql_results
 Match = tuple[int, int, int, float]
 Step = dict[str, Any]
 StepsGroup = Sequence[Step]  # We will index into a group
@@ -23,6 +22,7 @@ def compare_steps_outputs(reference_step: Step, actual_step: Step) -> float:
             json.loads(actual_output),
             reference_step["required_columns"],
             reference_step.get("ordered", False),
+            reference_step.get("ignore_duplicates", True),
         )
     if reference_step.get("output_media_type") == "application/json":
         return float(json.loads(reference_output) == json.loads(actual_output))

{graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/graphrag_eval/steps/sparql.py RENAMED Viewed

@@ -1,7 +1,7 @@
-from collections import Counter
-from typing import Union
 import itertools
 import math
+from collections import Counter
+from typing import Union
 XSD_NUMERIC_TYPES = {
     "http://www.w3.org/2001/XMLSchema#integer",
@@ -35,7 +35,7 @@ def truncate(number: float, decimals: int = 0) -> float:
     elif decimals == 0:
         return math.trunc(number)
-    factor = 10.0**decimals
+    factor = 10.0 ** decimals
     return math.trunc(number * factor) / factor
@@ -137,8 +137,8 @@ def compare_values(
     actual_vars: Union[list[str], tuple[str, ...]],
     actual_var_to_values: dict[str, list],
     results_are_ordered: bool,
+    ignore_duplicates: bool,
 ) -> bool:
     if len(reference_vars) < len(actual_vars):
         for combination in itertools.combinations(actual_vars, len(reference_vars)):
             if compare_values(
@@ -147,6 +147,7 @@ def compare_values(
                 combination,
                 actual_var_to_values,
                 results_are_ordered,
+                ignore_duplicates,
             ):
                 return True
         return False
@@ -154,9 +155,9 @@ def compare_values(
     table = convert_table_dict2lines(reference_vars, reference_var_to_values)
     for permutation in itertools.permutations(actual_vars):
         actual_table = convert_table_dict2lines(permutation, actual_var_to_values)
-        if (results_are_ordered and table == actual_table) or (
-            not results_are_ordered and Counter(table) == Counter(actual_table)
-        ):
+        if (results_are_ordered and table == actual_table) or \
+            ((not results_are_ordered) and ignore_duplicates and set(table) == set(actual_table)) or \
+            ((not results_are_ordered) and (not ignore_duplicates) and Counter(table) == Counter(actual_table)):
             return True
     return False
@@ -167,6 +168,7 @@ def compare_sparql_results(
     actual_sparql_result: dict,
     required_vars: list[str],
     results_are_ordered: bool = False,
+    ignore_duplicates: bool = True,
 ) -> float:
     # DESCRIBE results
     if isinstance(actual_sparql_result, str):
@@ -208,5 +210,6 @@ def compare_sparql_results(
             actual_vars,
             actual_var_to_values,
             results_are_ordered,
+            ignore_duplicates,
         )
     )

{graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "graphrag-eval"
-version = "5.1.1"
+version = "5.2.0"
 description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
 authors = [
     { name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },