graphrag-eval 4.0.0__tar.gz → 5.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,17 @@
1
+ Metadata-Version: 2.3
2
+ Name: graphrag-eval
3
+ Version: 5.0.0
4
+ Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
5
+ License: Apache-2.0
6
+ Author: Neli Hateva
7
+ Author-email: neli.hateva@graphwise.ai
8
+ Requires-Python: >=3.12,<3.13
9
+ Classifier: License :: OSI Approved :: Apache Software License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Project-URL: Repository, https://github.com/Ontotext-AD/graphrag-eval
13
+ Description-Content-Type: text/markdown
14
+
1
15
  <p align="center">
2
16
  <img alt="Graphwise Logo" src=".github/Graphwise_Logo.jpg">
3
17
  </p>
@@ -36,7 +50,7 @@ graphrag-eval = {version = "*", extras = ["openai"]}
36
50
  ## Maintainers
37
51
 
38
52
  Developed and maintained by [Graphwise](https://graphwise.ai/).
39
- For issues or feature requests, please open [a GitHub issue](https://github.com/Ontotext-AD/qa-eval/issues).
53
+ For issues or feature requests, please open [a GitHub issue](https://github.com/Ontotext-AD/graphrag-eval/issues).
40
54
 
41
55
  ## Command Line Use
42
56
 
@@ -77,13 +91,14 @@ A reference corpus is a list of templates, each of which contains:
77
91
  - `question_text`: The natural language query passed to the LLM
78
92
  - `reference_steps`: (optional) A list of expected steps grouped by expected order of execution, where all steps in a group can be executed in any order relative to each other, but after all steps in the previous group and before all steps in the next group.
79
93
  - `reference_answer`: (optional) The expected answer to the question
94
+
80
95
  The assumption is that the final answer to the question is derived from the outputs of the steps, which are executed last (last level).
81
96
 
82
97
  Each step includes:
83
98
 
84
99
  - `name`: The type of step being performed (e.g., `sparql_query`)
85
100
  - `args`: Arguments of the step (e.g., arguments to a tool used in the step, such as a SPARQL query)
86
- - `output`: The expected output from the step
101
+ - `output`: The expected output from the step.
87
102
  - `output_media_type`: (optional, missing or one of `application/sparql-results+json`, `application/json`) Indicates how the output of a step must be processed
88
103
  - `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
89
104
  - `required_columns`: (optional) - required only for SPARQL query results; list of binding names, which are required for SPARQL query results to match
@@ -99,7 +114,22 @@ The example corpus below illustrates a minimal but realistic Q&A dataset, showin
99
114
  question_text: List all transformers within Substation OSLO
100
115
  reference_answer: OSLO T1, OSLO T2
101
116
  reference_steps:
102
- - - name: sparql_query
117
+ - - name: retrieval
118
+ args:
119
+ query: transformers Substation OSLO
120
+ k: 2
121
+ output: |-
122
+ [
123
+ {
124
+ "id": "http://example.com/resource/doc/1",
125
+ "text": "Transformer OSLO T1 is in Substation Oslo."
126
+ },
127
+ {
128
+ "id": "http://example.com/resource/doc/2",
129
+ "text": "Transformer OSLO T2 is in Substation Oslo."
130
+ }
131
+ ]
132
+ - name: sparql_query
103
133
  args:
104
134
  query: |2
105
135
 
@@ -253,6 +283,16 @@ Below is an example response from the question-answering system for a single que
253
283
  "total_tokens": 298753,
254
284
  "elapsed_sec": 46.48961806297302,
255
285
  "actual_steps": [
286
+ {
287
+ "name": "retrieval",
288
+ "args": {
289
+ "query": "transformers Substation OSLO",
290
+ "k": 2
291
+ },
292
+ "id": "call_3",
293
+ "status": "success",
294
+ "output": "[\n {\n \"id\": \"http://example.com/resource/doc/1\",\n \"text\": \"Transformer OSLO T1 is in Substation Oslo.\"\n },\n {\n \"id\": \"http://example.com/resource/doc/2\",\n \"text\": \"Transformer OSLO T2 is in Substation Oslo.\"\n }\n]"
295
+ },
256
296
  {
257
297
  "name": "autocomplete_search",
258
298
  "args": {
@@ -323,7 +363,23 @@ The output is a list of statistics for each question from the reference Q&A data
323
363
  question_text: List all transformers within Substation OSLO
324
364
  reference_answer: OSLO T1, OSLO T2
325
365
  reference_steps:
326
- - - name: sparql_query
366
+ - - name: retrieval
367
+ args:
368
+ query: transformers Substation OSLO
369
+ k: 2
370
+ matches: call_3
371
+ output: |-
372
+ [
373
+ {
374
+ "id": "http://example.com/resource/doc/1",
375
+ "text": "Transformer OSLO T1 is in Substation Oslo."
376
+ },
377
+ {
378
+ "id": "http://example.com/resource/doc/2",
379
+ "text": "Transformer OSLO T2 is in Substation Oslo."
380
+ }
381
+ ]
382
+ - name: sparql_query
327
383
  args:
328
384
  query: |2
329
385
 
@@ -364,6 +420,31 @@ The output is a list of statistics for each question from the reference Q&A data
364
420
  answer_relevance: 0.9
365
421
  answer_relevance_cost: 0.0007
366
422
  actual_steps:
423
+ - name: retrieval
424
+ id: call_3
425
+ args:
426
+ query: transformers Substation OSLO
427
+ k: 2
428
+ status: success
429
+ output: |-
430
+ [
431
+ {
432
+ "id": "http://example.com/resource/doc/1",
433
+ "text": "Transformer OSLO T1 is in Substation Oslo."
434
+ },
435
+ {
436
+ "id": "http://example.com/resource/doc/2",
437
+ "text": "Transformer OSLO T2 is in Substation Oslo."
438
+ }
439
+ ]
440
+ retrieval_answer_recall: 1.0
441
+ retrieval_answer_recall_reason: The context contains all the transformers listed in the reference answer
442
+ retrieval_answer_recall_cost: 0.0007
443
+ retrieval_answer_precision: 1.0
444
+ retrieval_answer_precision_reason: The context contains only transformers listed in the reference answer
445
+ retrieval_answer_precision_cost: 0.0003
446
+ retrieval_answer_f1: 1.0
447
+ retrieval_answer_f1_cost: 0.001
367
448
  - name: autocomplete_search
368
449
  args:
369
450
  query: OSLO
@@ -470,12 +551,33 @@ The output is a list of statistics for each question from the reference Q&A data
470
551
  - `answer_relevance_error`: (optional) error message if answer relevance evaluation failed
471
552
  - `answer_relevance_cost`: The LLM use cost of computing `answer_relevance`, in US dollars
472
553
  - `actual_steps`: (optional) copy of the steps in the evaluation target, if specified there
473
- - `steps_score`: a real number between 0 and 1, computed by comparing the results of the last steps that were executed to the reference's last group of steps. If there is no match in the actual steps, then the score is `0`. Otherwise, it is calculated as the number of the matched steps on the last group divided by the total number of steps in the last group.
554
+ - `steps_score`: a real number between 0 and 1, computed by comparing the results of the last executed steps to the output of the reference's last group of steps.
555
+ - If there is no match in the actual steps, then the score is `0.0`
556
+ - If the executed step's name is "retrieval" and the last reference group contains a retrieval step, then the score is the [recall at k](#context-recallk) of the retrieved document ids with respect to the reference.
557
+ - Otherwise, the score is the number of the matched steps on the last group divided by the total number of steps in the last group.
474
558
  - `input_tokens`: input tokens usage
475
559
  - `output_tokens`: output tokens usage
476
560
  - `total_tokens`: total tokens usage
477
561
  - `elapsed_sec`: elapsed seconds
478
562
 
563
+ All `actual_steps` with `name` "retrieval" contain:
564
+ - `retrieval_answer_recall`: (optional) recall of the retrieved context with respect to the reference answer, if evaluation succeeds
565
+ - `retrieval_answer_recall_reason`: (optional) LLM reasoning in evaluating `retrieval_answer_recall`
566
+ - `retrieval_answer_recall_error`: (optional) error message if `retrieval_answer_recall` evaluation fails
567
+ - `retrieval_answer_recall_cost`: cost of evaluating `retrieval_answer_recall`, in US dollars
568
+ - `retrieval_answer_precision`: (optional) precision of the retrieved context with respect to the reference answer, if evaluation succeeds
569
+ - `retrieval_answer_precision_reason`: (optional) LLM reasoning in evaluating `retrieval_answer_precision`
570
+ - `retrieval_answer_precision_error`: (optional) error message if `retrieval_answer_precision` evaluation fails
571
+ - `retrieval_answer_precision_cost`: cost of evaluating `retrieval_answer_precision`, in US dollars
572
+ - `retrieval_answer_f1`: (optional) F1 score of the retrieved context with respect to the reference answer, if `retrieval_answer_recall` and `retrieval_answer_precision` succeed
573
+ - `retrieval_answer_f1_cost`: The sum of `retrieval_answer_recall_cost` and `retrieval_answer_precision_cost`
574
+ - `retrieval_context_recall`: (optional) recall of the retrieved context with respect to the reference answer, if evaluation succeeds
575
+ - `retrieval_context_recall_error`: (optional) error message if `retrieval_context_recall` evaluation fails
576
+ - `retrieval_context_precision`: (optional) precision of the retrieved context with respect to the reference answer, if evaluation succeeds
577
+ - `retrieval_context_precision_error`: (optional) error message if `retrieval_context_precision` evaluation fails
578
+ - `retrieval_context_f1`: (optional) F1 score of the retrieved context with respect to the reference answer, if `retrieval_context_recall` and `retrieval_context_precision` succeed
579
+
580
+
479
581
  #### Aggregates Keys
480
582
 
481
583
  The `aggregates` object provides aggregated evaluation metrics.
@@ -499,6 +601,9 @@ Aggregates are:
499
601
  - `once_per_sample`: how many times each step was executed, counted only once per question
500
602
  - `empty_results`: how many times the step was executed and returned empty results
501
603
  - `errors`: how many times the step was executed and resulted in error
604
+ - `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` for all successful questions in this template
605
+ - `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` for all successful questions in this template
606
+ - `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` for all successful questions in this template
502
607
  - `micro`: statistics across questions, regardless of template. It includes:
503
608
  - `number_of_error_samples`: total number of questions, which resulted in error response
504
609
  - `number_of_success_samples`: total number of questions, which resulted in successful response
@@ -511,6 +616,9 @@ Aggregates are:
511
616
  - `answer_f1`: `sum`, `mean`, `median`, `min` and `max` for `answer_f1` of all successful questions
512
617
  - `answer_relevance`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance` of all successful questions
513
618
  - `answer_relevance_cost`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance_cost` of all successful questions
619
+ - `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` of all successful questions
620
+ - `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` of all successful questions
621
+ - `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` of all successful questions
514
622
  - `steps_score`: `sum`, `mean`, `median`, `min` and `max` for `steps_score` of all successful questions
515
623
  - `macro`: averages across templates, i.e., the mean of each metric per template, averaged. It includes:
516
624
  - `input_tokens`: `mean` for `input_tokens`
@@ -522,6 +630,9 @@ Aggregates are:
522
630
  - `answer_f1`: `mean` for `answer_f1`
523
631
  - `answer_relevance`: `mean` for `answer_relevance`
524
632
  - `answer_relevance_cost`: `mean` for `answer_relevance_cost`
633
+ - `retrieval_context_recall`: `mean` for `retrieval_context_recall`
634
+ - `retrieval_context_precision`: `mean` for `retrieval_context_precision`
635
+ - `retrieval_context_f1`: `mean` for `retrieval_context_f1`
525
636
  - `steps_score`: `mean` for `steps_score`
526
637
 
527
638
  #### Example Aggregates
@@ -898,18 +1009,30 @@ macro:
898
1009
  mean: 25.911653497483996
899
1010
  ```
900
1011
 
1012
+ ### SPARQL queries comparison
1013
+
1014
+ The algorithm iterates over all subsets of columns in the actual result of the same size as in the reference result.
1015
+ For each subset, it compares the set of columns (skipping optional columns).
1016
+ It matches floating-point numbers up to a 1e-8 precision. It does not do this for special types such as duration.
1017
+
1018
+ The average time complexity is O(nr\*nc_ref!\*binomial(nc_act, nc_ref)), where
1019
+
1020
+ * *nr* is the number of rows in the actual result
1021
+ * *nc_ref* is the number of columns in the reference result
1022
+ * *nc_act* is the number of columns in the actual result
1023
+
901
1024
  ### Retrieval Evaluation
902
1025
 
903
- The following metrics are based on the ids of retrieved documents.
1026
+ The following metrics are based on the content of retrieved documents.
904
1027
 
905
- #### Recall@k Metric
1028
+ #### Context Recall@k
906
1029
 
907
1030
  The fraction of all relevant items that appear among the top *k* recommendations. It answers the question: "Of all items the user cares about, how many did we include in the first k spots?"
908
1031
  * **Formula**:
909
1032
  $`
910
1033
  \frac{\text{Number of relevant items in top k}}{\text{Number of relevant items}}
911
1034
  `$
912
- * **Calculation**: Count the number of relevant items in the top `k` retrieved results; divide that by the *total* number of relevant items.
1035
+ * **Calculation**: Count the number of relevant items in the top `k` retrieved results; divide that by the number of relevant items, counting at most the first `k` of them.
913
1036
  * **Example**: Suppose there are 4 relevant documents for a given query. Suppose our system retrieves 3 of them in the top 5 results (`k=5`). Recall@5 is `3 / 4 = 0.75`.
914
1037
 
915
1038
  ```python
@@ -920,7 +1043,7 @@ recall_at_k(
920
1043
  ) # => 0.75
921
1044
  ```
922
1045
 
923
- #### Average Precision (AP) Metric
1046
+ #### Context Precision@k
924
1047
 
925
1048
  Evaluates a ranked list of recommendations by looking at the precision at the position of each correctly retrieved item. It rewards systems for placing relevant items higher up in the list. It's more sophisticated than just looking at precision at a single cutoff because it considers the entire ranking.
926
1049
  * **Formula**:
@@ -950,3 +1073,4 @@ average_precision(
950
1073
  retrieved_docs=[1, 4, 3, 5, 7]
951
1074
  ) # ~=> 0.8056
952
1075
  ```
1076
+
@@ -1,17 +1,3 @@
1
- Metadata-Version: 2.3
2
- Name: graphrag-eval
3
- Version: 4.0.0
4
- Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
5
- License: Apache-2.0
6
- Author: Neli Hateva
7
- Author-email: neli.hateva@graphwise.ai
8
- Requires-Python: >=3.12,<3.13
9
- Classifier: License :: OSI Approved :: Apache Software License
10
- Classifier: Programming Language :: Python :: 3
11
- Classifier: Programming Language :: Python :: 3.12
12
- Project-URL: Repository, https://github.com/Ontotext-AD/qa-eval
13
- Description-Content-Type: text/markdown
14
-
15
1
  <p align="center">
16
2
  <img alt="Graphwise Logo" src=".github/Graphwise_Logo.jpg">
17
3
  </p>
@@ -50,7 +36,7 @@ graphrag-eval = {version = "*", extras = ["openai"]}
50
36
  ## Maintainers
51
37
 
52
38
  Developed and maintained by [Graphwise](https://graphwise.ai/).
53
- For issues or feature requests, please open [a GitHub issue](https://github.com/Ontotext-AD/qa-eval/issues).
39
+ For issues or feature requests, please open [a GitHub issue](https://github.com/Ontotext-AD/graphrag-eval/issues).
54
40
 
55
41
  ## Command Line Use
56
42
 
@@ -91,13 +77,14 @@ A reference corpus is a list of templates, each of which contains:
91
77
  - `question_text`: The natural language query passed to the LLM
92
78
  - `reference_steps`: (optional) A list of expected steps grouped by expected order of execution, where all steps in a group can be executed in any order relative to each other, but after all steps in the previous group and before all steps in the next group.
93
79
  - `reference_answer`: (optional) The expected answer to the question
80
+
94
81
  The assumption is that the final answer to the question is derived from the outputs of the steps, which are executed last (last level).
95
82
 
96
83
  Each step includes:
97
84
 
98
85
  - `name`: The type of step being performed (e.g., `sparql_query`)
99
86
  - `args`: Arguments of the step (e.g., arguments to a tool used in the step, such as a SPARQL query)
100
- - `output`: The expected output from the step
87
+ - `output`: The expected output from the step.
101
88
  - `output_media_type`: (optional, missing or one of `application/sparql-results+json`, `application/json`) Indicates how the output of a step must be processed
102
89
  - `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
103
90
  - `required_columns`: (optional) - required only for SPARQL query results; list of binding names, which are required for SPARQL query results to match
@@ -113,7 +100,22 @@ The example corpus below illustrates a minimal but realistic Q&A dataset, showin
113
100
  question_text: List all transformers within Substation OSLO
114
101
  reference_answer: OSLO T1, OSLO T2
115
102
  reference_steps:
116
- - - name: sparql_query
103
+ - - name: retrieval
104
+ args:
105
+ query: transformers Substation OSLO
106
+ k: 2
107
+ output: |-
108
+ [
109
+ {
110
+ "id": "http://example.com/resource/doc/1",
111
+ "text": "Transformer OSLO T1 is in Substation Oslo."
112
+ },
113
+ {
114
+ "id": "http://example.com/resource/doc/2",
115
+ "text": "Transformer OSLO T2 is in Substation Oslo."
116
+ }
117
+ ]
118
+ - name: sparql_query
117
119
  args:
118
120
  query: |2
119
121
 
@@ -267,6 +269,16 @@ Below is an example response from the question-answering system for a single que
267
269
  "total_tokens": 298753,
268
270
  "elapsed_sec": 46.48961806297302,
269
271
  "actual_steps": [
272
+ {
273
+ "name": "retrieval",
274
+ "args": {
275
+ "query": "transformers Substation OSLO",
276
+ "k": 2
277
+ },
278
+ "id": "call_3",
279
+ "status": "success",
280
+ "output": "[\n {\n \"id\": \"http://example.com/resource/doc/1\",\n \"text\": \"Transformer OSLO T1 is in Substation Oslo.\"\n },\n {\n \"id\": \"http://example.com/resource/doc/2\",\n \"text\": \"Transformer OSLO T2 is in Substation Oslo.\"\n }\n]"
281
+ },
270
282
  {
271
283
  "name": "autocomplete_search",
272
284
  "args": {
@@ -337,7 +349,23 @@ The output is a list of statistics for each question from the reference Q&A data
337
349
  question_text: List all transformers within Substation OSLO
338
350
  reference_answer: OSLO T1, OSLO T2
339
351
  reference_steps:
340
- - - name: sparql_query
352
+ - - name: retrieval
353
+ args:
354
+ query: transformers Substation OSLO
355
+ k: 2
356
+ matches: call_3
357
+ output: |-
358
+ [
359
+ {
360
+ "id": "http://example.com/resource/doc/1",
361
+ "text": "Transformer OSLO T1 is in Substation Oslo."
362
+ },
363
+ {
364
+ "id": "http://example.com/resource/doc/2",
365
+ "text": "Transformer OSLO T2 is in Substation Oslo."
366
+ }
367
+ ]
368
+ - name: sparql_query
341
369
  args:
342
370
  query: |2
343
371
 
@@ -378,6 +406,31 @@ The output is a list of statistics for each question from the reference Q&A data
378
406
  answer_relevance: 0.9
379
407
  answer_relevance_cost: 0.0007
380
408
  actual_steps:
409
+ - name: retrieval
410
+ id: call_3
411
+ args:
412
+ query: transformers Substation OSLO
413
+ k: 2
414
+ status: success
415
+ output: |-
416
+ [
417
+ {
418
+ "id": "http://example.com/resource/doc/1",
419
+ "text": "Transformer OSLO T1 is in Substation Oslo."
420
+ },
421
+ {
422
+ "id": "http://example.com/resource/doc/2",
423
+ "text": "Transformer OSLO T2 is in Substation Oslo."
424
+ }
425
+ ]
426
+ retrieval_answer_recall: 1.0
427
+ retrieval_answer_recall_reason: The context contains all the transformers listed in the reference answer
428
+ retrieval_answer_recall_cost: 0.0007
429
+ retrieval_answer_precision: 1.0
430
+ retrieval_answer_precision_reason: The context contains only transformers listed in the reference answer
431
+ retrieval_answer_precision_cost: 0.0003
432
+ retrieval_answer_f1: 1.0
433
+ retrieval_answer_f1_cost: 0.001
381
434
  - name: autocomplete_search
382
435
  args:
383
436
  query: OSLO
@@ -484,12 +537,33 @@ The output is a list of statistics for each question from the reference Q&A data
484
537
  - `answer_relevance_error`: (optional) error message if answer relevance evaluation failed
485
538
  - `answer_relevance_cost`: The LLM use cost of computing `answer_relevance`, in US dollars
486
539
  - `actual_steps`: (optional) copy of the steps in the evaluation target, if specified there
487
- - `steps_score`: a real number between 0 and 1, computed by comparing the results of the last steps that were executed to the reference's last group of steps. If there is no match in the actual steps, then the score is `0`. Otherwise, it is calculated as the number of the matched steps on the last group divided by the total number of steps in the last group.
540
+ - `steps_score`: a real number between 0 and 1, computed by comparing the results of the last executed steps to the output of the reference's last group of steps.
541
+ - If there is no match in the actual steps, then the score is `0.0`
542
+ - If the executed step's name is "retrieval" and the last reference group contains a retrieval step, then the score is the [recall at k](#context-recallk) of the retrieved document ids with respect to the reference.
543
+ - Otherwise, the score is the number of the matched steps on the last group divided by the total number of steps in the last group.
488
544
  - `input_tokens`: input tokens usage
489
545
  - `output_tokens`: output tokens usage
490
546
  - `total_tokens`: total tokens usage
491
547
  - `elapsed_sec`: elapsed seconds
492
548
 
549
+ All `actual_steps` with `name` "retrieval" contain:
550
+ - `retrieval_answer_recall`: (optional) recall of the retrieved context with respect to the reference answer, if evaluation succeeds
551
+ - `retrieval_answer_recall_reason`: (optional) LLM reasoning in evaluating `retrieval_answer_recall`
552
+ - `retrieval_answer_recall_error`: (optional) error message if `retrieval_answer_recall` evaluation fails
553
+ - `retrieval_answer_recall_cost`: cost of evaluating `retrieval_answer_recall`, in US dollars
554
+ - `retrieval_answer_precision`: (optional) precision of the retrieved context with respect to the reference answer, if evaluation succeeds
555
+ - `retrieval_answer_precision_reason`: (optional) LLM reasoning in evaluating `retrieval_answer_precision`
556
+ - `retrieval_answer_precision_error`: (optional) error message if `retrieval_answer_precision` evaluation fails
557
+ - `retrieval_answer_precision_cost`: cost of evaluating `retrieval_answer_precision`, in US dollars
558
+ - `retrieval_answer_f1`: (optional) F1 score of the retrieved context with respect to the reference answer, if `retrieval_answer_recall` and `retrieval_answer_precision` succeed
559
+ - `retrieval_answer_f1_cost`: The sum of `retrieval_answer_recall_cost` and `retrieval_answer_precision_cost`
560
+ - `retrieval_context_recall`: (optional) recall of the retrieved context with respect to the reference answer, if evaluation succeeds
561
+ - `retrieval_context_recall_error`: (optional) error message if `retrieval_context_recall` evaluation fails
562
+ - `retrieval_context_precision`: (optional) precision of the retrieved context with respect to the reference answer, if evaluation succeeds
563
+ - `retrieval_context_precision_error`: (optional) error message if `retrieval_context_precision` evaluation fails
564
+ - `retrieval_context_f1`: (optional) F1 score of the retrieved context with respect to the reference answer, if `retrieval_context_recall` and `retrieval_context_precision` succeed
565
+
566
+
493
567
  #### Aggregates Keys
494
568
 
495
569
  The `aggregates` object provides aggregated evaluation metrics.
@@ -513,6 +587,9 @@ Aggregates are:
513
587
  - `once_per_sample`: how many times each step was executed, counted only once per question
514
588
  - `empty_results`: how many times the step was executed and returned empty results
515
589
  - `errors`: how many times the step was executed and resulted in error
590
+ - `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` for all successful questions in this template
591
+ - `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` for all successful questions in this template
592
+ - `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` for all successful questions in this template
516
593
  - `micro`: statistics across questions, regardless of template. It includes:
517
594
  - `number_of_error_samples`: total number of questions, which resulted in error response
518
595
  - `number_of_success_samples`: total number of questions, which resulted in successful response
@@ -525,6 +602,9 @@ Aggregates are:
525
602
  - `answer_f1`: `sum`, `mean`, `median`, `min` and `max` for `answer_f1` of all successful questions
526
603
  - `answer_relevance`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance` of all successful questions
527
604
  - `answer_relevance_cost`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance_cost` of all successful questions
605
+ - `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` of all successful questions
606
+ - `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` of all successful questions
607
+ - `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` of all successful questions
528
608
  - `steps_score`: `sum`, `mean`, `median`, `min` and `max` for `steps_score` of all successful questions
529
609
  - `macro`: averages across templates, i.e., the mean of each metric per template, averaged. It includes:
530
610
  - `input_tokens`: `mean` for `input_tokens`
@@ -536,6 +616,9 @@ Aggregates are:
536
616
  - `answer_f1`: `mean` for `answer_f1`
537
617
  - `answer_relevance`: `mean` for `answer_relevance`
538
618
  - `answer_relevance_cost`: `mean` for `answer_relevance_cost`
619
+ - `retrieval_context_recall`: `mean` for `retrieval_context_recall`
620
+ - `retrieval_context_precision`: `mean` for `retrieval_context_precision`
621
+ - `retrieval_context_f1`: `mean` for `retrieval_context_f1`
539
622
  - `steps_score`: `mean` for `steps_score`
540
623
 
541
624
  #### Example Aggregates
@@ -912,18 +995,30 @@ macro:
912
995
  mean: 25.911653497483996
913
996
  ```
914
997
 
998
+ ### SPARQL queries comparison
999
+
1000
+ The algorithm iterates over all subsets of columns in the actual result of the same size as in the reference result.
1001
+ For each subset, it compares the set of columns (skipping optional columns).
1002
+ It matches floating-point numbers up to a 1e-8 precision. It does not do this for special types such as duration.
1003
+
1004
+ The average time complexity is O(nr\*nc_ref!\*binomial(nc_act, nc_ref)), where
1005
+
1006
+ * *nr* is the number of rows in the actual result
1007
+ * *nc_ref* is the number of columns in the reference result
1008
+ * *nc_act* is the number of columns in the actual result
1009
+
915
1010
  ### Retrieval Evaluation
916
1011
 
917
- The following metrics are based on the ids of retrieved documents.
1012
+ The following metrics are based on the content of retrieved documents.
918
1013
 
919
- #### Recall@k Metric
1014
+ #### Context Recall@k
920
1015
 
921
1016
  The fraction of all relevant items that appear among the top *k* recommendations. It answers the question: "Of all items the user cares about, how many did we include in the first k spots?"
922
1017
  * **Formula**:
923
1018
  $`
924
1019
  \frac{\text{Number of relevant items in top k}}{\text{Number of relevant items}}
925
1020
  `$
926
- * **Calculation**: Count the number of relevant items in the top `k` retrieved results; divide that by the *total* number of relevant items.
1021
+ * **Calculation**: Count the number of relevant items in the top `k` retrieved results; divide that by the number of relevant items, counting at most the first `k` of them.
927
1022
  * **Example**: Suppose there are 4 relevant documents for a given query. Suppose our system retrieves 3 of them in the top 5 results (`k=5`). Recall@5 is `3 / 4 = 0.75`.
928
1023
 
929
1024
  ```python
@@ -934,7 +1029,7 @@ recall_at_k(
934
1029
  ) # => 0.75
935
1030
  ```
936
1031
 
937
- #### Average Precision (AP) Metric
1032
+ #### Context Precision@k
938
1033
 
939
1034
  Evaluates a ranked list of recommendations by looking at the precision at the position of each correctly retrieved item. It rewards systems for placing relevant items higher up in the list. It's more sophisticated than just looking at precision at a single cutoff because it considers the entire ranking.
940
1035
  * **Formula**:
@@ -964,4 +1059,3 @@ average_precision(
964
1059
  retrieved_docs=[1, 4, 3, 5, 7]
965
1060
  ) # ~=> 0.8056
966
1061
  ```
967
-
@@ -16,7 +16,22 @@ METRICS = [
16
16
  "total_tokens",
17
17
  "elapsed_sec"
18
18
  ]
19
-
19
+ STEPS_METRICS = {
20
+ "retrieval": [
21
+ "retrieval_answer_precision",
22
+ "retrieval_answer_precision_cost",
23
+ "retrieval_answer_recall",
24
+ "retrieval_answer_recall_cost",
25
+ "retrieval_answer_f1",
26
+ "retrieval_answer_f1_cost",
27
+ "retrieval_context_precision",
28
+ "retrieval_context_precision_cost",
29
+ "retrieval_context_recall",
30
+ "retrieval_context_recall_cost",
31
+ "retrieval_context_f1",
32
+ "retrieval_context_f1_cost",
33
+ ]
34
+ }
20
35
  PROTECTED_METRICS = [
21
36
  "input_tokens",
22
37
  "output_tokens",
@@ -35,6 +50,19 @@ def stats_for_series(values: Iterable[int | float]) -> dict[str, float]:
35
50
  }
36
51
 
37
52
 
53
def update_step_metrics_per_template(
    sample: dict,
    step_metrics_per_template: dict,
    template_id: str
) -> None:
    """Collect the step-level metric values of one sample under its template.

    For every step in ``sample["actual_steps"]`` whose ``name`` is a key of
    ``STEPS_METRICS``, each metric listed there that is present on the step
    with a non-``None`` value is appended to
    ``step_metrics_per_template[template_id][metric]``.

    Args:
        sample: Evaluation result for a single question. ``actual_steps`` is
            optional; a missing key, a null value and an empty list are all
            treated as "no steps".
        step_metrics_per_template: Nested mapping
            ``template_id -> metric name -> list of values``, mutated in
            place. It is indexed without prior insertion, so it must create
            missing entries on access (e.g. a nested ``defaultdict``).
        template_id: Identifier of the template the sample belongs to.
    """
    # `actual_steps` may be present but null (e.g. `actual_steps: null` in a
    # serialized result); `.get(key, [])` would return None in that case and
    # the loop would raise TypeError, so fall back to an empty list instead.
    for step in sample.get("actual_steps") or []:
        if step["name"] not in STEPS_METRICS:
            continue
        for metric in STEPS_METRICS[step["name"]]:
            value = step.get(metric)
            # Metrics whose evaluation failed or was skipped are absent/None
            # and must not be counted in the aggregates.
            if value is not None:
                step_metrics_per_template[template_id][metric].append(value)
64
+
65
+
38
66
  def update_stats_per_template(
39
67
  sample: dict,
40
68
  stats_per_template: dict,
@@ -76,6 +104,7 @@ def compute_aggregates(samples: list[dict]) -> dict:
76
104
  number_of_samples_per_template_by_status = defaultdict(lambda: defaultdict(int))
77
105
  stats_per_template = defaultdict(lambda: defaultdict(list))
78
106
  steps_summary_per_template = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
107
+ step_metrics_per_template = defaultdict(lambda: defaultdict(list))
79
108
 
80
109
  # Compute per-template stats
81
110
  templates_ids = set()
@@ -94,6 +123,11 @@ def compute_aggregates(samples: list[dict]) -> dict:
94
123
  steps_summary_per_template,
95
124
  template_id
96
125
  )
126
+ update_step_metrics_per_template(
127
+ sample,
128
+ step_metrics_per_template,
129
+ template_id
130
+ )
97
131
 
98
132
  summary = {"per_template": {}}
99
133
 
@@ -115,6 +149,13 @@ def compute_aggregates(samples: list[dict]) -> dict:
115
149
  if series or metric in PROTECTED_METRICS:
116
150
  template_summary[metric] = stats_for_series(series)
117
151
 
152
+ # Add step metrics for the template
153
+ template_step_metrics = {}
154
+ for metric, values in step_metrics_per_template[template_id].items():
155
+ template_step_metrics[metric] = stats_for_series(values)
156
+ if template_step_metrics:
157
+ template_summary["steps"].update(template_step_metrics)
158
+
118
159
  summary["per_template"][template_id] = template_summary
119
160
 
120
161
  # Add micro stats
@@ -137,6 +178,17 @@ def compute_aggregates(samples: list[dict]) -> dict:
137
178
  if series or metric in PROTECTED_METRICS:
138
179
  summary["micro"][metric] = stats_for_series(series)
139
180
 
181
+ # Add micro step metrics
182
+ micro_step_metrics = defaultdict(list)
183
+ for template_metrics in step_metrics_per_template.values():
184
+ for metric, values in template_metrics.items():
185
+ micro_step_metrics[metric].extend(values)
186
+ step_metrics = {
187
+ metric: stats_for_series(values)
188
+ for metric, values in micro_step_metrics.items()
189
+ }
190
+ summary["micro"].update(step_metrics)
191
+
140
192
  # Add macro stats
141
193
  summary["macro"] = {}
142
194
  for metric in METRICS:
@@ -148,4 +200,17 @@ def compute_aggregates(samples: list[dict]) -> dict:
148
200
  if means or metric in PROTECTED_METRICS:
149
201
  summary["macro"][metric] = {"mean": mean(means) if means else 0}
150
202
 
203
+ # Add macro step metrics
204
+ macro_step_metrics = defaultdict(list)
205
+ for template_id, template_summary in summary["per_template"].items():
206
+ if "steps" in template_summary:
207
+ for metric, stats in template_summary["steps"].items():
208
+ if "mean" in stats:
209
+ macro_step_metrics[metric].append(stats["mean"])
210
+ step_metrics = {
211
+ metric: {"mean": mean(values) if values else 0}
212
+ for metric, values in macro_step_metrics.items()
213
+ }
214
+ summary["macro"].update(step_metrics)
215
+
151
216
  return summary
@@ -48,7 +48,7 @@ def run_evaluation(
48
48
  actual_result,
49
49
  )
50
50
  )
51
- if "steps" in actual_result:
51
+ if "actual_steps" in actual_result:
52
52
  eval_result.update(
53
53
  get_steps_evaluation_result_dict(question, actual_result)
54
54
  )
@@ -1,13 +1,14 @@
1
1
  import json
2
2
  from collections import defaultdict
3
3
 
4
- from .retrieval import recall_at_k
4
+ from .retrieval_context_ids import recall_at_k
5
5
  from .sparql import compare_sparql_results
6
6
 
7
7
 
8
8
  def compare_steps_outputs(reference: dict, actual: dict) -> float:
9
- ref_output = reference["output"]
9
+ ref_output = reference.get("output")
10
10
  act_output = actual["output"]
11
+ assert ref_output, "Reference step output is mandatory"
11
12
  if reference.get("output_media_type") == "application/sparql-results+json":
12
13
  return compare_sparql_results(
13
14
  json.loads(ref_output),
@@ -17,9 +18,11 @@ def compare_steps_outputs(reference: dict, actual: dict) -> float:
17
18
  )
18
19
  if reference.get("output_media_type") == "application/json":
19
20
  return float(json.loads(ref_output) == json.loads(act_output))
20
- if reference["name"] == "retrieval":
21
- k = reference["args"]["k"]
22
- return recall_at_k(ref_output, act_output, k)
21
+ if reference["name"] == actual["name"] == "retrieval":
22
+ ref_contexts_ids = [c["id"] for c in json.loads(ref_output)]
23
+ act_contexts_ids = [c["id"] for c in json.loads(act_output)]
24
+ k = actual["args"]["k"]
25
+ return recall_at_k(ref_contexts_ids, act_contexts_ids, k)
23
26
  return float(ref_output == act_output)
24
27
 
25
28
 
@@ -95,9 +98,11 @@ def get_steps_matches(
95
98
 
96
99
  def evaluate_steps(
97
100
  reference_steps_groups: list[list[dict]],
98
- actual_steps: list[dict]
101
+ actual_steps: list[dict],
102
+ matches: list[tuple[int, int, int, float]] | None = None
99
103
  ) -> float:
100
- matches = get_steps_matches(reference_steps_groups, actual_steps)
104
+ if matches is None:
105
+ matches = get_steps_matches(reference_steps_groups, actual_steps)
101
106
  matches_by_group = defaultdict(list)
102
107
  scores_by_group = defaultdict(float)
103
108
  for ref_group_idx, ref_match_idx, actual_idx, score in matches:
@@ -110,11 +115,33 @@ def evaluate_steps(
110
115
 
111
116
 
112
117
  def get_steps_evaluation_result_dict(reference: dict, target: dict) -> dict:
113
- act_steps = target["steps"]
114
118
  eval_result = {}
119
+ act_steps = target.get("actual_steps", [])
115
120
  eval_result["actual_steps"] = act_steps
121
+ for act_step in act_steps:
122
+ if act_step["name"] == "retrieval":
123
+ from .retrieval_answer import get_retrieval_evaluation_dict
124
+ result = get_retrieval_evaluation_dict(
125
+ question_text=reference["question_text"],
126
+ reference_answer=reference.get("reference_answer"),
127
+ actual_answer=target.get("actual_answer"),
128
+ actual_contexts=json.loads(act_step["output"])
129
+ )
130
+ act_step.update(result)
116
131
  if "reference_steps" in reference:
117
132
  ref_steps = reference["reference_steps"]
118
- steps_score = evaluate_steps(ref_steps, act_steps)
133
+ matches = get_steps_matches(ref_steps, act_steps)
134
+ steps_score = evaluate_steps(ref_steps, act_steps, matches)
119
135
  eval_result["steps_score"] = steps_score
136
+ for ref_group_idx, ref_match_idx, act_idx, _ in matches:
137
+ ref_step = ref_steps[ref_group_idx][ref_match_idx]
138
+ act_step = act_steps[act_idx]
139
+ if ref_step["name"] == "retrieval":
140
+ from .retrieval_context_texts import \
141
+ get_retrieval_evaluation_dict
142
+ res = get_retrieval_evaluation_dict(
143
+ reference_contexts=json.loads(ref_step["output"]),
144
+ actual_contexts=json.loads(act_step["output"])
145
+ )
146
+ act_step.update(res)
120
147
  return eval_result
@@ -0,0 +1,62 @@
1
+ from langevals_ragas.response_context_recall import (
2
+ RagasResponseContextRecallEntry,
3
+ RagasResponseContextRecallEvaluator,
4
+ )
5
+ from langevals_ragas.response_context_precision import (
6
+ RagasResponseContextPrecisionEntry,
7
+ RagasResponseContextPrecisionEvaluator,
8
+ )
9
+
10
+ from graphrag_eval.util import get_f1_dict
11
+
12
+
13
+ def _evaluate(
14
+ evaluator: RagasResponseContextRecallEvaluator | RagasResponseContextPrecisionEvaluator,
15
+ entry: RagasResponseContextRecallEntry | RagasResponseContextPrecisionEntry,
16
+ metric: str
17
+ ) -> dict[str, float | str]:
18
+ try:
19
+ result = evaluator.evaluate(entry)
20
+ if result.status == "processed":
21
+ return {
22
+ f"retrieval_answer_{metric}": result.score,
23
+ f"retrieval_answer_{metric}_cost": result.cost.amount,
24
+ f"retrieval_answer_{metric}_reason": result.details
25
+ }
26
+ else:
27
+ return {
28
+ f"retrieval_answer_{metric}_error": result.details
29
+ }
30
+ except Exception as e:
31
+ return {
32
+ f"retrieval_answer_{metric}_error": str(e)
33
+ }
34
+
35
+
36
def get_retrieval_evaluation_dict(
    question_text: str,
    actual_contexts: list[dict[str, str]],
    reference_answer: str | None = None,
    actual_answer: str | None = None,
    model_name: str = "openai/gpt-4o-mini",
    max_tokens: int = 65_536
) -> dict:
    """Score retrieved contexts against the answer(s) to a question.

    Runs the response-context recall evaluator and then the response-context
    precision evaluator on the same entry, and adds the derived F1 entries.

    Args:
        question_text: The question, passed as the entry ``input``.
        actual_contexts: Retrieved contexts; each must have a ``"text"`` key.
        reference_answer: Passed as the entry ``expected_output``.
        actual_answer: Passed as the entry ``output``.
        model_name: Value of the ``"model"`` evaluator setting.
        max_tokens: Value of the ``"max_tokens"`` evaluator setting.

    Returns:
        The merged ``retrieval_answer_*`` entries, or an empty dict when
        neither a reference answer nor an actual answer is given.
    """
    if not (reference_answer or actual_answer):
        return {}

    settings_dict = dict(model=model_name, max_tokens=max_tokens)
    context_texts = [context["text"] for context in actual_contexts]
    # NOTE(review): the precision entry type is also handed to the recall
    # evaluator - presumably the two entry types are interchangeable; confirm.
    entry = RagasResponseContextPrecisionEntry(
        input=question_text,
        expected_output=reference_answer,
        output=actual_answer,
        contexts=context_texts,
    )

    result = {}
    evaluator_classes = (
        ("recall", RagasResponseContextRecallEvaluator),
        ("precision", RagasResponseContextPrecisionEvaluator),
    )
    for metric, evaluator_class in evaluator_classes:
        evaluator = evaluator_class(settings=settings_dict)
        result.update(_evaluate(evaluator, entry, metric))
    result.update(get_f1_dict(result, "retrieval_answer"))
    return result
@@ -0,0 +1,50 @@
1
+ from typing import Iterable
2
+
3
+
4
def recall_at_k(relevant_ids: list, retrieved_ids: list, k: int = 10) -> float:
    """
    Calculates Recall@k.

    Only the first ``k`` distinct relevant IDs are considered, so the score
    can reach 1.0 even when more than ``k`` relevant documents exist.
    Duplicate IDs in ``relevant_ids`` are counted once; previously they
    inflated the denominator and made a perfect retrieval score below 1.0.

    Args:
        relevant_ids (list): A list of ground truth relevant document IDs.
        retrieved_ids (list): A list of retrieved document IDs, ordered by rank.
        k (int): The cutoff applied to both lists.

    Returns:
        float: The Recall@k score, or 0.0 when no relevant ID is considered.
    """
    retrieved_at_k = set(retrieved_ids[:k])
    # dict.fromkeys removes duplicates while keeping the original order, so
    # the cutoff still selects the *first* k relevant IDs.
    relevant_at_k = list(dict.fromkeys(relevant_ids))[:k]
    if not relevant_at_k:
        return 0.0
    true_positives = sum(1 for doc_id in relevant_at_k if doc_id in retrieved_at_k)
    return true_positives / len(relevant_at_k)
23
+
24
+
25
def average_precision(relevant_ids: Iterable, retrieved_ids: Iterable) -> float:
    """
    Calculates Average Precision (AP) for a single query.

    Each relevant document is credited only the first time it appears in the
    ranking. A repeated occurrence still occupies a rank but earns no extra
    hit; previously duplicates were counted again and could push the score
    above 1.0.

    Args:
        relevant_ids (Iterable): A set of ground truth relevant document IDs.
        retrieved_ids (Iterable): A list of retrieved document IDs, ordered by rank.

    Returns:
        float: The Average Precision score, or 0.0 when there are no
        relevant documents.
    """
    relevant_set = set(relevant_ids)
    if not relevant_set:
        return 0.0

    credited = set()
    sum_of_precisions = 0.0
    for rank, doc_id in enumerate(retrieved_ids, start=1):
        if doc_id in relevant_set and doc_id not in credited:
            credited.add(doc_id)
            # Precision at this rank: distinct hits so far / documents seen.
            sum_of_precisions += len(credited) / rank

    return sum_of_precisions / len(relevant_set)
@@ -0,0 +1,59 @@
1
+ from langevals_ragas.context_precision import (
2
+ RagasContextPrecisionEntry,
3
+ RagasContextPrecisionEvaluator,
4
+ )
5
+ from langevals_ragas.context_recall import (
6
+ RagasContextRecallEntry,
7
+ RagasContextRecallEvaluator,
8
+ )
9
+
10
+ from graphrag_eval.util import get_f1_dict
11
+
12
+
13
+ def _evaluate(
14
+ entry: RagasContextRecallEntry | RagasContextPrecisionEntry,
15
+ evauator: RagasContextRecallEvaluator | RagasContextPrecisionEvaluator,
16
+ metric: str
17
+ ) -> dict:
18
+ try:
19
+ result = evauator.evaluate(entry)
20
+ if result.status == "processed":
21
+ result_dict = {
22
+ f"retrieval_context_{metric}": result.score,
23
+ }
24
+ if result.details:
25
+ result_dict[f"retrieval_context_{metric}_reason"] = result.details
26
+ if result.cost is not None:
27
+ result_dict[f"retrieval_context_{metric}_cost"] = result.cost.amount
28
+ return result_dict
29
+ else:
30
+ return {
31
+ f"retrieval_context_{metric}_error": result.details,
32
+ }
33
+ except Exception as e:
34
+ return {
35
+ f"retrieval_context_{metric}_error": str(e),
36
+ }
37
+
38
+
39
def get_retrieval_evaluation_dict(
    reference_contexts: list[dict[str, str]],
    actual_contexts: list[dict[str, str]],
    model_name: str = "openai/gpt-4o-mini",
    max_tokens: int = 65_536
) -> dict:
    """Score retrieved contexts against the reference contexts.

    Runs the context recall evaluator and then the context precision
    evaluator on the same entry, and adds the derived F1 entries.

    Args:
        reference_contexts: Expected contexts; each must have a ``"text"`` key.
        actual_contexts: Retrieved contexts; each must have a ``"text"`` key.
        model_name: Value of the ``"model"`` evaluator setting.
        max_tokens: Value of the ``"max_tokens"`` evaluator setting.

    Returns:
        The merged ``retrieval_context_*`` entries.
    """
    settings_dict = dict(model=model_name, max_tokens=max_tokens)
    expected_texts = [context["text"] for context in reference_contexts]
    actual_texts = [context["text"] for context in actual_contexts]
    # NOTE(review): the recall entry type is also handed to the precision
    # evaluator - presumably the two entry types are interchangeable; confirm.
    entry = RagasContextRecallEntry(
        expected_contexts=expected_texts,
        contexts=actual_texts,
    )

    result = {}
    evaluator_classes = (
        ("recall", RagasContextRecallEvaluator),
        ("precision", RagasContextPrecisionEvaluator),
    )
    for metric, evaluator_class in evaluator_classes:
        evaluator = evaluator_class(settings=settings_dict)
        result.update(_evaluate(entry, evaluator, metric))
    result.update(get_f1_dict(result, "retrieval_context"))
    return result
@@ -1,10 +1,31 @@
1
1
  from collections import Counter
2
+ import re
2
3
  from typing import Union
3
4
  import itertools
4
5
  import math
5
6
 
6
-
7
- def truncate(number, decimals=0):
7
# Datatype IRIs whose literal values parse_sparql_term() converts with int().
# Despite the name, only integer-family XSD types are listed here; the
# fractional types live in XSD_FLOAT_TYPES below.
XSD_NUMERIC_TYPES = {
    "http://www.w3.org/2001/XMLSchema#integer",
    "http://www.w3.org/2001/XMLSchema#int",
    "http://www.w3.org/2001/XMLSchema#long",
    "http://www.w3.org/2001/XMLSchema#short",
    "http://www.w3.org/2001/XMLSchema#byte",
    "http://www.w3.org/2001/XMLSchema#nonNegativeInteger",
    "http://www.w3.org/2001/XMLSchema#positiveInteger",
    "http://www.w3.org/2001/XMLSchema#unsignedLong",
    "http://www.w3.org/2001/XMLSchema#unsignedInt",
    "http://www.w3.org/2001/XMLSchema#unsignedShort",
    "http://www.w3.org/2001/XMLSchema#unsignedByte",
}
# Datatype IRIs whose literal values parse_sparql_term() converts with
# float() and then truncates to 5 decimal places.
XSD_FLOAT_TYPES = {
    "http://www.w3.org/2001/XMLSchema#decimal",
    "http://www.w3.org/2001/XMLSchema#double",
    "http://www.w3.org/2001/XMLSchema#float",
}
# Datatype IRI whose literal values parse_sparql_term() converts to bool.
XSD_BOOLEAN = "http://www.w3.org/2001/XMLSchema#boolean"
26
+
27
+
28
+ def truncate(number: float, decimals: int = 0) -> float:
8
29
  """
9
30
  Truncates a float to a certain number of decimal places.
10
31
  """
@@ -19,37 +40,92 @@ def truncate(number, decimals=0):
19
40
  return math.trunc(number * factor) / factor
20
41
 
21
42
 
43
def parse_sparql_term(term: dict) -> Union[str, int, float, bool, None]:
    """Convert one SPARQL JSON result term into a comparable Python value.

    Typed literals are converted according to their datatype: integer-family
    types to ``int``, decimal/double/float to a ``float`` truncated to 5
    decimal places, and booleans to ``bool``. Everything else (IRIs, blank
    nodes, plain or unknown-typed literals) is returned as the raw
    ``"value"``.

    Args:
        term: A binding term with ``"type"``, ``"value"`` and optionally
            ``"datatype"`` keys. A non-dict argument is returned unchanged.

    Returns:
        The converted value, or the raw value when the lexical form cannot
        be converted to the declared datatype.
    """
    if not isinstance(term, dict):
        return term

    value = term.get("value")
    if term.get("type") not in ("literal", "typed-literal"):
        return value

    datatype = term.get("datatype")
    if not datatype:
        return value

    if datatype in XSD_NUMERIC_TYPES:
        try:
            return int(value)
        except (ValueError, TypeError):
            return value

    if datatype in XSD_FLOAT_TYPES:
        try:
            number = float(value)
        except (ValueError, TypeError):
            return value
        try:
            return truncate(number, 5)
        except (ValueError, TypeError, OverflowError):
            # "NaN" and "INF"/"-INF" are valid lexical forms that float()
            # accepts but that cannot be truncated through an integer;
            # keep the parsed float instead of failing the comparison.
            return number

    if datatype == XSD_BOOLEAN:
        # A missing or non-string value has no lexical form to interpret;
        # return it untouched rather than raising on ``.lower()``.
        if isinstance(value, str):
            return value.lower() in ("true", "1")
        return value

    return value
72
+
73
+
22
74
  def get_var_to_values(
23
75
  vars_: list[str],
24
76
  bindings: list[dict],
25
77
  ) -> dict[str, list]:
26
- var_to_values = dict()
78
+ var_to_values = {}
27
79
  for var in vars_:
28
80
  var_to_values[var] = []
29
81
  for binding in bindings:
30
82
  if var in binding:
31
- var_to_values[var].append(binding[var]["value"])
83
+ var_to_values[var].append(parse_sparql_term(binding[var]))
32
84
  else:
33
85
  var_to_values[var].append(None)
34
86
  return dict(var_to_values)
35
87
 
36
88
 
37
- def parse_dict2table(
89
+ def convert_table_dict2lines(
38
90
  reference_vars: Union[list[str], tuple[str, ...]],
39
91
  reference_var_to_values: dict[str, list],
40
92
  ) -> list[str]:
93
+ """Converts a dictionary of lists (columns) into a list of row strings.
94
+
95
+ This function takes a dictionary where keys are column headers and values are
96
+ lists of column data. It transforms this column-oriented data into a list
97
+ of rows, where each row is a single string formed by concatenating the
98
+ string representation of its cell values.
99
+
100
+ It assumes that all lists in the `reference_var_to_values` dictionary
101
+ have the same length.
102
+
103
+ Args:
104
+ reference_vars: An ordered list or tuple of keys that defines the
105
+ column order for the output rows.
106
+ reference_var_to_values: A dictionary mapping column names (keys) to
107
+ lists of their corresponding values.
108
+
109
+ Returns:
110
+ A list of strings, where each string is a concatenation of the values
111
+ for a single row, ordered according to `reference_vars`.
112
+
113
+ Example:
114
+ >>> columns = ['name', 'age', 'city']
115
+ >>> data = {
116
+ ... 'name': ['Alice', 'Bob'],
117
+ ... 'age': [30, 25],
118
+ ... 'city': ['New York', 'Los Angeles']
119
+ ... }
120
+ >>> convert_table_dict2lines(columns, data)
121
+ ['Alice30New York', 'Bob25Los Angeles']
122
+ """
41
123
  result = []
42
124
  num_rows = len(reference_var_to_values[reference_vars[0]])
43
125
  for row_idx in range(num_rows):
44
126
  row = []
45
127
  for reference_var in reference_vars:
46
128
  val = reference_var_to_values[reference_var][row_idx]
47
- if isinstance(val, float):
48
- val = truncate(val, 5)
49
- if isinstance(val, int):
50
- print(val)
51
- val = float(val)
52
- print(str(val))
53
129
  val = str(val)
54
130
  row.append(val)
55
131
  result.append("".join(row))
@@ -64,8 +140,6 @@ def compare_values(
64
140
  results_are_ordered: bool,
65
141
  ) -> bool:
66
142
 
67
- if len(reference_vars) > len(actual_vars):
68
- return False
69
143
  if len(reference_vars) < len(actual_vars):
70
144
  for combination in itertools.combinations(actual_vars, len(reference_vars)):
71
145
  if compare_values(
@@ -78,9 +152,9 @@ def compare_values(
78
152
  return True
79
153
  return False
80
154
 
81
- table = parse_dict2table(reference_vars, reference_var_to_values)
155
+ table = convert_table_dict2lines(reference_vars, reference_var_to_values)
82
156
  for permutation in itertools.permutations(actual_vars):
83
- actual_table = parse_dict2table(permutation, actual_var_to_values)
157
+ actual_table = convert_table_dict2lines(permutation, actual_var_to_values)
84
158
  if (results_are_ordered and table == actual_table) or (
85
159
  not results_are_ordered and Counter(table) == Counter(actual_table)
86
160
  ):
@@ -0,0 +1,25 @@
1
+ def compute_f1(recall: float | str | None, precision: float | str | None) -> float | None:
2
+ if recall is None or precision is None:
3
+ return None
4
+ recall = float(recall)
5
+ precision = float(precision)
6
+ if recall == 0.0 and precision == 0.0:
7
+ return 0.0
8
+ return 2 * (recall * precision) / (recall + precision)
9
+
10
+
11
def get_f1_dict(
    input_dict: dict,
    prefix: str
) -> dict:
    """Derive the F1 entries for a metric family from its recall/precision.

    Args:
        input_dict: Mapping that may contain ``<prefix>_recall``,
            ``<prefix>_precision`` and their ``_cost`` counterparts.
        prefix: Metric family prefix, e.g. ``"retrieval_answer"``.

    Returns:
        An empty dict when recall or precision is missing. Otherwise
        ``<prefix>_f1`` and, only when both costs are present,
        ``<prefix>_f1_cost`` as the sum of the two costs.
    """
    f1 = compute_f1(
        input_dict.get(f"{prefix}_recall"),
        input_dict.get(f"{prefix}_precision"),
    )
    if f1 is None:
        return {}

    result = {f"{prefix}_f1": f1}
    costs = (
        input_dict.get(f"{prefix}_recall_cost"),
        input_dict.get(f"{prefix}_precision_cost"),
    )
    if all(cost is not None for cost in costs):
        result[f"{prefix}_f1_cost"] = costs[0] + costs[1]
    return result
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "graphrag-eval"
3
- version = "4.0.0"
3
+ version = "5.0.0"
4
4
  description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
5
5
  authors = [
6
6
  {name = "Neli Hateva", email = "neli.hateva@graphwise.ai"},
@@ -11,7 +11,7 @@ license = "Apache-2.0"
11
11
  requires-python = ">=3.12,<3.13"
12
12
 
13
13
  [project.urls]
14
- repository = "https://github.com/Ontotext-AD/qa-eval"
14
+ repository = "https://github.com/Ontotext-AD/graphrag-eval"
15
15
 
16
16
  [build-system]
17
17
  requires = ["poetry-core>=2.0.0"]
@@ -35,4 +35,4 @@ langevals-ragas = "^0.1.12"
35
35
  optional = true
36
36
 
37
37
  [project.scripts]
38
- answer-correctness = "qa_eval.answer_evaluation:main"
38
+ answer-correctness = "graphrag_eval.answer_correctness:main"
@@ -1,55 +0,0 @@
1
- from typing import Iterable
2
-
3
-
4
- def recall_at_k(relevant_docs: Iterable, retrieved_docs: list, k: int = 10) -> float:
5
- """
6
- Calculates Recall@k.
7
-
8
- Args:
9
- relevant_docs (Iterable): A set of ground truth relevant document IDs.
10
- retrieved_docs (list): A list of retrieved document IDs, ordered by rank.
11
- k (int): The cutoff for the retrieval list.
12
-
13
- Returns:
14
- float: The Recall@k score.
15
- """
16
- retrieved_at_k = retrieved_docs[:k]
17
-
18
- relevant_set = set(relevant_docs)
19
- retrieved_set = set(retrieved_at_k)
20
- true_positives = len(relevant_set.intersection(retrieved_set))
21
-
22
- total_relevant = len(relevant_set)
23
-
24
- if total_relevant == 0:
25
- return 0.0
26
-
27
- return true_positives / total_relevant
28
-
29
-
30
- def average_precision(relevant_docs: Iterable, retrieved_docs: Iterable) -> float:
31
- """
32
- Calculates Average Precision (AP) for a single query.
33
-
34
- Args:
35
- relevant_docs (Iterable): A set of ground truth relevant document IDs.
36
- retrieved_docs (Iterable): A list of retrieved document IDs, ordered by rank.
37
-
38
- Returns:
39
- float: The Average Precision score.
40
- """
41
- relevant_set = set(relevant_docs)
42
- hits = 0
43
- sum_of_precisions = 0.0
44
-
45
- for i, doc_id in enumerate(retrieved_docs):
46
- if doc_id in relevant_set:
47
- hits += 1
48
- precision_at_k = hits / (i + 1)
49
- sum_of_precisions += precision_at_k
50
-
51
- total_relevant = len(relevant_set)
52
- if total_relevant == 0:
53
- return 0.0
54
-
55
- return sum_of_precisions / total_relevant
File without changes