graphrag-eval 4.0.0__tar.gz → 5.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,21 @@
1
+ Metadata-Version: 2.3
2
+ Name: graphrag-eval
3
+ Version: 5.0.1
4
+ Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
5
+ License: Apache-2.0
6
+ Author: Neli Hateva
7
+ Author-email: neli.hateva@graphwise.ai
8
+ Requires-Python: >=3.12,<3.13
9
+ Classifier: License :: OSI Approved :: Apache Software License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Provides-Extra: openai
13
+ Requires-Dist: langevals (==0.1.*) ; extra == "openai"
14
+ Requires-Dist: langevals-ragas (>=0.1.12,<0.2.0) ; extra == "openai"
15
+ Requires-Dist: openai (>=1.97.0,<2.0.0) ; extra == "openai"
16
+ Project-URL: Repository, https://github.com/Ontotext-AD/graphrag-eval
17
+ Description-Content-Type: text/markdown
18
+
1
19
  <p align="center">
2
20
  <img alt="Graphwise Logo" src=".github/Graphwise_Logo.jpg">
3
21
  </p>
@@ -36,7 +54,7 @@ graphrag-eval = {version = "*", extras = ["openai"]}
36
54
  ## Maintainers
37
55
 
38
56
  Developed and maintained by [Graphwise](https://graphwise.ai/).
39
- For issues or feature requests, please open [a GitHub issue](https://github.com/Ontotext-AD/qa-eval/issues).
57
+ For issues or feature requests, please open [a GitHub issue](https://github.com/Ontotext-AD/graphrag-eval/issues).
40
58
 
41
59
  ## Command Line Use
42
60
 
@@ -77,13 +95,14 @@ A reference corpus is a list of templates, each of which contains:
77
95
  - `question_text`: The natural language query passed to the LLM
78
96
  - `reference_steps`: (optional) A list of expected steps grouped by expected order of execution, where all steps in a group can be executed in any order relative to each other, but after all steps in the previous group and before all steps in the next group.
79
97
  - `reference_answer`: (optional) The expected answer to the question
98
+
80
99
  The assumption is that the final answer to the question is derived from the outputs of the steps that are executed last (the last level).
81
100
 
82
101
  Each step includes:
83
102
 
84
103
  - `name`: The type of step being performed (e.g., `sparql_query`)
85
104
  - `args`: Arguments of the step (e.g., arguments to a tool used in the step, such as a SPARQL query)
86
- - `output`: The expected output from the step
105
+ - `output`: The expected output from the step.
87
106
  - `output_media_type`: (optional, missing or one of `application/sparql-results+json`, `application/json`) Indicates how the output of a step must be processed
88
107
  - `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
89
108
  - `required_columns`: (optional) - required only for SPARQL query results; list of binding names, which are required for SPARQL query results to match
@@ -99,7 +118,22 @@ The example corpus below illustrates a minimal but realistic Q&A dataset, showin
99
118
  question_text: List all transformers within Substation OSLO
100
119
  reference_answer: OSLO T1, OSLO T2
101
120
  reference_steps:
102
- - - name: sparql_query
121
+ - - name: retrieval
122
+ args:
123
+ query: transformers Substation OSLO
124
+ k: 2
125
+ output: |-
126
+ [
127
+ {
128
+ "id": "http://example.com/resource/doc/1",
129
+ "text": "Transformer OSLO T1 is in Substation Oslo."
130
+ },
131
+ {
132
+ "id": "http://example.com/resource/doc/2",
133
+ "text": "Transformer OSLO T2 is in Substation Oslo."
134
+ }
135
+ ]
136
+ - name: sparql_query
103
137
  args:
104
138
  query: |2
105
139
 
@@ -253,6 +287,16 @@ Below is an example response from the question-answering system for a single que
253
287
  "total_tokens": 298753,
254
288
  "elapsed_sec": 46.48961806297302,
255
289
  "actual_steps": [
290
+ {
291
+ "name": "retrieval",
292
+ "args": {
293
+ "query": "transformers Substation OSLO",
294
+ "k": 2
295
+ },
296
+ "id": "call_3",
297
+ "status": "success",
298
+ "output": "[\n {\n \"id\": \"http://example.com/resource/doc/1\",\n \"text\": \"Transformer OSLO T1 is in Substation Oslo.\"\n },\n {\n \"id\": \"http://example.com/resource/doc/2\",\n \"text\": \"Transformer OSLO T2 is in Substation Oslo.\"\n }\n]"
299
+ },
256
300
  {
257
301
  "name": "autocomplete_search",
258
302
  "args": {
@@ -323,7 +367,23 @@ The output is a list of statistics for each question from the reference Q&A data
323
367
  question_text: List all transformers within Substation OSLO
324
368
  reference_answer: OSLO T1, OSLO T2
325
369
  reference_steps:
326
- - - name: sparql_query
370
+ - - name: retrieval
371
+ args:
372
+ query: transformers Substation OSLO
373
+ k: 2
374
+ matches: call_3
375
+ output: |-
376
+ [
377
+ {
378
+ "id": "http://example.com/resource/doc/1",
379
+ "text": "Transformer OSLO T1 is in Substation Oslo."
380
+ },
381
+ {
382
+ "id": "http://example.com/resource/doc/2",
383
+ "text": "Transformer OSLO T2 is in Substation Oslo."
384
+ }
385
+ ]
386
+ - name: sparql_query
327
387
  args:
328
388
  query: |2
329
389
 
@@ -364,6 +424,31 @@ The output is a list of statistics for each question from the reference Q&A data
364
424
  answer_relevance: 0.9
365
425
  answer_relevance_cost: 0.0007
366
426
  actual_steps:
427
+ - name: retrieval
428
+ id: call_3
429
+ args:
430
+ query: transformers Substation OSLO
431
+ k: 2
432
+ status: success
433
+ output: |-
434
+ [
435
+ {
436
+ "id": "http://example.com/resource/doc/1",
437
+ "text": "Transformer OSLO T1 is in Substation Oslo."
438
+ },
439
+ {
440
+ "id": "http://example.com/resource/doc/2",
441
+ "text": "Transformer OSLO T2 is in Substation Oslo."
442
+ }
443
+ ]
444
+ retrieval_answer_recall: 1.0
445
+ retrieval_answer_recall_reason: The context contains all the transformers listed in the reference answer
446
+ retrieval_answer_recall_cost: 0.0007
447
+ retrieval_answer_precision: 1.0
448
+ retrieval_answer_precision_reason: The context contains only transformers listed in the reference answer
449
+ retrieval_answer_precision_cost: 0.0003
450
+ retrieval_answer_f1: 1.0
451
+ retrieval_answer_f1_cost: 0.001
367
452
  - name: autocomplete_search
368
453
  args:
369
454
  query: OSLO
@@ -470,12 +555,33 @@ The output is a list of statistics for each question from the reference Q&A data
470
555
  - `answer_relevance_error`: (optional) error message if answer relevance evaluation failed
471
556
  - `answer_relevance_cost`: The LLM use cost of computing `answer_relevance`, in US dollars
472
557
  - `actual_steps`: (optional) copy of the steps in the evaluation target, if specified there
473
- - `steps_score`: a real number between 0 and 1, computed by comparing the results of the last steps that were executed to the reference's last group of steps. If there is no match in the actual steps, then the score is `0`. Otherwise, it is calculated as the number of the matched steps on the last group divided by the total number of steps in the last group.
558
+ - `steps_score`: a real number between 0 and 1, computed by comparing the results of the last executed steps to the output of the reference's last group of steps.
559
+ - If there is no match in the actual steps, then the score is `0.0`
560
+ - If the executed step's name is "retrieval" and the last reference group contains a retrieval step, then the score is the [recall at k](#context-recallk) of the retrieved document ids with respect to the reference.
561
+ - Otherwise, the score is the number of the matched steps on the last group divided by the total number of steps in the last group.
474
562
  - `input_tokens`: input tokens usage
475
563
  - `output_tokens`: output tokens usage
476
564
  - `total_tokens`: total tokens usage
477
565
  - `elapsed_sec`: elapsed seconds
478
566
 
567
+ All `actual_steps` with `name` "retrieval" contain:
568
+ - `retrieval_answer_recall`: (optional) recall of the retrieved context with respect to the reference answer, if evaluation succeeds
569
+ - `retrieval_answer_recall_reason`: (optional) LLM reasoning in evaluating `retrieval_answer_recall`
570
+ - `retrieval_answer_recall_error`: (optional) error message if `retrieval_answer_recall` evaluation fails
571
+ - `retrieval_answer_recall_cost`: cost of evaluating `retrieval_answer_recall`, in US dollars
572
+ - `retrieval_answer_precision`: (optional) precision of the retrieved context with respect to the reference answer, if evaluation succeeds
573
+ - `retrieval_answer_precision_reason`: (optional) LLM reasoning in evaluating `retrieval_answer_precision`
574
+ - `retrieval_answer_precision_error`: (optional) error message if `retrieval_answer_precision` evaluation fails
575
+ - `retrieval_answer_precision_cost`: cost of evaluating `retrieval_answer_precision`, in US dollars
576
+ - `retrieval_answer_f1`: (optional) F1 score of the retrieved context with respect to the reference answer, if `retrieval_answer_recall` and `retrieval_answer_precision` succeed
577
+ - `retrieval_answer_f1_cost`: The sum of `retrieval_answer_recall_cost` and `retrieval_answer_precision_cost`
578
+ - `retrieval_context_recall`: (optional) recall of the retrieved context with respect to the reference answer, if evaluation succeeds
579
+ - `retrieval_context_recall_error`: (optional) error message if `retrieval_context_recall` evaluation fails
580
+ - `retrieval_context_precision`: (optional) precision of the retrieved context with respect to the reference answer, if evaluation succeeds
581
+ - `retrieval_context_precision_error`: (optional) error message if `retrieval_context_precision` evaluation fails
582
+ - `retrieval_context_f1`: (optional) F1 score of the retrieved context with respect to the reference answer, if `retrieval_context_recall` and `retrieval_context_precision` succeed
583
+
584
+
479
585
  #### Aggregates Keys
480
586
 
481
587
  The `aggregates` object provides aggregated evaluation metrics.
@@ -499,6 +605,9 @@ Aggregates are:
499
605
  - `once_per_sample`: how many times each step was executed, counted only once per question
500
606
  - `empty_results`: how many times the step was executed and returned empty results
501
607
  - `errors`: how many times the step was executed and resulted in error
608
+ - `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` for all successful questions in this template
609
+ - `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` for all successful questions in this template
610
+ - `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` for all successful questions in this template
502
611
  - `micro`: statistics across questions, regardless of template. It includes:
503
612
  - `number_of_error_samples`: total number of questions, which resulted in error response
504
613
  - `number_of_success_samples`: total number of questions, which resulted in successful response
@@ -511,6 +620,9 @@ Aggregates are:
511
620
  - `answer_f1`: `sum`, `mean`, `median`, `min` and `max` for `answer_f1` of all successful questions
512
621
  - `answer_relevance`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance` of all successful questions
513
622
  - `answer_relevance_cost`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance_cost` of all successful questions
623
+ - `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` of all successful questions
624
+ - `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` of all successful questions
625
+ - `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` of all successful questions
514
626
  - `steps_score`: `sum`, `mean`, `median`, `min` and `max` for `steps_score` of all successful questions
515
627
  - `macro`: averages across templates, i.e., the mean of each metric per template, averaged. It includes:
516
628
  - `input_tokens`: `mean` for `input_tokens`
@@ -522,6 +634,9 @@ Aggregates are:
522
634
  - `answer_f1`: `mean` for `answer_f1`
523
635
  - `answer_relevance`: `mean` for `answer_relevance`
524
636
  - `answer_relevance_cost`: `mean` for `answer_relevance_cost`
637
+ - `retrieval_context_recall`: `mean` for `retrieval_context_recall`
638
+ - `retrieval_context_precision`: `mean` for `retrieval_context_precision`
639
+ - `retrieval_context_f1`: `mean` for `retrieval_context_f1`
525
640
  - `steps_score`: `mean` for `steps_score`
526
641
 
527
642
  #### Example Aggregates
@@ -898,18 +1013,30 @@ macro:
898
1013
  mean: 25.911653497483996
899
1014
  ```
900
1015
 
1016
+ ### SPARQL queries comparison
1017
+
1018
+ The algorithm iterates over all subsets of columns in the actual result of the same size as in the reference result.
1019
+ For each subset, it compares the set of columns (skipping optional columns).
1020
+ It matches floating-point numbers up to a 1e-8 precision. It does not do this for special types such as duration.
1021
+
1022
+ The average time complexity is O(nr\*nc_ref!\*binomial(nc_act, nc_ref)), where
1023
+
1024
+ * *nr* is the number of rows in the actual result
1025
+ * *nc_ref* is the number of columns in the reference result
1026
+ * *nc_act* is the number of columns in the actual result
1027
+
901
1028
  ### Retrieval Evaluation
902
1029
 
903
- The following metrics are based on the ids of retrieved documents.
1030
+ The following metrics are based on the content of retrieved documents.
904
1031
 
905
- #### Recall@k Metric
1032
+ #### Context Recall@k
906
1033
 
907
1034
  The fraction of all relevant items that appear among the top *k* recommendations. It answers the question: "Of all items the user cares about, how many did we include in the first k spots?"
908
1035
  * **Formula**:
909
1036
  $`
910
1037
  \frac{\text{Number of relevant items in top k}}{\text{Number of relevant items}}
911
1038
  `$
912
- * **Calculation**: Count the number of relevant items in the top `k` retrieved results; divide that by the *total* number of relevant items.
1039
+ * **Calculation**: Count the number of relevant items in the top `k` retrieved results; divide that count by the number of relevant items (counting at most the first `k` of them).
913
1040
  * **Example**: Suppose there are 4 relevant documents for a given query. Suppose our system retrieves 3 of them in the top 5 results (`k=5`). Recall@5 is `3 / 4 = 0.75`.
914
1041
 
915
1042
  ```python
@@ -920,7 +1047,7 @@ recall_at_k(
920
1047
  ) # => 0.75
921
1048
  ```
922
1049
 
923
- #### Average Precision (AP) Metric
1050
+ #### Context Precision@k
924
1051
 
925
1052
  Evaluates a ranked list of recommendations by looking at the precision at the position of each correctly retrieved item. It rewards systems for placing relevant items higher up in the list. It's more sophisticated than just looking at precision at a single cutoff because it considers the entire ranking.
926
1053
  * **Formula**:
@@ -950,3 +1077,4 @@ average_precision(
950
1077
  retrieved_docs=[1, 4, 3, 5, 7]
951
1078
  ) # ~=> 0.8056
952
1079
  ```
1080
+
@@ -1,17 +1,3 @@
1
- Metadata-Version: 2.3
2
- Name: graphrag-eval
3
- Version: 4.0.0
4
- Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
5
- License: Apache-2.0
6
- Author: Neli Hateva
7
- Author-email: neli.hateva@graphwise.ai
8
- Requires-Python: >=3.12,<3.13
9
- Classifier: License :: OSI Approved :: Apache Software License
10
- Classifier: Programming Language :: Python :: 3
11
- Classifier: Programming Language :: Python :: 3.12
12
- Project-URL: Repository, https://github.com/Ontotext-AD/qa-eval
13
- Description-Content-Type: text/markdown
14
-
15
1
  <p align="center">
16
2
  <img alt="Graphwise Logo" src=".github/Graphwise_Logo.jpg">
17
3
  </p>
@@ -50,7 +36,7 @@ graphrag-eval = {version = "*", extras = ["openai"]}
50
36
  ## Maintainers
51
37
 
52
38
  Developed and maintained by [Graphwise](https://graphwise.ai/).
53
- For issues or feature requests, please open [a GitHub issue](https://github.com/Ontotext-AD/qa-eval/issues).
39
+ For issues or feature requests, please open [a GitHub issue](https://github.com/Ontotext-AD/graphrag-eval/issues).
54
40
 
55
41
  ## Command Line Use
56
42
 
@@ -91,13 +77,14 @@ A reference corpus is a list of templates, each of which contains:
91
77
  - `question_text`: The natural language query passed to the LLM
92
78
  - `reference_steps`: (optional) A list of expected steps grouped by expected order of execution, where all steps in a group can be executed in any order relative to each other, but after all steps in the previous group and before all steps in the next group.
93
79
  - `reference_answer`: (optional) The expected answer to the question
80
+
94
81
  The assumption is that the final answer to the question is derived from the outputs of the steps that are executed last (the last level).
95
82
 
96
83
  Each step includes:
97
84
 
98
85
  - `name`: The type of step being performed (e.g., `sparql_query`)
99
86
  - `args`: Arguments of the step (e.g., arguments to a tool used in the step, such as a SPARQL query)
100
- - `output`: The expected output from the step
87
+ - `output`: The expected output from the step.
101
88
  - `output_media_type`: (optional, missing or one of `application/sparql-results+json`, `application/json`) Indicates how the output of a step must be processed
102
89
  - `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
103
90
  - `required_columns`: (optional) - required only for SPARQL query results; list of binding names, which are required for SPARQL query results to match
@@ -113,7 +100,22 @@ The example corpus below illustrates a minimal but realistic Q&A dataset, showin
113
100
  question_text: List all transformers within Substation OSLO
114
101
  reference_answer: OSLO T1, OSLO T2
115
102
  reference_steps:
116
- - - name: sparql_query
103
+ - - name: retrieval
104
+ args:
105
+ query: transformers Substation OSLO
106
+ k: 2
107
+ output: |-
108
+ [
109
+ {
110
+ "id": "http://example.com/resource/doc/1",
111
+ "text": "Transformer OSLO T1 is in Substation Oslo."
112
+ },
113
+ {
114
+ "id": "http://example.com/resource/doc/2",
115
+ "text": "Transformer OSLO T2 is in Substation Oslo."
116
+ }
117
+ ]
118
+ - name: sparql_query
117
119
  args:
118
120
  query: |2
119
121
 
@@ -267,6 +269,16 @@ Below is an example response from the question-answering system for a single que
267
269
  "total_tokens": 298753,
268
270
  "elapsed_sec": 46.48961806297302,
269
271
  "actual_steps": [
272
+ {
273
+ "name": "retrieval",
274
+ "args": {
275
+ "query": "transformers Substation OSLO",
276
+ "k": 2
277
+ },
278
+ "id": "call_3",
279
+ "status": "success",
280
+ "output": "[\n {\n \"id\": \"http://example.com/resource/doc/1\",\n \"text\": \"Transformer OSLO T1 is in Substation Oslo.\"\n },\n {\n \"id\": \"http://example.com/resource/doc/2\",\n \"text\": \"Transformer OSLO T2 is in Substation Oslo.\"\n }\n]"
281
+ },
270
282
  {
271
283
  "name": "autocomplete_search",
272
284
  "args": {
@@ -337,7 +349,23 @@ The output is a list of statistics for each question from the reference Q&A data
337
349
  question_text: List all transformers within Substation OSLO
338
350
  reference_answer: OSLO T1, OSLO T2
339
351
  reference_steps:
340
- - - name: sparql_query
352
+ - - name: retrieval
353
+ args:
354
+ query: transformers Substation OSLO
355
+ k: 2
356
+ matches: call_3
357
+ output: |-
358
+ [
359
+ {
360
+ "id": "http://example.com/resource/doc/1",
361
+ "text": "Transformer OSLO T1 is in Substation Oslo."
362
+ },
363
+ {
364
+ "id": "http://example.com/resource/doc/2",
365
+ "text": "Transformer OSLO T2 is in Substation Oslo."
366
+ }
367
+ ]
368
+ - name: sparql_query
341
369
  args:
342
370
  query: |2
343
371
 
@@ -378,6 +406,31 @@ The output is a list of statistics for each question from the reference Q&A data
378
406
  answer_relevance: 0.9
379
407
  answer_relevance_cost: 0.0007
380
408
  actual_steps:
409
+ - name: retrieval
410
+ id: call_3
411
+ args:
412
+ query: transformers Substation OSLO
413
+ k: 2
414
+ status: success
415
+ output: |-
416
+ [
417
+ {
418
+ "id": "http://example.com/resource/doc/1",
419
+ "text": "Transformer OSLO T1 is in Substation Oslo."
420
+ },
421
+ {
422
+ "id": "http://example.com/resource/doc/2",
423
+ "text": "Transformer OSLO T2 is in Substation Oslo."
424
+ }
425
+ ]
426
+ retrieval_answer_recall: 1.0
427
+ retrieval_answer_recall_reason: The context contains all the transformers listed in the reference answer
428
+ retrieval_answer_recall_cost: 0.0007
429
+ retrieval_answer_precision: 1.0
430
+ retrieval_answer_precision_reason: The context contains only transformers listed in the reference answer
431
+ retrieval_answer_precision_cost: 0.0003
432
+ retrieval_answer_f1: 1.0
433
+ retrieval_answer_f1_cost: 0.001
381
434
  - name: autocomplete_search
382
435
  args:
383
436
  query: OSLO
@@ -484,12 +537,33 @@ The output is a list of statistics for each question from the reference Q&A data
484
537
  - `answer_relevance_error`: (optional) error message if answer relevance evaluation failed
485
538
  - `answer_relevance_cost`: The LLM use cost of computing `answer_relevance`, in US dollars
486
539
  - `actual_steps`: (optional) copy of the steps in the evaluation target, if specified there
487
- - `steps_score`: a real number between 0 and 1, computed by comparing the results of the last steps that were executed to the reference's last group of steps. If there is no match in the actual steps, then the score is `0`. Otherwise, it is calculated as the number of the matched steps on the last group divided by the total number of steps in the last group.
540
+ - `steps_score`: a real number between 0 and 1, computed by comparing the results of the last executed steps to the output of the reference's last group of steps.
541
+ - If there is no match in the actual steps, then the score is `0.0`
542
+ - If the executed step's name is "retrieval" and the last reference group contains a retrieval step, then the score is the [recall at k](#context-recallk) of the retrieved document ids with respect to the reference.
543
+ - Otherwise, the score is the number of the matched steps on the last group divided by the total number of steps in the last group.
488
544
  - `input_tokens`: input tokens usage
489
545
  - `output_tokens`: output tokens usage
490
546
  - `total_tokens`: total tokens usage
491
547
  - `elapsed_sec`: elapsed seconds
492
548
 
549
+ All `actual_steps` with `name` "retrieval" contain:
550
+ - `retrieval_answer_recall`: (optional) recall of the retrieved context with respect to the reference answer, if evaluation succeeds
551
+ - `retrieval_answer_recall_reason`: (optional) LLM reasoning in evaluating `retrieval_answer_recall`
552
+ - `retrieval_answer_recall_error`: (optional) error message if `retrieval_answer_recall` evaluation fails
553
+ - `retrieval_answer_recall_cost`: cost of evaluating `retrieval_answer_recall`, in US dollars
554
+ - `retrieval_answer_precision`: (optional) precision of the retrieved context with respect to the reference answer, if evaluation succeeds
555
+ - `retrieval_answer_precision_reason`: (optional) LLM reasoning in evaluating `retrieval_answer_precision`
556
+ - `retrieval_answer_precision_error`: (optional) error message if `retrieval_answer_precision` evaluation fails
557
+ - `retrieval_answer_precision_cost`: cost of evaluating `retrieval_answer_precision`, in US dollars
558
+ - `retrieval_answer_f1`: (optional) F1 score of the retrieved context with respect to the reference answer, if `retrieval_answer_recall` and `retrieval_answer_precision` succeed
559
+ - `retrieval_answer_f1_cost`: The sum of `retrieval_answer_recall_cost` and `retrieval_answer_precision_cost`
560
+ - `retrieval_context_recall`: (optional) recall of the retrieved context with respect to the reference answer, if evaluation succeeds
561
+ - `retrieval_context_recall_error`: (optional) error message if `retrieval_context_recall` evaluation fails
562
+ - `retrieval_context_precision`: (optional) precision of the retrieved context with respect to the reference answer, if evaluation succeeds
563
+ - `retrieval_context_precision_error`: (optional) error message if `retrieval_context_precision` evaluation fails
564
+ - `retrieval_context_f1`: (optional) F1 score of the retrieved context with respect to the reference answer, if `retrieval_context_recall` and `retrieval_context_precision` succeed
565
+
566
+
493
567
  #### Aggregates Keys
494
568
 
495
569
  The `aggregates` object provides aggregated evaluation metrics.
@@ -513,6 +587,9 @@ Aggregates are:
513
587
  - `once_per_sample`: how many times each step was executed, counted only once per question
514
588
  - `empty_results`: how many times the step was executed and returned empty results
515
589
  - `errors`: how many times the step was executed and resulted in error
590
+ - `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` for all successful questions in this template
591
+ - `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` for all successful questions in this template
592
+ - `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` for all successful questions in this template
516
593
  - `micro`: statistics across questions, regardless of template. It includes:
517
594
  - `number_of_error_samples`: total number of questions, which resulted in error response
518
595
  - `number_of_success_samples`: total number of questions, which resulted in successful response
@@ -525,6 +602,9 @@ Aggregates are:
525
602
  - `answer_f1`: `sum`, `mean`, `median`, `min` and `max` for `answer_f1` of all successful questions
526
603
  - `answer_relevance`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance` of all successful questions
527
604
  - `answer_relevance_cost`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance_cost` of all successful questions
605
+ - `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` of all successful questions
606
+ - `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` of all successful questions
607
+ - `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` of all successful questions
528
608
  - `steps_score`: `sum`, `mean`, `median`, `min` and `max` for `steps_score` of all successful questions
529
609
  - `macro`: averages across templates, i.e., the mean of each metric per template, averaged. It includes:
530
610
  - `input_tokens`: `mean` for `input_tokens`
@@ -536,6 +616,9 @@ Aggregates are:
536
616
  - `answer_f1`: `mean` for `answer_f1`
537
617
  - `answer_relevance`: `mean` for `answer_relevance`
538
618
  - `answer_relevance_cost`: `mean` for `answer_relevance_cost`
619
+ - `retrieval_context_recall`: `mean` for `retrieval_context_recall`
620
+ - `retrieval_context_precision`: `mean` for `retrieval_context_precision`
621
+ - `retrieval_context_f1`: `mean` for `retrieval_context_f1`
539
622
  - `steps_score`: `mean` for `steps_score`
540
623
 
541
624
  #### Example Aggregates
@@ -912,18 +995,30 @@ macro:
912
995
  mean: 25.911653497483996
913
996
  ```
914
997
 
998
+ ### SPARQL queries comparison
999
+
1000
+ The algorithm iterates over all subsets of columns in the actual result of the same size as in the reference result.
1001
+ For each subset, it compares the set of columns (skipping optional columns).
1002
+ It matches floating-point numbers up to a 1e-8 precision. It does not do this for special types such as duration.
1003
+
1004
+ The average time complexity is O(nr\*nc_ref!\*binomial(nc_act, nc_ref)), where
1005
+
1006
+ * *nr* is the number of rows in the actual result
1007
+ * *nc_ref* is the number of columns in the reference result
1008
+ * *nc_act* is the number of columns in the actual result
1009
+
915
1010
  ### Retrieval Evaluation
916
1011
 
917
- The following metrics are based on the ids of retrieved documents.
1012
+ The following metrics are based on the content of retrieved documents.
918
1013
 
919
- #### Recall@k Metric
1014
+ #### Context Recall@k
920
1015
 
921
1016
  The fraction of all relevant items that appear among the top *k* recommendations. It answers the question: "Of all items the user cares about, how many did we include in the first k spots?"
922
1017
  * **Formula**:
923
1018
  $`
924
1019
  \frac{\text{Number of relevant items in top k}}{\text{Number of relevant items}}
925
1020
  `$
926
- * **Calculation**: Count the number of relevant items in the top `k` retrieved results; divide that by the *total* number of relevant items.
1021
+ * **Calculation**: Count the number of relevant items in the top `k` retrieved results; divide that count by the number of relevant items (counting at most the first `k` of them).
927
1022
  * **Example**: Suppose there are 4 relevant documents for a given query. Suppose our system retrieves 3 of them in the top 5 results (`k=5`). Recall@5 is `3 / 4 = 0.75`.
928
1023
 
929
1024
  ```python
@@ -934,7 +1029,7 @@ recall_at_k(
934
1029
  ) # => 0.75
935
1030
  ```
936
1031
 
937
- #### Average Precision (AP) Metric
1032
+ #### Context Precision@k
938
1033
 
939
1034
  Evaluates a ranked list of recommendations by looking at the precision at the position of each correctly retrieved item. It rewards systems for placing relevant items higher up in the list. It's more sophisticated than just looking at precision at a single cutoff because it considers the entire ranking.
940
1035
  * **Formula**:
@@ -964,4 +1059,3 @@ average_precision(
964
1059
  retrieved_docs=[1, 4, 3, 5, 7]
965
1060
  ) # ~=> 0.8056
966
1061
  ```
967
-
@@ -16,7 +16,22 @@ METRICS = [
16
16
  "total_tokens",
17
17
  "elapsed_sec"
18
18
  ]
19
-
19
+ STEPS_METRICS = {
20
+ "retrieval": [
21
+ "retrieval_answer_precision",
22
+ "retrieval_answer_precision_cost",
23
+ "retrieval_answer_recall",
24
+ "retrieval_answer_recall_cost",
25
+ "retrieval_answer_f1",
26
+ "retrieval_answer_f1_cost",
27
+ "retrieval_context_precision",
28
+ "retrieval_context_precision_cost",
29
+ "retrieval_context_recall",
30
+ "retrieval_context_recall_cost",
31
+ "retrieval_context_f1",
32
+ "retrieval_context_f1_cost",
33
+ ]
34
+ }
20
35
  PROTECTED_METRICS = [
21
36
  "input_tokens",
22
37
  "output_tokens",
@@ -35,6 +50,19 @@ def stats_for_series(values: Iterable[int | float]) -> dict[str, float]:
35
50
  }
36
51
 
37
52
 
53
+ def update_step_metrics_per_template(
54
+ sample: dict,
55
+ step_metrics_per_template: dict,
56
+ template_id: str
57
+ ):
58
+ for step in sample.get("actual_steps", []):
59
+ if step["name"] in STEPS_METRICS:
60
+ for metric in STEPS_METRICS[step["name"]]:
61
+ value = step.get(metric)
62
+ if value is not None:
63
+ step_metrics_per_template[template_id][metric].append(value)
64
+
65
+
38
66
  def update_stats_per_template(
39
67
  sample: dict,
40
68
  stats_per_template: dict,
@@ -76,6 +104,7 @@ def compute_aggregates(samples: list[dict]) -> dict:
76
104
  number_of_samples_per_template_by_status = defaultdict(lambda: defaultdict(int))
77
105
  stats_per_template = defaultdict(lambda: defaultdict(list))
78
106
  steps_summary_per_template = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
107
+ step_metrics_per_template = defaultdict(lambda: defaultdict(list))
79
108
 
80
109
  # Compute per-template stats
81
110
  templates_ids = set()
@@ -94,6 +123,11 @@ def compute_aggregates(samples: list[dict]) -> dict:
94
123
  steps_summary_per_template,
95
124
  template_id
96
125
  )
126
+ update_step_metrics_per_template(
127
+ sample,
128
+ step_metrics_per_template,
129
+ template_id
130
+ )
97
131
 
98
132
  summary = {"per_template": {}}
99
133
 
@@ -115,6 +149,13 @@ def compute_aggregates(samples: list[dict]) -> dict:
115
149
  if series or metric in PROTECTED_METRICS:
116
150
  template_summary[metric] = stats_for_series(series)
117
151
 
152
+ # Add step metrics for the template
153
+ template_step_metrics = {}
154
+ for metric, values in step_metrics_per_template[template_id].items():
155
+ template_step_metrics[metric] = stats_for_series(values)
156
+ if template_step_metrics:
157
+ template_summary["steps"].update(template_step_metrics)
158
+
118
159
  summary["per_template"][template_id] = template_summary
119
160
 
120
161
  # Add micro stats
@@ -137,6 +178,17 @@ def compute_aggregates(samples: list[dict]) -> dict:
137
178
  if series or metric in PROTECTED_METRICS:
138
179
  summary["micro"][metric] = stats_for_series(series)
139
180
 
181
+ # Add micro step metrics
182
+ micro_step_metrics = defaultdict(list)
183
+ for template_metrics in step_metrics_per_template.values():
184
+ for metric, values in template_metrics.items():
185
+ micro_step_metrics[metric].extend(values)
186
+ step_metrics = {
187
+ metric: stats_for_series(values)
188
+ for metric, values in micro_step_metrics.items()
189
+ }
190
+ summary["micro"].update(step_metrics)
191
+
140
192
  # Add macro stats
141
193
  summary["macro"] = {}
142
194
  for metric in METRICS:
@@ -148,4 +200,17 @@ def compute_aggregates(samples: list[dict]) -> dict:
148
200
  if means or metric in PROTECTED_METRICS:
149
201
  summary["macro"][metric] = {"mean": mean(means) if means else 0}
150
202
 
203
+ # Add macro step metrics
204
+ macro_step_metrics = defaultdict(list)
205
+ for template_id, template_summary in summary["per_template"].items():
206
+ if "steps" in template_summary:
207
+ for metric, stats in template_summary["steps"].items():
208
+ if "mean" in stats:
209
+ macro_step_metrics[metric].append(stats["mean"])
210
+ step_metrics = {
211
+ metric: {"mean": mean(values) if values else 0}
212
+ for metric, values in macro_step_metrics.items()
213
+ }
214
+ summary["macro"].update(step_metrics)
215
+
151
216
  return summary
@@ -48,7 +48,7 @@ def run_evaluation(
48
48
  actual_result,
49
49
  )
50
50
  )
51
- if "steps" in actual_result:
51
+ if "actual_steps" in actual_result:
52
52
  eval_result.update(
53
53
  get_steps_evaluation_result_dict(question, actual_result)
54
54
  )
@@ -1,13 +1,14 @@
1
1
  import json
2
2
  from collections import defaultdict
3
3
 
4
- from .retrieval import recall_at_k
4
+ from .retrieval_context_ids import recall_at_k
5
5
  from .sparql import compare_sparql_results
6
6
 
7
7
 
8
8
  def compare_steps_outputs(reference: dict, actual: dict) -> float:
9
- ref_output = reference["output"]
9
+ ref_output = reference.get("output")
10
10
  act_output = actual["output"]
11
+ assert ref_output, "Reference step output is mandatory"
11
12
  if reference.get("output_media_type") == "application/sparql-results+json":
12
13
  return compare_sparql_results(
13
14
  json.loads(ref_output),
@@ -17,9 +18,11 @@ def compare_steps_outputs(reference: dict, actual: dict) -> float:
17
18
  )
18
19
  if reference.get("output_media_type") == "application/json":
19
20
  return float(json.loads(ref_output) == json.loads(act_output))
20
- if reference["name"] == "retrieval":
21
- k = reference["args"]["k"]
22
- return recall_at_k(ref_output, act_output, k)
21
+ if reference["name"] == actual["name"] == "retrieval":
22
+ ref_contexts_ids = [c["id"] for c in json.loads(ref_output)]
23
+ act_contexts_ids = [c["id"] for c in json.loads(act_output)]
24
+ k = actual["args"]["k"]
25
+ return recall_at_k(ref_contexts_ids, act_contexts_ids, k)
23
26
  return float(ref_output == act_output)
24
27
 
25
28
 
@@ -95,9 +98,11 @@ def get_steps_matches(
95
98
 
96
99
  def evaluate_steps(
97
100
  reference_steps_groups: list[list[dict]],
98
- actual_steps: list[dict]
101
+ actual_steps: list[dict],
102
+ matches: list[tuple[int, int, int, float]] | None = None
99
103
  ) -> float:
100
- matches = get_steps_matches(reference_steps_groups, actual_steps)
104
+ if matches is None:
105
+ matches = get_steps_matches(reference_steps_groups, actual_steps)
101
106
  matches_by_group = defaultdict(list)
102
107
  scores_by_group = defaultdict(float)
103
108
  for ref_group_idx, ref_match_idx, actual_idx, score in matches:
@@ -110,11 +115,33 @@ def evaluate_steps(
110
115
 
111
116
 
112
117
  def get_steps_evaluation_result_dict(reference: dict, target: dict) -> dict:
113
- act_steps = target["steps"]
114
118
  eval_result = {}
119
+ act_steps = target.get("actual_steps", [])
115
120
  eval_result["actual_steps"] = act_steps
121
+ for act_step in act_steps:
122
+ if act_step["name"] == "retrieval":
123
+ from .retrieval_answer import get_retrieval_evaluation_dict
124
+ result = get_retrieval_evaluation_dict(
125
+ question_text=reference["question_text"],
126
+ reference_answer=reference.get("reference_answer"),
127
+ actual_answer=target.get("actual_answer"),
128
+ actual_contexts=json.loads(act_step["output"])
129
+ )
130
+ act_step.update(result)
116
131
  if "reference_steps" in reference:
117
132
  ref_steps = reference["reference_steps"]
118
- steps_score = evaluate_steps(ref_steps, act_steps)
133
+ matches = get_steps_matches(ref_steps, act_steps)
134
+ steps_score = evaluate_steps(ref_steps, act_steps, matches)
119
135
  eval_result["steps_score"] = steps_score
136
+ for ref_group_idx, ref_match_idx, act_idx, _ in matches:
137
+ ref_step = ref_steps[ref_group_idx][ref_match_idx]
138
+ act_step = act_steps[act_idx]
139
+ if ref_step["name"] == "retrieval":
140
+ from .retrieval_context_texts import \
141
+ get_retrieval_evaluation_dict
142
+ res = get_retrieval_evaluation_dict(
143
+ reference_contexts=json.loads(ref_step["output"]),
144
+ actual_contexts=json.loads(act_step["output"])
145
+ )
146
+ act_step.update(res)
120
147
  return eval_result
@@ -0,0 +1,62 @@
1
+ from langevals_ragas.response_context_recall import (
2
+ RagasResponseContextRecallEntry,
3
+ RagasResponseContextRecallEvaluator,
4
+ )
5
+ from langevals_ragas.response_context_precision import (
6
+ RagasResponseContextPrecisionEntry,
7
+ RagasResponseContextPrecisionEvaluator,
8
+ )
9
+
10
+ from graphrag_eval.util import get_f1_dict
11
+
12
+
13
+ def _evaluate(
14
+ evaluator: RagasResponseContextRecallEvaluator | RagasResponseContextPrecisionEvaluator,
15
+ entry: RagasResponseContextRecallEntry | RagasResponseContextPrecisionEntry,
16
+ metric: str
17
+ ) -> dict[str, float | str]:
18
+ try:
19
+ result = evaluator.evaluate(entry)
20
+ if result.status == "processed":
21
+ return {
22
+ f"retrieval_answer_{metric}": result.score,
23
+ f"retrieval_answer_{metric}_cost": result.cost.amount,
24
+ f"retrieval_answer_{metric}_reason": result.details
25
+ }
26
+ else:
27
+ return {
28
+ f"retrieval_answer_{metric}_error": result.details
29
+ }
30
+ except Exception as e:
31
+ return {
32
+ f"retrieval_answer_{metric}_error": str(e)
33
+ }
34
+
35
+
36
+ def get_retrieval_evaluation_dict(
37
+ question_text: str,
38
+ actual_contexts: list[dict[str, str]],
39
+ reference_answer: str | None = None,
40
+ actual_answer: str | None = None,
41
+ model_name : str = "openai/gpt-4o-mini",
42
+ max_tokens : int = 65_536
43
+ ) -> dict:
44
+ if not reference_answer and not actual_answer:
45
+ return {}
46
+ settings_dict = {
47
+ "model": model_name,
48
+ "max_tokens": max_tokens
49
+ }
50
+ entry = RagasResponseContextPrecisionEntry(
51
+ input=question_text,
52
+ expected_output=reference_answer,
53
+ output=actual_answer,
54
+ contexts=[a["text"] for a in actual_contexts]
55
+ )
56
+ result = {}
57
+ evaluator = RagasResponseContextRecallEvaluator(settings=settings_dict)
58
+ result.update(_evaluate(evaluator, entry, "recall"))
59
+ evaluator = RagasResponseContextPrecisionEvaluator(settings=settings_dict)
60
+ result.update(_evaluate(evaluator, entry, "precision"))
61
+ result.update(get_f1_dict(result, "retrieval_answer"))
62
+ return result
@@ -0,0 +1,50 @@
1
+ from typing import Iterable
2
+
3
+
4
+ def recall_at_k(relevant_ids: list, retrieved_ids: list, k: int = 10) -> float:
5
+ """
6
+ Calculates Recall@k.
7
+
8
+ Args:
9
+ relevant_ids (list): A list of ground truth relevant document IDs.
10
+ retrieved_ids (list): A list of retrieved document IDs, ordered by rank.
11
+ k (int): The cutoff for the retrieval list.
12
+
13
+ Returns:
14
+ float: The Recall@k score.
15
+ """
16
+ retrieved_at_k = retrieved_ids[:k]
17
+ relevant_at_k = relevant_ids[:k]
18
+ true_positives = len(set(relevant_at_k).intersection(set(retrieved_at_k)))
19
+ total_relevant = len(relevant_at_k)
20
+ if total_relevant == 0:
21
+ return 0.0
22
+ return true_positives / total_relevant
23
+
24
+
25
+ def average_precision(relevant_ids: Iterable, retrieved_ids: Iterable) -> float:
26
+ """
27
+ Calculates Average Precision (AP) for a single query.
28
+
29
+ Args:
30
+ relevant_ids (Iterable): A set of ground truth relevant document IDs.
31
+ retrieved_ids (Iterable): A list of retrieved document IDs, ordered by rank.
32
+
33
+ Returns:
34
+ float: The Average Precision score.
35
+ """
36
+ relevant_set = set(relevant_ids)
37
+ hits = 0
38
+ sum_of_precisions = 0.0
39
+
40
+ for i, doc_id in enumerate(retrieved_ids):
41
+ if doc_id in relevant_set:
42
+ hits += 1
43
+ precision_at_k = hits / (i + 1)
44
+ sum_of_precisions += precision_at_k
45
+
46
+ total_relevant = len(relevant_set)
47
+ if total_relevant == 0:
48
+ return 0.0
49
+
50
+ return sum_of_precisions / total_relevant
@@ -0,0 +1,59 @@
1
+ from langevals_ragas.context_precision import (
2
+ RagasContextPrecisionEntry,
3
+ RagasContextPrecisionEvaluator,
4
+ )
5
+ from langevals_ragas.context_recall import (
6
+ RagasContextRecallEntry,
7
+ RagasContextRecallEvaluator,
8
+ )
9
+
10
+ from graphrag_eval.util import get_f1_dict
11
+
12
+
13
+ def _evaluate(
14
+ entry: RagasContextRecallEntry | RagasContextPrecisionEntry,
15
+ evauator: RagasContextRecallEvaluator | RagasContextPrecisionEvaluator,
16
+ metric: str
17
+ ) -> dict:
18
+ try:
19
+ result = evauator.evaluate(entry)
20
+ if result.status == "processed":
21
+ result_dict = {
22
+ f"retrieval_context_{metric}": result.score,
23
+ }
24
+ if result.details:
25
+ result_dict[f"retrieval_context_{metric}_reason"] = result.details
26
+ if result.cost is not None:
27
+ result_dict[f"retrieval_context_{metric}_cost"] = result.cost.amount
28
+ return result_dict
29
+ else:
30
+ return {
31
+ f"retrieval_context_{metric}_error": result.details,
32
+ }
33
+ except Exception as e:
34
+ return {
35
+ f"retrieval_context_{metric}_error": str(e),
36
+ }
37
+
38
+
39
+ def get_retrieval_evaluation_dict(
40
+ reference_contexts: list[dict[str, str]],
41
+ actual_contexts: list[dict[str, str]],
42
+ model_name : str = "openai/gpt-4o-mini",
43
+ max_tokens : int = 65_536
44
+ ) -> dict:
45
+ settings_dict = {
46
+ "model": model_name,
47
+ "max_tokens": max_tokens
48
+ }
49
+ entry = RagasContextRecallEntry(
50
+ expected_contexts=[a["text"] for a in reference_contexts],
51
+ contexts=[a["text"] for a in actual_contexts]
52
+ )
53
+ result = {}
54
+ evaluator = RagasContextRecallEvaluator(settings=settings_dict)
55
+ result.update(_evaluate(entry, evaluator, "recall"))
56
+ evaluator = RagasContextPrecisionEvaluator(settings=settings_dict)
57
+ result.update(_evaluate(entry, evaluator, "precision"))
58
+ result.update(get_f1_dict(result, "retrieval_context"))
59
+ return result
@@ -1,10 +1,31 @@
1
1
  from collections import Counter
2
+ import re
2
3
  from typing import Union
3
4
  import itertools
4
5
  import math
5
6
 
6
-
7
- def truncate(number, decimals=0):
7
+ XSD_NUMERIC_TYPES = {
8
+ "http://www.w3.org/2001/XMLSchema#integer",
9
+ "http://www.w3.org/2001/XMLSchema#int",
10
+ "http://www.w3.org/2001/XMLSchema#long",
11
+ "http://www.w3.org/2001/XMLSchema#short",
12
+ "http://www.w3.org/2001/XMLSchema#byte",
13
+ "http://www.w3.org/2001/XMLSchema#nonNegativeInteger",
14
+ "http://www.w3.org/2001/XMLSchema#positiveInteger",
15
+ "http://www.w3.org/2001/XMLSchema#unsignedLong",
16
+ "http://www.w3.org/2001/XMLSchema#unsignedInt",
17
+ "http://www.w3.org/2001/XMLSchema#unsignedShort",
18
+ "http://www.w3.org/2001/XMLSchema#unsignedByte",
19
+ }
20
+ XSD_FLOAT_TYPES = {
21
+ "http://www.w3.org/2001/XMLSchema#decimal",
22
+ "http://www.w3.org/2001/XMLSchema#double",
23
+ "http://www.w3.org/2001/XMLSchema#float",
24
+ }
25
+ XSD_BOOLEAN = "http://www.w3.org/2001/XMLSchema#boolean"
26
+
27
+
28
+ def truncate(number: float, decimals: int = 0) -> float:
8
29
  """
9
30
  Truncates a float to a certain number of decimal places.
10
31
  """
@@ -19,37 +40,92 @@ def truncate(number, decimals=0):
19
40
  return math.trunc(number * factor) / factor
20
41
 
21
42
 
43
+ def parse_sparql_term(term: dict) -> Union[str, float, bool, None]:
44
+ if not isinstance(term, dict):
45
+ return term
46
+
47
+ term_type = term.get("type")
48
+ value = term.get("value")
49
+
50
+ if term_type in ("literal", "typed-literal"):
51
+ datatype = term.get("datatype")
52
+ if not datatype:
53
+ return value
54
+
55
+ if datatype in XSD_NUMERIC_TYPES:
56
+ try:
57
+ return int(value)
58
+ except (ValueError, TypeError):
59
+ return value
60
+ elif datatype in XSD_FLOAT_TYPES:
61
+ try:
62
+ value = float(value)
63
+ return truncate(value, 5)
64
+ except (ValueError, TypeError):
65
+ return value
66
+ elif datatype == XSD_BOOLEAN:
67
+ return value.lower() in ("true", "1")
68
+ else:
69
+ return value
70
+
71
+ return value
72
+
73
+
22
74
  def get_var_to_values(
23
75
  vars_: list[str],
24
76
  bindings: list[dict],
25
77
  ) -> dict[str, list]:
26
- var_to_values = dict()
78
+ var_to_values = {}
27
79
  for var in vars_:
28
80
  var_to_values[var] = []
29
81
  for binding in bindings:
30
82
  if var in binding:
31
- var_to_values[var].append(binding[var]["value"])
83
+ var_to_values[var].append(parse_sparql_term(binding[var]))
32
84
  else:
33
85
  var_to_values[var].append(None)
34
86
  return dict(var_to_values)
35
87
 
36
88
 
37
- def parse_dict2table(
89
+ def convert_table_dict2lines(
38
90
  reference_vars: Union[list[str], tuple[str, ...]],
39
91
  reference_var_to_values: dict[str, list],
40
92
  ) -> list[str]:
93
+ """Converts a dictionary of lists (columns) into a list of row strings.
94
+
95
+ This function takes a dictionary where keys are column headers and values are
96
+ lists of column data. It transforms this column-oriented data into a list
97
+ of rows, where each row is a single string formed by concatenating the
98
+ string representation of its cell values.
99
+
100
+ It assumes that all lists in the `reference_var_to_values` dictionary
101
+ have the same length.
102
+
103
+ Args:
104
+ reference_vars: An ordered list or tuple of keys that defines the
105
+ column order for the output rows.
106
+ reference_var_to_values: A dictionary mapping column names (keys) to
107
+ lists of their corresponding values.
108
+
109
+ Returns:
110
+ A list of strings, where each string is a concatenation of the values
111
+ for a single row, ordered according to `reference_vars`.
112
+
113
+ Example:
114
+ >>> columns = ['name', 'age', 'city']
115
+ >>> data = {
116
+ ... 'name': ['Alice', 'Bob'],
117
+ ... 'age': [30, 25],
118
+ ... 'city': ['New York', 'Los Angeles']
119
+ ... }
120
+ >>> dict2lines(columns, data)
121
+ ['Alice30New York', 'Bob25Los Angeles']
122
+ """
41
123
  result = []
42
124
  num_rows = len(reference_var_to_values[reference_vars[0]])
43
125
  for row_idx in range(num_rows):
44
126
  row = []
45
127
  for reference_var in reference_vars:
46
128
  val = reference_var_to_values[reference_var][row_idx]
47
- if isinstance(val, float):
48
- val = truncate(val, 5)
49
- if isinstance(val, int):
50
- print(val)
51
- val = float(val)
52
- print(str(val))
53
129
  val = str(val)
54
130
  row.append(val)
55
131
  result.append("".join(row))
@@ -64,8 +140,6 @@ def compare_values(
64
140
  results_are_ordered: bool,
65
141
  ) -> bool:
66
142
 
67
- if len(reference_vars) > len(actual_vars):
68
- return False
69
143
  if len(reference_vars) < len(actual_vars):
70
144
  for combination in itertools.combinations(actual_vars, len(reference_vars)):
71
145
  if compare_values(
@@ -78,9 +152,9 @@ def compare_values(
78
152
  return True
79
153
  return False
80
154
 
81
- table = parse_dict2table(reference_vars, reference_var_to_values)
155
+ table = convert_table_dict2lines(reference_vars, reference_var_to_values)
82
156
  for permutation in itertools.permutations(actual_vars):
83
- actual_table = parse_dict2table(permutation, actual_var_to_values)
157
+ actual_table = convert_table_dict2lines(permutation, actual_var_to_values)
84
158
  if (results_are_ordered and table == actual_table) or (
85
159
  not results_are_ordered and Counter(table) == Counter(actual_table)
86
160
  ):
@@ -0,0 +1,25 @@
1
+ def compute_f1(recall: float | str | None, precision: float | str | None) -> float | None:
2
+ if recall is None or precision is None:
3
+ return None
4
+ recall = float(recall)
5
+ precision = float(precision)
6
+ if recall == 0.0 and precision == 0.0:
7
+ return 0.0
8
+ return 2 * (recall * precision) / (recall + precision)
9
+
10
+
11
+ def get_f1_dict(
12
+ input_dict: dict,
13
+ prefix: str
14
+ ) -> dict:
15
+ recall = input_dict.get(f"{prefix}_recall")
16
+ precision = input_dict.get(f"{prefix}_precision")
17
+ f1 = compute_f1(recall, precision)
18
+ if f1 is None:
19
+ return {}
20
+ result = {f"{prefix}_f1": f1}
21
+ recall_cost = input_dict.get(f"{prefix}_recall_cost")
22
+ precision_cost = input_dict.get(f"{prefix}_precision_cost")
23
+ if recall_cost is not None and precision_cost is not None:
24
+ result[f"{prefix}_f1_cost"] = recall_cost + precision_cost
25
+ return result
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "graphrag-eval"
3
- version = "4.0.0"
3
+ version = "5.0.1"
4
4
  description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
5
5
  authors = [
6
6
  {name = "Neli Hateva", email = "neli.hateva@graphwise.ai"},
@@ -11,20 +11,15 @@ license = "Apache-2.0"
11
11
  requires-python = ">=3.12,<3.13"
12
12
 
13
13
  [project.urls]
14
- repository = "https://github.com/Ontotext-AD/qa-eval"
14
+ repository = "https://github.com/Ontotext-AD/graphrag-eval"
15
15
 
16
- [build-system]
17
- requires = ["poetry-core>=2.0.0"]
18
- build-backend = "poetry.core.masonry.api"
19
-
20
- [tool.poetry.group.test.dependencies]
21
- pytest = "<9,>=8"
22
- pytest-cov = "<7,>=6"
23
- jsonlines = "4.0.0"
24
- pyyaml = "^6.0.2"
16
+ [tool.poetry.dependencies]
17
+ openai = { version = "^1.97.0", optional = true }
18
+ langevals = { version = "0.1.*", optional = true }
19
+ langevals-ragas = { version = "^0.1.12", optional = true }
25
20
 
26
- [tool.poetry.group.test]
27
- optional = true
21
+ [tool.poetry.extras]
22
+ openai = ["openai", "langevals", "langevals-ragas"]
28
23
 
29
24
  [tool.poetry.group.openai.dependencies]
30
25
  openai = "^1.97.0"
@@ -34,5 +29,18 @@ langevals-ragas = "^0.1.12"
34
29
  [tool.poetry.group.openai]
35
30
  optional = true
36
31
 
32
+ [tool.poetry.group.test.dependencies]
33
+ pytest = "<9,>=8"
34
+ pytest-cov = "<7,>=6"
35
+ jsonlines = "4.0.0"
36
+ pyyaml = "^6.0.2"
37
+
38
+ [tool.poetry.group.test]
39
+ optional = true
40
+
37
41
  [project.scripts]
38
- answer-correctness = "qa_eval.answer_evaluation:main"
42
+ answer-correctness = "graphrag_eval.answer_correctness:main"
43
+
44
+ [build-system]
45
+ requires = ["poetry-core>=2.0.0"]
46
+ build-backend = "poetry.core.masonry.api"
@@ -1,55 +0,0 @@
1
- from typing import Iterable
2
-
3
-
4
- def recall_at_k(relevant_docs: Iterable, retrieved_docs: list, k: int = 10) -> float:
5
- """
6
- Calculates Recall@k.
7
-
8
- Args:
9
- relevant_docs (Iterable): A set of ground truth relevant document IDs.
10
- retrieved_docs (list): A list of retrieved document IDs, ordered by rank.
11
- k (int): The cutoff for the retrieval list.
12
-
13
- Returns:
14
- float: The Recall@k score.
15
- """
16
- retrieved_at_k = retrieved_docs[:k]
17
-
18
- relevant_set = set(relevant_docs)
19
- retrieved_set = set(retrieved_at_k)
20
- true_positives = len(relevant_set.intersection(retrieved_set))
21
-
22
- total_relevant = len(relevant_set)
23
-
24
- if total_relevant == 0:
25
- return 0.0
26
-
27
- return true_positives / total_relevant
28
-
29
-
30
- def average_precision(relevant_docs: Iterable, retrieved_docs: Iterable) -> float:
31
- """
32
- Calculates Average Precision (AP) for a single query.
33
-
34
- Args:
35
- relevant_docs (Iterable): A set of ground truth relevant document IDs.
36
- retrieved_docs (Iterable): A list of retrieved document IDs, ordered by rank.
37
-
38
- Returns:
39
- float: The Average Precision score.
40
- """
41
- relevant_set = set(relevant_docs)
42
- hits = 0
43
- sum_of_precisions = 0.0
44
-
45
- for i, doc_id in enumerate(retrieved_docs):
46
- if doc_id in relevant_set:
47
- hits += 1
48
- precision_at_k = hits / (i + 1)
49
- sum_of_precisions += precision_at_k
50
-
51
- total_relevant = len(relevant_set)
52
- if total_relevant == 0:
53
- return 0.0
54
-
55
- return sum_of_precisions / total_relevant
File without changes