graphrag-eval 5.1.0__tar.gz → 5.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {graphrag_eval-5.1.0 → graphrag_eval-5.1.2}/PKG-INFO +83 -81
- {graphrag_eval-5.1.0 → graphrag_eval-5.1.2}/README.md +82 -80
- graphrag_eval-5.1.2/graphrag_eval/__init__.py +2 -0
- {graphrag_eval-5.1.0 → graphrag_eval-5.1.2}/graphrag_eval/aggregation.py +103 -97
- {graphrag_eval-5.1.0 → graphrag_eval-5.1.2}/graphrag_eval/answer_correctness.py +11 -12
- {graphrag_eval-5.1.0 → graphrag_eval-5.1.2}/graphrag_eval/evaluation.py +1 -1
- graphrag_eval-5.1.2/graphrag_eval/steps/__init__.py +0 -0
- graphrag_eval-5.1.0/graphrag_eval/steps/__init__.py → graphrag_eval-5.1.2/graphrag_eval/steps/evaluation.py +64 -54
- {graphrag_eval-5.1.0 → graphrag_eval-5.1.2}/pyproject.toml +1 -1
- graphrag_eval-5.1.0/graphrag_eval/__init__.py +0 -4
- {graphrag_eval-5.1.0 → graphrag_eval-5.1.2}/LICENSE +0 -0
- {graphrag_eval-5.1.0 → graphrag_eval-5.1.2}/graphrag_eval/answer_relevance.py +0 -0
- {graphrag_eval-5.1.0 → graphrag_eval-5.1.2}/graphrag_eval/prompts/template.md +0 -0
- {graphrag_eval-5.1.0 → graphrag_eval-5.1.2}/graphrag_eval/steps/retrieval_answer.py +0 -0
- {graphrag_eval-5.1.0 → graphrag_eval-5.1.2}/graphrag_eval/steps/retrieval_context_ids.py +0 -0
- {graphrag_eval-5.1.0 → graphrag_eval-5.1.2}/graphrag_eval/steps/retrieval_context_texts.py +0 -0
- {graphrag_eval-5.1.0 → graphrag_eval-5.1.2}/graphrag_eval/steps/sparql.py +0 -0
- {graphrag_eval-5.1.0 → graphrag_eval-5.1.2}/graphrag_eval/util.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: graphrag-eval
|
|
3
|
-
Version: 5.1.
|
|
3
|
+
Version: 5.1.2
|
|
4
4
|
Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
|
|
5
5
|
License: Apache-2.0
|
|
6
6
|
Author: Philip Ganchev
|
|
@@ -24,8 +24,7 @@ Description-Content-Type: text/markdown
|
|
|
24
24
|
|
|
25
25
|
# QA Evaluation
|
|
26
26
|
|
|
27
|
-
This is a Python module for assessing the quality of question-answering systems such as ones based on LLM agents, based on a set of questions and reference answers for them. This includes evaluating the final answer and the steps used
|
|
28
|
-
to reach the answer (such as orchestrated and executed steps), compared to the given reference steps.
|
|
27
|
+
This is a Python module for assessing the quality of question-answering systems such as ones based on LLM agents, based on a set of questions and reference answers for them. This includes evaluating the final answer and the steps used to reach the answer (such as orchestrated and executed steps), compared to the given reference steps.
|
|
29
28
|
|
|
30
29
|
## License
|
|
31
30
|
|
|
@@ -72,24 +71,24 @@ We plan to improve CLI support in future releases.
|
|
|
72
71
|
|
|
73
72
|
To evaluate answers and/or steps:
|
|
74
73
|
1. Install this package: section [Install](#Installation)
|
|
75
|
-
1. Format the
|
|
76
|
-
1. Format the answers and/or steps you want to evaluate: section [
|
|
74
|
+
1. Format the dataset of questions and reference answers and/or steps: section [Reference Q&A Data](#Reference-qa-Data)
|
|
75
|
+
1. Format the answers and/or steps you want to evaluate: section [Responses to evaluate](#Responses-to-evaluate)
|
|
77
76
|
1. To evaluate answer relevance:
|
|
78
77
|
1. Include `actual_answer` in the target data to evaluate
|
|
79
78
|
1. Set environment variable `OPENAI_API_KEY` appropriately
|
|
80
79
|
1. To evaluate answer correctness:
|
|
81
|
-
1. Include `reference_answer` in the reference
|
|
80
|
+
1. Include `reference_answer` in the reference dataset and `actual_answer` in the target data to evaluate
|
|
82
81
|
1. Set environment variable `OPENAI_API_KEY` appropriately
|
|
83
82
|
1. To evaluate steps:
|
|
84
|
-
1. Include `reference_steps` in the reference
|
|
85
|
-
1. Call the evaluation function with the reference
|
|
83
|
+
1. Include `reference_steps` in the reference data and `actual_steps` in target data to evaluate
|
|
84
|
+
1. Call the evaluation function with the reference data and target data: section [Usage Code](#Usage-Code)
|
|
86
85
|
1. Call the aggregation function with the evaluation results
|
|
87
86
|
|
|
88
87
|
Answer evaluation (correctness and relevance) uses the LLM `openai/gpt-4o-mini`.
|
|
89
88
|
|
|
90
|
-
### Reference Q&A
|
|
89
|
+
### Reference Q&A Data
|
|
91
90
|
|
|
92
|
-
A reference
|
|
91
|
+
A reference dataset is a list of templates, each of which contains:
|
|
93
92
|
|
|
94
93
|
- `template_id`: Unique template identifier
|
|
95
94
|
- `questions`: A list of questions derived from this template, where each includes:
|
|
@@ -109,9 +108,9 @@ Each step includes:
|
|
|
109
108
|
- `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
|
|
110
109
|
- `required_columns`: (optional) - required only for SPARQL query results; list of binding names, which are required for SPARQL query results to match
|
|
111
110
|
|
|
112
|
-
####
|
|
111
|
+
#### Reference Data
|
|
113
112
|
|
|
114
|
-
The example
|
|
113
|
+
The example data below illustrates a minimal but realistic Q&A dataset, showing two templates with associated questions and steps.
|
|
115
114
|
|
|
116
115
|
```yaml
|
|
117
116
|
- template_id: list_all_transformers_within_Substation_SUBSTATION
|
|
@@ -277,9 +276,9 @@ The example corpus below illustrates a minimal but realistic Q&A dataset, showin
|
|
|
277
276
|
|
|
278
277
|
The module is agnostic to the specific LLM agent implementation and model; it depends solely on the format of the response.
|
|
279
278
|
|
|
280
|
-
###
|
|
279
|
+
### Responses to evaluate
|
|
281
280
|
|
|
282
|
-
|
|
281
|
+
If the question-answering system responds successfully to a question, evaluate the response by calling `run_evaluation()` with the response formatted as in the example below. (If instead an error occurs while generating a response, format it as in [Target Input on Error](#target-input-on-error).)
|
|
283
282
|
|
|
284
283
|
```json
|
|
285
284
|
{
|
|
@@ -332,9 +331,9 @@ Below is an example response from the question-answering system for a single que
|
|
|
332
331
|
}
|
|
333
332
|
```
|
|
334
333
|
|
|
335
|
-
####
|
|
334
|
+
#### Target Input on Error
|
|
336
335
|
|
|
337
|
-
If an error occurs
|
|
336
|
+
If an error occurs while the question-answering system is generating a response, and you want to tally this error, the input to `run_evaluation()` should be like:
|
|
338
337
|
|
|
339
338
|
```json
|
|
340
339
|
{
|
|
@@ -344,22 +343,22 @@ If an error occurs during generating a response to a question, the expected targ
|
|
|
344
343
|
}
|
|
345
344
|
```
|
|
346
345
|
|
|
347
|
-
###
|
|
346
|
+
### Usage Code
|
|
348
347
|
|
|
349
348
|
```python
|
|
350
349
|
from graphrag_eval import run_evaluation, compute_aggregates
|
|
351
350
|
|
|
352
|
-
reference_qas: list[dict] = [] # read your
|
|
351
|
+
reference_qas: list[dict] = [] # read your reference data
|
|
353
352
|
chat_responses: dict = {} # call your implementation to get the response
|
|
354
353
|
evaluation_results = run_evaluation(reference_qas, chat_responses)
|
|
355
354
|
aggregates = compute_aggregates(evaluation_results)
|
|
356
355
|
```
|
|
357
356
|
|
|
358
|
-
`evaluation_results` is a list of statistics for each question, as in section [
|
|
357
|
+
`evaluation_results` is a list of statistics for each question, as in section [Evaluation Results](#Evaluation-results). The format is explained in section [Output Keys](#output-keys).
|
|
359
358
|
|
|
360
359
|
If your chat responses contain actual answers, set your environment variable `OPENAI_API_KEY` before running the code above.
|
|
361
360
|
|
|
362
|
-
###
|
|
361
|
+
### Evaluation Results
|
|
363
362
|
|
|
364
363
|
The output is a list of statistics for each question from the reference Q&A dataset. Here is an example of statistics for one question:
|
|
365
364
|
|
|
@@ -584,69 +583,72 @@ All `actual_steps` with `name` "retrieval" contain:
|
|
|
584
583
|
|
|
585
584
|
#### Aggregates Keys
|
|
586
585
|
|
|
587
|
-
The `aggregates` object provides aggregated evaluation metrics.
|
|
588
|
-
|
|
589
|
-
|
|
586
|
+
The `aggregates` object provides aggregated evaluation metrics. These aggregates support analysis of agent quality, token efficiency, and execution performance. Aggregates are computed:
|
|
587
|
+
1. per question template, and
|
|
588
|
+
1. over all questions in the dataset, using micro and macro averaging
|
|
589
|
+
|
|
590
590
|
Aggregates are:
|
|
591
591
|
- `per_template`: a dictionary mapping a template identifier to the following statistics:
|
|
592
592
|
- `number_of_error_samples`: number of questions for this template, which resulted in error response
|
|
593
593
|
- `number_of_success_samples`: number of questions for this template, which resulted in successful response
|
|
594
|
-
- `
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
- `
|
|
605
|
-
- `
|
|
606
|
-
- `
|
|
607
|
-
- `
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
594
|
+
- `sum`, `mean`, `median`, `min` and `max` statistics over all non-error responses for this template for the following metrics:
|
|
595
|
+
- `input_tokens`
|
|
596
|
+
- `output_tokens`
|
|
597
|
+
- `total_tokens`
|
|
598
|
+
- `elapsed_sec`
|
|
599
|
+
- `answer_recall`
|
|
600
|
+
- `answer_precision`
|
|
601
|
+
- `answer_f1`
|
|
602
|
+
- `answer_relevance`
|
|
603
|
+
- `steps_score`
|
|
604
|
+
- `retrieval_answer_recall`
|
|
605
|
+
- `retrieval_answer_precision`
|
|
606
|
+
- `retrieval_answer_f1`
|
|
607
|
+
- `retrieval_context_recall`
|
|
608
|
+
- `retrieval_context_precision`
|
|
609
|
+
- `retrieval_context_f1`
|
|
610
|
+
- `steps`: includes:
|
|
611
|
+
- `total`: for each step type, how many times it was executed
|
|
612
|
+
- `once_per_sample`: how many times each step was executed, counted only once per question
|
|
613
|
+
- `empty_results`: how many times the step was executed and returned empty results
|
|
614
|
+
- `errors`: how many times the step was executed and resulted in error
|
|
614
615
|
- `micro`: statistics across questions, regardless of template. It includes:
|
|
615
616
|
- `number_of_error_samples`: total number of questions, which resulted in error response
|
|
616
617
|
- `number_of_success_samples`: total number of questions, which resulted in successful response
|
|
617
|
-
- `
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
- `
|
|
634
|
-
|
|
635
|
-
- `
|
|
636
|
-
- `
|
|
637
|
-
- `
|
|
638
|
-
- `
|
|
639
|
-
- `
|
|
640
|
-
- `
|
|
641
|
-
- `
|
|
642
|
-
- `
|
|
643
|
-
- `
|
|
644
|
-
- `
|
|
645
|
-
- `
|
|
646
|
-
- `
|
|
647
|
-
- `
|
|
648
|
-
- `
|
|
649
|
-
- `
|
|
618
|
+
- `sum`, `mean`, `median`, `min` and `max` statistics over all non-error responses for the following metrics:
|
|
619
|
+
- `input_tokens`
|
|
620
|
+
- `output_tokens`
|
|
621
|
+
- `total_tokens`
|
|
622
|
+
- `elapsed_sec`
|
|
623
|
+
- `answer_recall`
|
|
624
|
+
- `answer_precision`
|
|
625
|
+
- `answer_f1`
|
|
626
|
+
- `answer_relevance`
|
|
627
|
+
- `answer_relevance_cost`
|
|
628
|
+
- `retrieval_answer_recall`
|
|
629
|
+
- `retrieval_answer_precision`
|
|
630
|
+
- `retrieval_answer_f1`
|
|
631
|
+
- `retrieval_context_recall`
|
|
632
|
+
- `retrieval_context_precision`
|
|
633
|
+
- `retrieval_context_f1`
|
|
634
|
+
- `steps_score`
|
|
635
|
+
- `macro`: averages across templates, i.e., the mean of each metric per template, averaged. It includes the following means:
|
|
636
|
+
- `input_tokens`
|
|
637
|
+
- `output_tokens`
|
|
638
|
+
- `total_tokens`
|
|
639
|
+
- `elapsed_sec`
|
|
640
|
+
- `answer_recall`
|
|
641
|
+
- `answer_precision`
|
|
642
|
+
- `answer_f1`
|
|
643
|
+
- `answer_relevance`
|
|
644
|
+
- `answer_relevance_cost`
|
|
645
|
+
- `retrieval_answer_recall`
|
|
646
|
+
- `retrieval_answer_precision`
|
|
647
|
+
- `retrieval_answer_f1`
|
|
648
|
+
- `retrieval_context_recall`
|
|
649
|
+
- `retrieval_context_precision`
|
|
650
|
+
- `retrieval_context_f1`
|
|
651
|
+
- `steps_score`
|
|
650
652
|
|
|
651
653
|
#### Example Aggregates
|
|
652
654
|
|
|
@@ -674,11 +676,11 @@ per_template:
|
|
|
674
676
|
min: 1.0
|
|
675
677
|
max: 1.0
|
|
676
678
|
answer_relevance:
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
679
|
+
min: 0.9
|
|
680
|
+
max: 0.9
|
|
681
|
+
mean: 0.9
|
|
682
|
+
median: 0.9
|
|
683
|
+
sum: 0.9
|
|
682
684
|
answer_relevance_cost:
|
|
683
685
|
min: 0.0007
|
|
684
686
|
max: 0.0007
|
|
@@ -4,8 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
# QA Evaluation
|
|
6
6
|
|
|
7
|
-
This is a Python module for assessing the quality of question-answering systems such as ones based on LLM agents, based on a set of questions and reference answers for them. This includes evaluating the final answer and the steps used
|
|
8
|
-
to reach the answer (such as orchestrated and executed steps), compared to the given reference steps.
|
|
7
|
+
This is a Python module for assessing the quality of question-answering systems such as ones based on LLM agents, based on a set of questions and reference answers for them. This includes evaluating the final answer and the steps used to reach the answer (such as orchestrated and executed steps), compared to the given reference steps.
|
|
9
8
|
|
|
10
9
|
## License
|
|
11
10
|
|
|
@@ -52,24 +51,24 @@ We plan to improve CLI support in future releases.
|
|
|
52
51
|
|
|
53
52
|
To evaluate answers and/or steps:
|
|
54
53
|
1. Install this package: section [Install](#Installation)
|
|
55
|
-
1. Format the
|
|
56
|
-
1. Format the answers and/or steps you want to evaluate: section [
|
|
54
|
+
1. Format the dataset of questions and reference answers and/or steps: section [Reference Q&A Data](#Reference-qa-Data)
|
|
55
|
+
1. Format the answers and/or steps you want to evaluate: section [Responses to evaluate](#Responses-to-evaluate)
|
|
57
56
|
1. To evaluate answer relevance:
|
|
58
57
|
1. Include `actual_answer` in the target data to evaluate
|
|
59
58
|
1. Set environment variable `OPENAI_API_KEY` appropriately
|
|
60
59
|
1. To evaluate answer correctness:
|
|
61
|
-
1. Include `reference_answer` in the reference
|
|
60
|
+
1. Include `reference_answer` in the reference dataset and `actual_answer` in the target data to evaluate
|
|
62
61
|
1. Set environment variable `OPENAI_API_KEY` appropriately
|
|
63
62
|
1. To evaluate steps:
|
|
64
|
-
1. Include `reference_steps` in the reference
|
|
65
|
-
1. Call the evaluation function with the reference
|
|
63
|
+
1. Include `reference_steps` in the reference data and `actual_steps` in target data to evaluate
|
|
64
|
+
1. Call the evaluation function with the reference data and target data: section [Usage Code](#Usage-Code)
|
|
66
65
|
1. Call the aggregation function with the evaluation results
|
|
67
66
|
|
|
68
67
|
Answer evaluation (correctness and relevance) uses the LLM `openai/gpt-4o-mini`.
|
|
69
68
|
|
|
70
|
-
### Reference Q&A
|
|
69
|
+
### Reference Q&A Data
|
|
71
70
|
|
|
72
|
-
A reference
|
|
71
|
+
A reference dataset is a list of templates, each of which contains:
|
|
73
72
|
|
|
74
73
|
- `template_id`: Unique template identifier
|
|
75
74
|
- `questions`: A list of questions derived from this template, where each includes:
|
|
@@ -89,9 +88,9 @@ Each step includes:
|
|
|
89
88
|
- `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
|
|
90
89
|
- `required_columns`: (optional) - required only for SPARQL query results; list of binding names, which are required for SPARQL query results to match
|
|
91
90
|
|
|
92
|
-
####
|
|
91
|
+
#### Reference Data
|
|
93
92
|
|
|
94
|
-
The example
|
|
93
|
+
The example data below illustrates a minimal but realistic Q&A dataset, showing two templates with associated questions and steps.
|
|
95
94
|
|
|
96
95
|
```yaml
|
|
97
96
|
- template_id: list_all_transformers_within_Substation_SUBSTATION
|
|
@@ -257,9 +256,9 @@ The example corpus below illustrates a minimal but realistic Q&A dataset, showin
|
|
|
257
256
|
|
|
258
257
|
The module is agnostic to the specific LLM agent implementation and model; it depends solely on the format of the response.
|
|
259
258
|
|
|
260
|
-
###
|
|
259
|
+
### Responses to evaluate
|
|
261
260
|
|
|
262
|
-
|
|
261
|
+
If the question-answering system responds successfully to a question, evaluate the response by calling `run_evaluation()` with the response formatted as in the example below. (If instead an error occurs while generating a response, format it as in [Target Input on Error](#target-input-on-error).)
|
|
263
262
|
|
|
264
263
|
```json
|
|
265
264
|
{
|
|
@@ -312,9 +311,9 @@ Below is an example response from the question-answering system for a single que
|
|
|
312
311
|
}
|
|
313
312
|
```
|
|
314
313
|
|
|
315
|
-
####
|
|
314
|
+
#### Target Input on Error
|
|
316
315
|
|
|
317
|
-
If an error occurs
|
|
316
|
+
If an error occurs while the question-answering system is generating a response, and you want to tally this error, the input to `run_evaluation()` should be like:
|
|
318
317
|
|
|
319
318
|
```json
|
|
320
319
|
{
|
|
@@ -324,22 +323,22 @@ If an error occurs during generating a response to a question, the expected targ
|
|
|
324
323
|
}
|
|
325
324
|
```
|
|
326
325
|
|
|
327
|
-
###
|
|
326
|
+
### Usage Code
|
|
328
327
|
|
|
329
328
|
```python
|
|
330
329
|
from graphrag_eval import run_evaluation, compute_aggregates
|
|
331
330
|
|
|
332
|
-
reference_qas: list[dict] = [] # read your
|
|
331
|
+
reference_qas: list[dict] = [] # read your reference data
|
|
333
332
|
chat_responses: dict = {} # call your implementation to get the response
|
|
334
333
|
evaluation_results = run_evaluation(reference_qas, chat_responses)
|
|
335
334
|
aggregates = compute_aggregates(evaluation_results)
|
|
336
335
|
```
|
|
337
336
|
|
|
338
|
-
`evaluation_results` is a list of statistics for each question, as in section [
|
|
337
|
+
`evaluation_results` is a list of statistics for each question, as in section [Evaluation Results](#Evaluation-results). The format is explained in section [Output Keys](#output-keys).
|
|
339
338
|
|
|
340
339
|
If your chat responses contain actual answers, set your environment variable `OPENAI_API_KEY` before running the code above.
|
|
341
340
|
|
|
342
|
-
###
|
|
341
|
+
### Evaluation Results
|
|
343
342
|
|
|
344
343
|
The output is a list of statistics for each question from the reference Q&A dataset. Here is an example of statistics for one question:
|
|
345
344
|
|
|
@@ -564,69 +563,72 @@ All `actual_steps` with `name` "retrieval" contain:
|
|
|
564
563
|
|
|
565
564
|
#### Aggregates Keys
|
|
566
565
|
|
|
567
|
-
The `aggregates` object provides aggregated evaluation metrics.
|
|
568
|
-
|
|
569
|
-
|
|
566
|
+
The `aggregates` object provides aggregated evaluation metrics. These aggregates support analysis of agent quality, token efficiency, and execution performance. Aggregates are computed:
|
|
567
|
+
1. per question template, and
|
|
568
|
+
1. over all questions in the dataset, using micro and macro averaging
|
|
569
|
+
|
|
570
570
|
Aggregates are:
|
|
571
571
|
- `per_template`: a dictionary mapping a template identifier to the following statistics:
|
|
572
572
|
- `number_of_error_samples`: number of questions for this template, which resulted in error response
|
|
573
573
|
- `number_of_success_samples`: number of questions for this template, which resulted in successful response
|
|
574
|
-
- `
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
- `
|
|
585
|
-
- `
|
|
586
|
-
- `
|
|
587
|
-
- `
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
574
|
+
- `sum`, `mean`, `median`, `min` and `max` statistics over all non-error responses for this template for the following metrics:
|
|
575
|
+
- `input_tokens`
|
|
576
|
+
- `output_tokens`
|
|
577
|
+
- `total_tokens`
|
|
578
|
+
- `elapsed_sec`
|
|
579
|
+
- `answer_recall`
|
|
580
|
+
- `answer_precision`
|
|
581
|
+
- `answer_f1`
|
|
582
|
+
- `answer_relevance`
|
|
583
|
+
- `steps_score`
|
|
584
|
+
- `retrieval_answer_recall`
|
|
585
|
+
- `retrieval_answer_precision`
|
|
586
|
+
- `retrieval_answer_f1`
|
|
587
|
+
- `retrieval_context_recall`
|
|
588
|
+
- `retrieval_context_precision`
|
|
589
|
+
- `retrieval_context_f1`
|
|
590
|
+
- `steps`: includes:
|
|
591
|
+
- `total`: for each step type, how many times it was executed
|
|
592
|
+
- `once_per_sample`: how many times each step was executed, counted only once per question
|
|
593
|
+
- `empty_results`: how many times the step was executed and returned empty results
|
|
594
|
+
- `errors`: how many times the step was executed and resulted in error
|
|
594
595
|
- `micro`: statistics across questions, regardless of template. It includes:
|
|
595
596
|
- `number_of_error_samples`: total number of questions, which resulted in error response
|
|
596
597
|
- `number_of_success_samples`: total number of questions, which resulted in successful response
|
|
597
|
-
- `
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
- `
|
|
614
|
-
|
|
615
|
-
- `
|
|
616
|
-
- `
|
|
617
|
-
- `
|
|
618
|
-
- `
|
|
619
|
-
- `
|
|
620
|
-
- `
|
|
621
|
-
- `
|
|
622
|
-
- `
|
|
623
|
-
- `
|
|
624
|
-
- `
|
|
625
|
-
- `
|
|
626
|
-
- `
|
|
627
|
-
- `
|
|
628
|
-
- `
|
|
629
|
-
- `
|
|
598
|
+
- `sum`, `mean`, `median`, `min` and `max` statistics over all non-error responses for the following metrics:
|
|
599
|
+
- `input_tokens`
|
|
600
|
+
- `output_tokens`
|
|
601
|
+
- `total_tokens`
|
|
602
|
+
- `elapsed_sec`
|
|
603
|
+
- `answer_recall`
|
|
604
|
+
- `answer_precision`
|
|
605
|
+
- `answer_f1`
|
|
606
|
+
- `answer_relevance`
|
|
607
|
+
- `answer_relevance_cost`
|
|
608
|
+
- `retrieval_answer_recall`
|
|
609
|
+
- `retrieval_answer_precision`
|
|
610
|
+
- `retrieval_answer_f1`
|
|
611
|
+
- `retrieval_context_recall`
|
|
612
|
+
- `retrieval_context_precision`
|
|
613
|
+
- `retrieval_context_f1`
|
|
614
|
+
- `steps_score`
|
|
615
|
+
- `macro`: averages across templates, i.e., the mean of each metric per template, averaged. It includes the following means:
|
|
616
|
+
- `input_tokens`
|
|
617
|
+
- `output_tokens`
|
|
618
|
+
- `total_tokens`
|
|
619
|
+
- `elapsed_sec`
|
|
620
|
+
- `answer_recall`
|
|
621
|
+
- `answer_precision`
|
|
622
|
+
- `answer_f1`
|
|
623
|
+
- `answer_relevance`
|
|
624
|
+
- `answer_relevance_cost`
|
|
625
|
+
- `retrieval_answer_recall`
|
|
626
|
+
- `retrieval_answer_precision`
|
|
627
|
+
- `retrieval_answer_f1`
|
|
628
|
+
- `retrieval_context_recall`
|
|
629
|
+
- `retrieval_context_precision`
|
|
630
|
+
- `retrieval_context_f1`
|
|
631
|
+
- `steps_score`
|
|
630
632
|
|
|
631
633
|
#### Example Aggregates
|
|
632
634
|
|
|
@@ -654,11 +656,11 @@ per_template:
|
|
|
654
656
|
min: 1.0
|
|
655
657
|
max: 1.0
|
|
656
658
|
answer_relevance:
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
659
|
+
min: 0.9
|
|
660
|
+
max: 0.9
|
|
661
|
+
mean: 0.9
|
|
662
|
+
median: 0.9
|
|
663
|
+
sum: 0.9
|
|
662
664
|
answer_relevance_cost:
|
|
663
665
|
min: 0.0007
|
|
664
666
|
max: 0.0007
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from collections import defaultdict
|
|
3
|
+
from collections.abc import Sequence
|
|
3
4
|
from statistics import mean, median
|
|
4
|
-
from typing import Any, Iterable
|
|
5
|
-
|
|
5
|
+
from typing import Any, Collection, Iterable
|
|
6
6
|
|
|
7
7
|
METRICS = [
|
|
8
8
|
"answer_recall",
|
|
@@ -32,7 +32,7 @@ STEPS_METRICS = {
|
|
|
32
32
|
"retrieval_context_f1_cost",
|
|
33
33
|
]
|
|
34
34
|
}
|
|
35
|
-
|
|
35
|
+
RETAINED_METRICS = [
|
|
36
36
|
"input_tokens",
|
|
37
37
|
"output_tokens",
|
|
38
38
|
"total_tokens",
|
|
@@ -50,124 +50,96 @@ def stats_for_series(values: Iterable[int | float]) -> dict[str, float]:
|
|
|
50
50
|
}
|
|
51
51
|
|
|
52
52
|
|
|
53
|
-
def
|
|
54
|
-
sample: dict,
|
|
55
|
-
|
|
56
|
-
template_id: str
|
|
53
|
+
def update_step_metrics(
|
|
54
|
+
sample: dict[str, Any],
|
|
55
|
+
template_step_metrics: dict[str, list[float]],
|
|
57
56
|
):
|
|
58
57
|
for step in sample.get("actual_steps", []):
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
step_metrics_per_template[template_id][metric].append(value)
|
|
58
|
+
for metric in STEPS_METRICS.get(step["name"], []):
|
|
59
|
+
value = step.get(metric)
|
|
60
|
+
if value is not None:
|
|
61
|
+
template_step_metrics[metric].append(value)
|
|
64
62
|
|
|
65
63
|
|
|
66
|
-
def
|
|
67
|
-
sample: dict,
|
|
68
|
-
|
|
69
|
-
template_id: str
|
|
64
|
+
def update_stats(
|
|
65
|
+
sample: dict[str, float | int],
|
|
66
|
+
template_stats: dict[str, list[float | int]],
|
|
70
67
|
):
|
|
71
68
|
for metric in METRICS:
|
|
72
69
|
value = sample.get(metric)
|
|
73
70
|
if value is not None:
|
|
74
|
-
|
|
71
|
+
template_stats[metric].append(value)
|
|
75
72
|
|
|
76
73
|
|
|
77
|
-
def
|
|
78
|
-
sample: dict,
|
|
79
|
-
|
|
80
|
-
template_id: str
|
|
74
|
+
def update_steps_summary(
|
|
75
|
+
sample: dict[str, Any],
|
|
76
|
+
template_steps_summary: dict,
|
|
81
77
|
):
|
|
82
78
|
seen = set()
|
|
83
79
|
for step in sample.get("actual_steps", []):
|
|
84
80
|
name = step["name"]
|
|
85
|
-
template_steps_summary = steps_summary_per_template[template_id]
|
|
86
81
|
template_steps_summary["total"][name] += 1
|
|
87
82
|
if step["status"] == "error":
|
|
88
83
|
template_steps_summary["errors"][name] += 1
|
|
89
84
|
if name not in seen:
|
|
90
85
|
seen.add(name)
|
|
91
86
|
template_steps_summary["once_per_sample"][name] += 1
|
|
92
|
-
|
|
93
87
|
if step["status"] != "error":
|
|
94
88
|
try:
|
|
95
89
|
res = json.loads(step["output"])
|
|
96
|
-
if "results" in res and "bindings" in res["results"]:
|
|
90
|
+
if isinstance(res, dict) and "results" in res and "bindings" in res["results"]:
|
|
97
91
|
if not res["results"]["bindings"]:
|
|
98
92
|
template_steps_summary["empty_results"][name] += 1
|
|
99
93
|
except json.decoder.JSONDecodeError:
|
|
100
94
|
pass
|
|
101
95
|
|
|
102
96
|
|
|
103
|
-
def
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
for sample in samples:
|
|
112
|
-
template_id = sample["template_id"]
|
|
113
|
-
templates_ids.add(template_id)
|
|
114
|
-
|
|
115
|
-
if "error" in sample:
|
|
116
|
-
number_of_samples_per_template_by_status[template_id]["error"] += 1
|
|
117
|
-
continue
|
|
118
|
-
number_of_samples_per_template_by_status[template_id]["success"] += 1
|
|
119
|
-
|
|
120
|
-
update_stats_per_template(sample, stats_per_template, template_id)
|
|
121
|
-
update_steps_summary_per_template(
|
|
122
|
-
sample,
|
|
123
|
-
steps_summary_per_template,
|
|
124
|
-
template_id
|
|
125
|
-
)
|
|
126
|
-
update_step_metrics_per_template(
|
|
127
|
-
sample,
|
|
128
|
-
step_metrics_per_template,
|
|
129
|
-
template_id
|
|
130
|
-
)
|
|
131
|
-
|
|
132
|
-
summary = {"per_template": {}}
|
|
97
|
+
def compute_per_template_stats(
|
|
98
|
+
templates_ids: Collection[str],
|
|
99
|
+
number_of_samples_per_template_by_status: dict[str, dict[str, int]],
|
|
100
|
+
stats_per_template: dict[str, dict[str, Sequence[int]]],
|
|
101
|
+
steps_summary_per_template: dict[str, dict[str, dict[str, int]]],
|
|
102
|
+
step_metrics_per_template: dict[str, dict[str, Sequence[int]]],
|
|
103
|
+
) -> dict[str, dict[str, Any]]:
|
|
104
|
+
summary = {}
|
|
133
105
|
|
|
134
106
|
# Add per-template stats
|
|
135
107
|
for template_id in templates_ids:
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
"
|
|
108
|
+
n_by_status = number_of_samples_per_template_by_status[template_id]
|
|
109
|
+
summary[template_id] = {
|
|
110
|
+
"number_of_error_samples": n_by_status["error"],
|
|
111
|
+
"number_of_success_samples": n_by_status["success"],
|
|
139
112
|
}
|
|
113
|
+
for metric in METRICS:
|
|
114
|
+
series = stats_per_template[template_id].get(metric, [])
|
|
115
|
+
if series or metric in RETAINED_METRICS:
|
|
116
|
+
summary[template_id][metric] = stats_for_series(series)
|
|
140
117
|
steps_summary = {
|
|
141
118
|
k1: {k2: v2 for k2, v2 in v1.items()}
|
|
142
119
|
for k1, v1 in steps_summary_per_template[template_id].items()
|
|
143
120
|
}
|
|
144
121
|
if steps_summary:
|
|
145
|
-
|
|
146
|
-
for metric in METRICS:
|
|
147
|
-
results_for_template = stats_per_template[template_id]
|
|
148
|
-
series = results_for_template.get(metric, [])
|
|
149
|
-
if series or metric in PROTECTED_METRICS:
|
|
150
|
-
template_summary[metric] = stats_for_series(series)
|
|
122
|
+
summary[template_id]["steps"] = steps_summary
|
|
151
123
|
|
|
152
124
|
# Add step metrics for the template
|
|
153
125
|
template_step_metrics = {}
|
|
154
126
|
for metric, values in step_metrics_per_template[template_id].items():
|
|
155
127
|
template_step_metrics[metric] = stats_for_series(values)
|
|
156
128
|
if template_step_metrics:
|
|
157
|
-
|
|
129
|
+
summary[template_id]["steps"].update(template_step_metrics)
|
|
130
|
+
return summary
|
|
158
131
|
|
|
159
|
-
summary["per_template"][template_id] = template_summary
|
|
160
132
|
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
)
|
|
170
|
-
}
|
|
133
|
+
def compute_micro_stats(
|
|
134
|
+
number_of_samples_per_template_by_status,
|
|
135
|
+
stats_per_template,
|
|
136
|
+
step_metrics_per_template
|
|
137
|
+
) -> dict:
|
|
138
|
+
values = number_of_samples_per_template_by_status.values()
|
|
139
|
+
micro_summary = defaultdict(dict, {
|
|
140
|
+
"number_of_error_samples": sum(v["error"] for v in values),
|
|
141
|
+
"number_of_success_samples": sum(v["success"] for v in values)
|
|
142
|
+
})
|
|
171
143
|
for metric in METRICS:
|
|
172
144
|
series = [
|
|
173
145
|
i
|
|
@@ -175,42 +147,76 @@ def compute_aggregates(samples: list[dict]) -> dict:
|
|
|
175
147
|
for i in values[metric]
|
|
176
148
|
if values.get(metric) is not None
|
|
177
149
|
]
|
|
178
|
-
if series or metric in
|
|
179
|
-
|
|
150
|
+
if series or metric in RETAINED_METRICS:
|
|
151
|
+
micro_summary[metric] = stats_for_series(series)
|
|
180
152
|
|
|
181
153
|
# Add micro step metrics
|
|
182
154
|
micro_step_metrics = defaultdict(list)
|
|
183
155
|
for template_metrics in step_metrics_per_template.values():
|
|
184
156
|
for metric, values in template_metrics.items():
|
|
185
157
|
micro_step_metrics[metric].extend(values)
|
|
186
|
-
|
|
187
|
-
metric
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
summary["micro"].update(step_metrics)
|
|
158
|
+
for metric, values in micro_step_metrics.items():
|
|
159
|
+
micro_summary[metric] = stats_for_series(values)
|
|
160
|
+
return dict(micro_summary)
|
|
161
|
+
|
|
191
162
|
|
|
192
|
-
|
|
193
|
-
|
|
163
|
+
def compute_macro_stats(
|
|
164
|
+
summary_per_template: dict[str, dict[str, dict[str, Any]]]
|
|
165
|
+
) -> dict:
|
|
166
|
+
macro_summary = defaultdict(dict)
|
|
194
167
|
for metric in METRICS:
|
|
195
168
|
means = [
|
|
196
169
|
values[metric]["mean"]
|
|
197
|
-
for
|
|
170
|
+
for values in summary_per_template.values()
|
|
198
171
|
if values.get(metric) is not None
|
|
199
172
|
]
|
|
200
|
-
if means or metric in
|
|
201
|
-
|
|
173
|
+
if means or metric in RETAINED_METRICS:
|
|
174
|
+
macro_summary[metric]["mean"] = mean(means or [0])
|
|
202
175
|
|
|
203
176
|
# Add macro step metrics
|
|
204
177
|
macro_step_metrics = defaultdict(list)
|
|
205
|
-
for
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
178
|
+
for template_summary in summary_per_template.values():
|
|
179
|
+
for metric, stats in template_summary.get("steps", {}).items():
|
|
180
|
+
if "mean" in stats:
|
|
181
|
+
macro_step_metrics[metric].append(stats["mean"])
|
|
182
|
+
for metric, values in macro_step_metrics.items():
|
|
183
|
+
macro_summary[metric]["mean"] = mean(values or [0])
|
|
184
|
+
return dict(macro_summary)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def compute_aggregates(samples: list[dict]) -> dict:
|
|
188
|
+
number_of_samples_per_template_by_status = defaultdict(lambda: defaultdict(int))
|
|
189
|
+
stats_per_template = defaultdict(lambda: defaultdict(list))
|
|
190
|
+
steps_summary_per_template = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
|
|
191
|
+
step_metrics_per_template = defaultdict(lambda: defaultdict(list))
|
|
215
192
|
|
|
193
|
+
# Compute per-template stats
|
|
194
|
+
templates_ids = set()
|
|
195
|
+
for sample in samples:
|
|
196
|
+
template_id = sample["template_id"]
|
|
197
|
+
templates_ids.add(template_id)
|
|
198
|
+
|
|
199
|
+
if "error" in sample:
|
|
200
|
+
number_of_samples_per_template_by_status[template_id]["error"] += 1
|
|
201
|
+
continue
|
|
202
|
+
number_of_samples_per_template_by_status[template_id]["success"] += 1
|
|
203
|
+
update_stats(sample, stats_per_template[template_id])
|
|
204
|
+
update_steps_summary(sample, steps_summary_per_template[template_id])
|
|
205
|
+
update_step_metrics(sample, step_metrics_per_template[template_id])
|
|
206
|
+
|
|
207
|
+
summary = {
|
|
208
|
+
"per_template": compute_per_template_stats(
|
|
209
|
+
templates_ids,
|
|
210
|
+
number_of_samples_per_template_by_status,
|
|
211
|
+
stats_per_template,
|
|
212
|
+
steps_summary_per_template,
|
|
213
|
+
step_metrics_per_template,
|
|
214
|
+
),
|
|
215
|
+
"micro": compute_micro_stats(
|
|
216
|
+
number_of_samples_per_template_by_status,
|
|
217
|
+
stats_per_template,
|
|
218
|
+
step_metrics_per_template
|
|
219
|
+
)
|
|
220
|
+
}
|
|
221
|
+
summary["macro"] = compute_macro_stats(summary["per_template"])
|
|
216
222
|
return summary
|
|
@@ -4,6 +4,8 @@ from pathlib import Path
|
|
|
4
4
|
from openai import OpenAI
|
|
5
5
|
from tqdm import tqdm
|
|
6
6
|
|
|
7
|
+
from graphrag_eval.util import compute_f1
|
|
8
|
+
|
|
7
9
|
|
|
8
10
|
IN_FILE_PATH = "../data/data-1.tsv"
|
|
9
11
|
PROMPT_FILE_PATH = Path(__file__).parent / "prompts" / "template.md"
|
|
@@ -21,14 +23,11 @@ def compute_recall_precision_f1(
|
|
|
21
23
|
) -> tuple[float | None, float | None, float | None]:
|
|
22
24
|
recall = None
|
|
23
25
|
precision = None
|
|
24
|
-
f1 = None
|
|
25
26
|
if n_true_pos is not None and n_pos:
|
|
26
27
|
recall = n_true_pos / n_pos
|
|
27
28
|
if n_true_pos is not None and n_pred_pos:
|
|
28
29
|
precision = n_true_pos / n_pred_pos
|
|
29
|
-
|
|
30
|
-
f1 = 2 * (precision * recall) / (precision + recall)
|
|
31
|
-
return recall, precision, f1
|
|
30
|
+
return recall, precision, compute_f1(recall, precision)
|
|
32
31
|
|
|
33
32
|
|
|
34
33
|
def extract_response_values(
|
|
@@ -41,20 +40,20 @@ def extract_response_values(
|
|
|
41
40
|
return None, None, None, "", msg
|
|
42
41
|
vals = vals[:4]
|
|
43
42
|
try:
|
|
44
|
-
n_ref,
|
|
43
|
+
n_ref, n_actual, n_matching = map(int, vals[:3])
|
|
45
44
|
except ValueError:
|
|
46
|
-
msg = f"
|
|
45
|
+
msg = f"Claims counts should be ints: {vals}"
|
|
47
46
|
return None, None, None, vals[3], msg
|
|
48
47
|
if any([
|
|
49
48
|
n_ref < 1,
|
|
50
|
-
|
|
49
|
+
n_actual < 1,
|
|
51
50
|
n_matching < 0,
|
|
52
51
|
n_matching > n_ref,
|
|
53
|
-
n_matching >
|
|
52
|
+
n_matching > n_actual
|
|
54
53
|
]):
|
|
55
|
-
msg = f"Invalid
|
|
54
|
+
msg = f"Invalid claims counts combination: {n_ref}\t{n_actual}\t{n_matching}"
|
|
56
55
|
return None, None, None, vals[3], msg
|
|
57
|
-
return n_ref,
|
|
56
|
+
return n_ref, n_actual, n_matching, vals[3], ""
|
|
58
57
|
|
|
59
58
|
|
|
60
59
|
class AnswerCorrectnessEvaluator:
|
|
@@ -96,7 +95,7 @@ class AnswerCorrectnessEvaluator:
|
|
|
96
95
|
def get_correctness_dict(
|
|
97
96
|
self,
|
|
98
97
|
reference: dict,
|
|
99
|
-
|
|
98
|
+
actual: dict,
|
|
100
99
|
):
|
|
101
100
|
result = {}
|
|
102
101
|
result["reference_answer"] = reference["reference_answer"]
|
|
@@ -104,7 +103,7 @@ class AnswerCorrectnessEvaluator:
|
|
|
104
103
|
self.evaluate_answer(
|
|
105
104
|
reference["question_text"],
|
|
106
105
|
reference["reference_answer"],
|
|
107
|
-
|
|
106
|
+
actual["actual_answer"],
|
|
108
107
|
)
|
|
109
108
|
if error:
|
|
110
109
|
result["answer_eval_error"] = error
|
|
File without changes
|
|
@@ -1,41 +1,49 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from collections import defaultdict
|
|
3
|
+
from typing import Any
|
|
4
|
+
from collections.abc import Sequence
|
|
3
5
|
|
|
4
6
|
from .retrieval_context_ids import recall_at_k
|
|
5
7
|
from .sparql import compare_sparql_results
|
|
6
8
|
|
|
7
9
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
Match = tuple[int, int, int, float]
|
|
11
|
+
Step = dict[str, Any]
|
|
12
|
+
StepsGroup = Sequence[Step] # We will index into a group
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def compare_steps_outputs(reference_step: Step, actual_step: Step) -> float:
|
|
16
|
+
reference_output = reference_step.get("output")
|
|
17
|
+
actual_output = actual_step["output"]
|
|
18
|
+
assert reference_output, "Reference step output is mandatory"
|
|
19
|
+
reference_output_media_type = reference_step.get("output_media_type")
|
|
20
|
+
if reference_output_media_type == "application/sparql-results+json":
|
|
13
21
|
return compare_sparql_results(
|
|
14
|
-
json.loads(
|
|
15
|
-
json.loads(
|
|
16
|
-
|
|
17
|
-
|
|
22
|
+
json.loads(reference_output),
|
|
23
|
+
json.loads(actual_output),
|
|
24
|
+
reference_step["required_columns"],
|
|
25
|
+
reference_step.get("ordered", False),
|
|
18
26
|
)
|
|
19
|
-
if
|
|
20
|
-
return float(json.loads(
|
|
21
|
-
if
|
|
22
|
-
ref_contexts_ids = [c["id"] for c in json.loads(
|
|
23
|
-
act_contexts_ids = [c["id"] for c in json.loads(
|
|
24
|
-
k =
|
|
27
|
+
if reference_step.get("output_media_type") == "application/json":
|
|
28
|
+
return float(json.loads(reference_output) == json.loads(actual_output))
|
|
29
|
+
if reference_step["name"] == actual_step["name"] == "retrieval":
|
|
30
|
+
ref_contexts_ids = [c["id"] for c in json.loads(reference_output)]
|
|
31
|
+
act_contexts_ids = [c["id"] for c in json.loads(actual_output)]
|
|
32
|
+
k = actual_step["args"]["k"]
|
|
25
33
|
return recall_at_k(ref_contexts_ids, act_contexts_ids, k)
|
|
26
|
-
return float(
|
|
34
|
+
return float(reference_output == actual_output)
|
|
27
35
|
|
|
28
36
|
|
|
29
37
|
def match_group_by_output(
|
|
30
|
-
|
|
38
|
+
reference_groups: Sequence[StepsGroup],
|
|
31
39
|
group_idx: int,
|
|
32
|
-
actual_steps:
|
|
40
|
+
actual_steps: Sequence[Step],
|
|
33
41
|
candidates_by_name: dict[str, list[int]],
|
|
34
|
-
) -> list[
|
|
42
|
+
) -> list[Match]:
|
|
35
43
|
used_actual_indices = set()
|
|
36
44
|
matches = []
|
|
37
45
|
|
|
38
|
-
reference_group =
|
|
46
|
+
reference_group = reference_groups[group_idx]
|
|
39
47
|
for reference_idx, reference_step in enumerate(reference_group):
|
|
40
48
|
name = reference_step["name"]
|
|
41
49
|
candidates = reversed(candidates_by_name.get(name, []))
|
|
@@ -52,8 +60,8 @@ def match_group_by_output(
|
|
|
52
60
|
|
|
53
61
|
|
|
54
62
|
def collect_possible_matches_by_name_and_status(
|
|
55
|
-
group:
|
|
56
|
-
actual_steps:
|
|
63
|
+
group: StepsGroup,
|
|
64
|
+
actual_steps: Sequence[Step],
|
|
57
65
|
search_upto: int,
|
|
58
66
|
) -> dict[str, list[int]]:
|
|
59
67
|
group_by_name = defaultdict(list)
|
|
@@ -63,14 +71,14 @@ def collect_possible_matches_by_name_and_status(
|
|
|
63
71
|
if actual_steps[j]["status"] == "success":
|
|
64
72
|
group_by_name[name].append(j)
|
|
65
73
|
|
|
66
|
-
reference_names = {
|
|
74
|
+
reference_names = {step["name"] for step in group}
|
|
67
75
|
return {name: group_by_name[name] for name in reference_names if name in group_by_name}
|
|
68
76
|
|
|
69
77
|
|
|
70
78
|
def get_steps_matches(
|
|
71
|
-
|
|
72
|
-
actual_steps:
|
|
73
|
-
) -> list[
|
|
79
|
+
reference_groups: Sequence[StepsGroup],
|
|
80
|
+
actual_steps: Sequence[Step],
|
|
81
|
+
) -> list[Match]:
|
|
74
82
|
# when we have autocomplete
|
|
75
83
|
# matches = []
|
|
76
84
|
# search_upto = len(actual_steps)
|
|
@@ -91,57 +99,59 @@ def get_steps_matches(
|
|
|
91
99
|
# return matches
|
|
92
100
|
|
|
93
101
|
# for now, we have only the last step(s)
|
|
94
|
-
last_group =
|
|
95
|
-
candidates = collect_possible_matches_by_name_and_status(
|
|
96
|
-
|
|
102
|
+
last_group = reference_groups[-1]
|
|
103
|
+
candidates = collect_possible_matches_by_name_and_status(
|
|
104
|
+
last_group,
|
|
105
|
+
actual_steps,
|
|
106
|
+
len(actual_steps)
|
|
107
|
+
)
|
|
108
|
+
return match_group_by_output(reference_groups, -1, actual_steps, candidates)
|
|
97
109
|
|
|
98
110
|
|
|
99
111
|
def evaluate_steps(
|
|
100
|
-
reference_steps_groups:
|
|
101
|
-
actual_steps:
|
|
102
|
-
matches:
|
|
112
|
+
reference_steps_groups: Sequence[StepsGroup],
|
|
113
|
+
actual_steps: Sequence[Step],
|
|
114
|
+
matches: Sequence[Match] | None = None
|
|
103
115
|
) -> float:
|
|
104
116
|
if matches is None:
|
|
105
117
|
matches = get_steps_matches(reference_steps_groups, actual_steps)
|
|
106
|
-
matches_by_group = defaultdict(list)
|
|
107
118
|
scores_by_group = defaultdict(float)
|
|
108
119
|
for ref_group_idx, ref_match_idx, actual_idx, score in matches:
|
|
109
|
-
matches_by_group[ref_group_idx].append(ref_match_idx)
|
|
110
120
|
scores_by_group[ref_group_idx] += score
|
|
111
121
|
reference_steps_groups[ref_group_idx][ref_match_idx]["matches"] \
|
|
112
122
|
= actual_steps[actual_idx]["id"]
|
|
113
|
-
|
|
114
|
-
return scores_by_group[
|
|
123
|
+
group_idx = -1 # For now, consider only the last reference group of steps
|
|
124
|
+
return scores_by_group[group_idx] / len(reference_steps_groups[group_idx])
|
|
115
125
|
|
|
116
126
|
|
|
117
|
-
def get_steps_evaluation_result_dict(reference: dict,
|
|
127
|
+
def get_steps_evaluation_result_dict(reference: dict, actual: dict) -> dict:
|
|
118
128
|
eval_result = {}
|
|
119
|
-
|
|
120
|
-
eval_result["actual_steps"] =
|
|
121
|
-
for
|
|
122
|
-
if
|
|
129
|
+
actual_steps = actual.get("actual_steps", [])
|
|
130
|
+
eval_result["actual_steps"] = actual_steps
|
|
131
|
+
for actual_step in actual_steps:
|
|
132
|
+
if actual_step["name"] == "retrieval":
|
|
123
133
|
from .retrieval_answer import get_retrieval_evaluation_dict
|
|
124
134
|
result = get_retrieval_evaluation_dict(
|
|
125
135
|
question_text=reference["question_text"],
|
|
126
136
|
reference_answer=reference.get("reference_answer"),
|
|
127
|
-
actual_answer=
|
|
128
|
-
actual_contexts=json.loads(
|
|
137
|
+
actual_answer=actual.get("actual_answer"),
|
|
138
|
+
actual_contexts=json.loads(actual_step["output"])
|
|
129
139
|
)
|
|
130
|
-
|
|
140
|
+
actual_step.update(result)
|
|
131
141
|
if "reference_steps" in reference:
|
|
132
|
-
|
|
133
|
-
matches = get_steps_matches(
|
|
134
|
-
steps_score
|
|
135
|
-
|
|
142
|
+
reference_steps = reference["reference_steps"]
|
|
143
|
+
matches = get_steps_matches(reference_steps, actual_steps)
|
|
144
|
+
eval_result["steps_score"] \
|
|
145
|
+
= evaluate_steps(reference_steps, actual_steps, matches)
|
|
136
146
|
for ref_group_idx, ref_match_idx, act_idx, _ in matches:
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
if
|
|
147
|
+
reference_step = reference_steps[ref_group_idx][ref_match_idx]
|
|
148
|
+
actual_step = actual_steps[act_idx]
|
|
149
|
+
if reference_step["name"] == "retrieval":
|
|
140
150
|
from .retrieval_context_texts import \
|
|
141
151
|
get_retrieval_evaluation_dict
|
|
142
152
|
res = get_retrieval_evaluation_dict(
|
|
143
|
-
reference_contexts=json.loads(
|
|
144
|
-
actual_contexts=json.loads(
|
|
153
|
+
reference_contexts=json.loads(reference_step["output"]),
|
|
154
|
+
actual_contexts=json.loads(actual_step["output"])
|
|
145
155
|
)
|
|
146
|
-
|
|
156
|
+
actual_step.update(res)
|
|
147
157
|
return eval_result
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "graphrag-eval"
|
|
3
|
-
version = "5.1.
|
|
3
|
+
version = "5.1.2"
|
|
4
4
|
description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|