scorebook 0.0.13-py3-none-any.whl → 0.0.15-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +12 -5
- scorebook/cli/auth.py +1 -1
- scorebook/dashboard/__init__.py +1 -0
- scorebook/dashboard/create_project.py +91 -0
- scorebook/{trismik → dashboard}/credentials.py +57 -12
- scorebook/{trismik → dashboard}/upload_results.py +1 -1
- scorebook/eval_datasets/__init__.py +0 -4
- scorebook/eval_datasets/eval_dataset.py +4 -2
- scorebook/evaluate/__init__.py +1 -15
- scorebook/evaluate/_async/evaluate_async.py +36 -19
- scorebook/evaluate/_sync/evaluate.py +36 -19
- scorebook/evaluate/evaluate_helpers.py +4 -3
- scorebook/inference/__init__.py +1 -11
- scorebook/inference/clients/__init__.py +1 -8
- scorebook/inference/inference_pipeline.py +1 -1
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +7 -16
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/__init__.py +0 -5
- scorebook/score/_async/score_async.py +3 -2
- scorebook/score/_sync/score.py +3 -2
- scorebook/score/score_helpers.py +29 -12
- scorebook/types.py +3 -3
- scorebook/utils/__init__.py +0 -22
- scorebook/utils/common_helpers.py +1 -1
- scorebook/utils/mock_llm/__init__.py +41 -0
- scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
- scorebook/utils/progress_bars.py +58 -786
- scorebook-0.0.15.dist-info/METADATA +300 -0
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -105
- scorebook/trismik/__init__.py +0 -10
- scorebook-0.0.13.dist-info/METADATA +0 -389
- scorebook-0.0.13.dist-info/RECORD +0 -50
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
scorebook/metrics/f1.py ADDED

@@ -0,0 +1,96 @@
+"""F1 metric implementation for Scorebook."""
+
+from typing import Any, Dict, List, Tuple, Union
+
+from sklearn.metrics import f1_score
+
+from scorebook.metrics import MetricBase, scorebook_metric
+
+
+@scorebook_metric
+class F1(MetricBase):
+    """F1 score metric for evaluating model predictions using scikit-learn.
+
+    F1 = 2 * (Precision * Recall) / (Precision + Recall)
+    where:
+    - Precision = TP / (TP + FP)
+    - Recall = TP / (TP + FN)
+
+    This metric can handle both binary and multi-class classification tasks.
+
+    Args:
+        average: The averaging method(s) for multi-class classification.
+            Can be a single string or list of strings:
+            - 'macro': Unweighted mean across labels
+            - 'micro': Global calculation counting total TP, FP, FN
+            - 'weighted': Weighted mean by support
+            - 'all': All three methods simultaneously
+            - List of methods: Calculate multiple methods
+            Defaults to 'macro'.
+    """
+
+    def __init__(self, average: Union[str, List[str]] = "macro", **kwargs: Any) -> None:
+        """Initialize F1 metric with specified averaging method(s).
+
+        Args:
+            average: Averaging method(s) - string or list of strings.
+                Options: 'macro', 'micro', 'weighted', 'all', or a list of methods.
+                Defaults to 'macro'.
+            **kwargs: Additional keyword arguments passed to scikit-learn's f1_score function.
+
+        Raises:
+            ValueError: If average contains invalid methods or combines 'all' with others.
+        """
+        # Normalize to list for validation
+        averages = [average] if isinstance(average, str) else average
+
+        # Validate
+        valid = {"macro", "micro", "weighted", "all"}
+        if not all(a in valid for a in averages):
+            raise ValueError(f"Invalid average method(s). Must be from {valid}.")
+        if len(averages) > 1 and "all" in averages:
+            raise ValueError("'all' cannot be combined with other methods.")
+
+        self.average = average
+        self.kwargs = kwargs
+
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate F1 score between predictions and references using scikit-learn.
+
+        Args:
+            outputs: A list of inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            Tuple containing:
+                - aggregate_scores (Dict[str, float]): Dictionary with F1 scores
+                  keyed by averaging method (e.g., {"F1 (macro)": 0.85} or
+                  {"F1 (macro)": 0.85, "F1 (micro)": 0.82}).
+                - item_scores (List[bool]): True/False list indicating correct
+                  predictions.
+
+        """
+        # Normalize to list of methods to calculate
+        if isinstance(self.average, str):
+            methods = ["macro", "micro", "weighted"] if self.average == "all" else [self.average]
+        else:
+            methods = self.average
+
+        # Handle empty lists
+        if not outputs:
+            return {f"F1 ({method})": 0.0 for method in methods}, []
+
+        # Calculate F1 score using scikit-learn with configured averaging method
+        # Default zero_division=0 unless overridden in kwargs
+        kwargs = {"zero_division": 0, **self.kwargs}
+
+        # Calculate item scores (correctness of each prediction)
+        item_scores = [output == label for output, label in zip(outputs, labels)]
+
+        # Calculate F1 for each method
+        aggregate_scores = {
+            f"F1 ({method})": f1_score(labels, outputs, average=method, **kwargs)
+            for method in methods
+        }
+
+        return aggregate_scores, item_scores
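The new F1 class (and the analogous Precision and Recall metrics in this release) validates the `average` option(s), records per-item correctness, and delegates the aggregate score to scikit-learn. A minimal sketch of the equivalent computation, calling scikit-learn directly; the example outputs and labels below are invented for illustration:

```python
# Sketch of what F1.score computes, mirrored with scikit-learn directly.
# The outputs/labels below are invented example data.
from sklearn.metrics import f1_score

outputs = ["cat", "dog", "cat", "bird"]  # model predictions
labels = ["cat", "dog", "dog", "bird"]   # ground truth

aggregate_scores = {
    f"F1 ({method})": f1_score(labels, outputs, average=method, zero_division=0)
    for method in ("macro", "micro", "weighted")  # i.e. average="all"
}
item_scores = [o == l for o, l in zip(outputs, labels)]  # per-item correctness

print(aggregate_scores)
print(item_scores)  # [True, True, False, True]
```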
scorebook/metrics/precision.py CHANGED

@@ -1,19 +1,94 @@
 """Precision metric implementation for Scorebook."""
 
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Tuple, Union
 
-from
-from scorebook.metrics.metric_registry import MetricRegistry
+from sklearn.metrics import precision_score
 
+from scorebook.metrics import MetricBase, scorebook_metric
 
-
+
+@scorebook_metric
 class Precision(MetricBase):
-    """Precision metric for
+    """Precision score metric for evaluating model predictions using scikit-learn.
 
     Precision = TP / (TP + FP)
+
+    This metric can handle both binary and multi-class classification tasks.
+
+    Args:
+        average: The averaging method(s) for multi-class classification.
+            Can be a single string or list of strings:
+            - 'macro': Unweighted mean across labels
+            - 'micro': Global calculation counting total TP, FP
+            - 'weighted': Weighted mean by support
+            - 'all': All three methods simultaneously
+            - List of methods: Calculate multiple methods
+            Defaults to 'macro'.
     """
 
-
-
-
-
+    def __init__(self, average: Union[str, List[str]] = "macro", **kwargs: Any) -> None:
+        """Initialize Precision metric with specified averaging method(s).
+
+        Args:
+            average: Averaging method(s) - string or list of strings.
+                Options: 'macro', 'micro', 'weighted', 'all', or a list of methods.
+                Defaults to 'macro'.
+            **kwargs: Additional keyword arguments passed to sklearn's precision_score.
+
+        Raises:
+            ValueError: If average contains invalid methods or combines 'all' with others.
+        """
+        # Normalize to list for validation
+        averages = [average] if isinstance(average, str) else average
+
+        # Validate
+        valid = {"macro", "micro", "weighted", "all"}
+        if not all(a in valid for a in averages):
+            raise ValueError(f"Invalid average method(s). Must be from {valid}.")
+        if len(averages) > 1 and "all" in averages:
+            raise ValueError("'all' cannot be combined with other methods.")
+
+        self.average = average
+        self.kwargs = kwargs
+
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate Precision score between predictions and references using scikit-learn.
+
+        Args:
+            outputs: A list of inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            Tuple containing:
+                - aggregate_scores (Dict[str, float]): Dictionary with Precision scores
+                  keyed by averaging method (e.g., {"Precision (macro)": 0.85} or
+                  {"Precision (macro)": 0.85, "Precision (micro)": 0.82}).
+                - item_scores (List[bool]): True/False list indicating correct
+                  predictions.
+
+        """
+
+        # Normalize to list of methods to calculate
+        if isinstance(self.average, str):
+            methods = ["macro", "micro", "weighted"] if self.average == "all" else [self.average]
+        else:
+            methods = self.average
+
+        # Handle empty lists
+        if not outputs:
+            return {f"Precision ({method})": 0.0 for method in methods}, []
+
+        # Calculate Precision score using scikit-learn with configured averaging method
+        # Default zero_division=0 unless overridden in kwargs
+        kwargs = {"zero_division": 0, **self.kwargs}
+
+        # Calculate item scores (correctness of each prediction)
+        item_scores = [output == label for output, label in zip(outputs, labels)]
+
+        # Calculate Precision for each method
+        aggregate_scores = {
+            f"Precision ({method})": precision_score(labels, outputs, average=method, **kwargs)
+            for method in methods
+        }
+
+        return aggregate_scores, item_scores
scorebook/metrics/recall.py ADDED

@@ -0,0 +1,94 @@
+"""Recall metric implementation for Scorebook."""
+
+from typing import Any, Dict, List, Tuple, Union
+
+from sklearn.metrics import recall_score
+
+from scorebook.metrics import MetricBase, scorebook_metric
+
+
+@scorebook_metric
+class Recall(MetricBase):
+    """Recall score metric for evaluating model predictions using scikit-learn.
+
+    Recall = TP / (TP + FN)
+
+    This metric can handle both binary and multi-class classification tasks.
+
+    Args:
+        average: The averaging method(s) for multi-class classification.
+            Can be a single string or list of strings:
+            - 'macro': Unweighted mean across labels
+            - 'micro': Global calculation counting total TP, FP, FN
+            - 'weighted': Weighted mean by support
+            - 'all': All three methods simultaneously
+            - List of methods: Calculate multiple methods
+            Defaults to 'macro'.
+    """
+
+    def __init__(self, average: Union[str, List[str]] = "macro", **kwargs: Any) -> None:
+        """Initialize Recall metric with specified averaging method(s).
+
+        Args:
+            average: Averaging method(s) - string or list of strings.
+                Options: 'macro', 'micro', 'weighted', 'all', or a list of methods.
+                Defaults to 'macro'.
+            **kwargs: Additional keyword arguments passed to sklearn's recall_score.
+
+        Raises:
+            ValueError: If average contains invalid methods or combines 'all' with others.
+        """
+        # Normalize to list for validation
+        averages = [average] if isinstance(average, str) else average
+
+        # Validate
+        valid = {"macro", "micro", "weighted", "all"}
+        if not all(a in valid for a in averages):
+            raise ValueError(f"Invalid average method(s). Must be from {valid}.")
+        if len(averages) > 1 and "all" in averages:
+            raise ValueError("'all' cannot be combined with other methods.")
+
+        self.average = average
+        self.kwargs = kwargs
+
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate Recall score between predictions and references using scikit-learn.
+
+        Args:
+            outputs: A list of inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            Tuple containing:
+                - aggregate_scores (Dict[str, float]): Dictionary with Recall scores
+                  keyed by averaging method (e.g., {"Recall (macro)": 0.85} or
+                  {"Recall (macro)": 0.85, "Recall (micro)": 0.82}).
+                - item_scores (List[bool]): True/False list indicating correct
+                  predictions.
+
+        """
+
+        # Normalize to list of methods to calculate
+        if isinstance(self.average, str):
+            methods = ["macro", "micro", "weighted"] if self.average == "all" else [self.average]
+        else:
+            methods = self.average
+
+        # Handle empty lists
+        if not outputs:
+            return {f"Recall ({method})": 0.0 for method in methods}, []
+
+        # Calculate Recall score using scikit-learn with configured averaging method
+        # Default zero_division=0 unless overridden in kwargs
+        kwargs = {"zero_division": 0, **self.kwargs}
+
+        # Calculate item scores (correctness of each prediction)
+        item_scores = [output == label for output, label in zip(outputs, labels)]
+
+        # Calculate Recall for each method
+        aggregate_scores = {
+            f"Recall ({method})": recall_score(labels, outputs, average=method, **kwargs)
+            for method in methods
+        }
+
+        return aggregate_scores, item_scores
scorebook/metrics/rouge.py ADDED

@@ -0,0 +1,85 @@
+"""ROUGE metric implementation for Scorebook."""
+
+import warnings
+from typing import Any, Dict, List, Optional, Tuple
+
+from rouge_score import rouge_scorer
+
+from scorebook.metrics import MetricBase, scorebook_metric
+
+
+@scorebook_metric
+class ROUGE(MetricBase):
+    """ROUGE metric for evaluating text generation quality.
+
+    ROUGE (Recall-Oriented Understudy for Gisting Evaluation) measures
+    the overlap between generated text and reference text.
+    Returns ROUGE-1 and ROUGE-L F1 scores.
+    """
+
+    def __init__(self, rouge_types: Optional[List[str]] = None, **kwargs: Any) -> None:
+        """Initialize the ROUGE metric.
+
+        Args:
+            rouge_types: List of ROUGE types to calculate (e.g., ["rouge1", "rouge2", "rougeL"]).
+                Defaults to ["rouge1", "rougeL"].
+            **kwargs: Additional keyword arguments to pass to RougeScorer
+                (e.g., use_stemmer, split_summaries, tokenizer).
+                Defaults to use_stemmer=True if not provided.
+        """
+        if rouge_types is None:
+            warnings.warn(
+                "No rouge_types specified, defaulting to ['rouge1', 'rougeL']",
+                UserWarning,
+                stacklevel=2,
+            )
+            rouge_types = ["rouge1", "rougeL"]
+        if "use_stemmer" not in kwargs:
+            warnings.warn(
+                "use_stemmer not specified, defaulting to True",
+                UserWarning,
+                stacklevel=2,
+            )
+            kwargs["use_stemmer"] = True
+        self.rouge_types = rouge_types
+        self.scorer = rouge_scorer.RougeScorer(rouge_types, **kwargs)
+
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate ROUGE scores between predictions and references.
+
+        Args:
+            outputs: A list of generated text outputs.
+            labels: A list of reference text labels.
+
+        Returns:
+            A tuple containing:
+                - aggregate_scores: Dict with average F1 scores for each configured ROUGE type
+                - item_scores: List of dicts with F1 scores for each configured ROUGE type
+        """
+
+        if not outputs:  # Handle empty lists
+            return {rouge_type: 0.0 for rouge_type in self.rouge_types}, []
+
+        # Calculate item scores
+        item_scores = []
+        for output, label in zip(outputs, labels):
+            # Convert to strings if needed
+            output_str = str(output) if output is not None else ""
+            label_str = str(label) if label is not None else ""
+
+            # Calculate ROUGE scores
+            scores = self.scorer.score(output_str, label_str)
+
+            # Extract F1 scores (fmeasure) for all configured rouge types
+            item_score = {
+                rouge_type: scores[rouge_type].fmeasure for rouge_type in self.rouge_types
+            }
+            item_scores.append(item_score)
+
+        # Calculate aggregate scores (average of all items for each rouge type)
+        aggregate_scores = {
+            rouge_type: sum(item[rouge_type] for item in item_scores) / len(item_scores)
+            for rouge_type in self.rouge_types
+        }
+
+        return aggregate_scores, item_scores
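The ROUGE class wraps the `rouge_score` package and reports per-item F1 (fmeasure) values for each configured ROUGE type. A minimal sketch of the per-item computation it performs, using `rouge_score` directly; the example strings are invented, and the arguments to `RougeScorer.score` are passed in the same order the class above uses:

```python
# Sketch of the per-item ROUGE computation wrapped by the class above, using
# the rouge_score package directly. The example strings are invented.
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)

output = "the cat sat on the mat"       # generated text
label = "a cat was sitting on the mat"  # reference text

scores = scorer.score(output, label)  # same argument order as ROUGE.score above
item_score = {t: scores[t].fmeasure for t in ("rouge1", "rougeL")}
print(item_score)  # {'rouge1': ..., 'rougeL': ...}
```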
scorebook/score/_async/score_async.py CHANGED
@@ -1,6 +1,7 @@
 import logging
 from typing import Any, Dict, List, Literal, Optional, Union, cast
 
+from scorebook.dashboard.upload_results import upload_result_async
 from scorebook.exceptions import DataMismatchError, ParameterValidationError
 from scorebook.score.score_helpers import (
     calculate_metric_scores_async,
@@ -8,9 +9,9 @@ from scorebook.score.score_helpers import (
     resolve_metrics,
     validate_items,
 )
-from scorebook.trismik.upload_results import upload_result_async
 from scorebook.types import Metrics
-from scorebook.utils import resolve_show_progress, resolve_upload_results
+from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
+from scorebook.utils.progress_bars import scoring_progress_context
 
 logger = logging.getLogger(__name__)
 
scorebook/score/_sync/score.py CHANGED

@@ -1,6 +1,7 @@
 import logging
 from typing import Any, Dict, List, Literal, Optional, Union, cast
 
+from scorebook.dashboard.upload_results import upload_result
 from scorebook.exceptions import DataMismatchError, ParameterValidationError
 from scorebook.score.score_helpers import (
     calculate_metric_scores,
@@ -8,9 +9,9 @@ from scorebook.score.score_helpers import (
     resolve_metrics,
     validate_items,
 )
-from scorebook.trismik.upload_results import upload_result
 from scorebook.types import Metrics
-from scorebook.utils import resolve_show_progress, resolve_upload_results
+from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
+from scorebook.utils.progress_bars import scoring_progress_context
 
 logger = logging.getLogger(__name__)
 
scorebook/score/score_helpers.py CHANGED

@@ -4,10 +4,10 @@ import logging
 from typing import Any, Dict, List, Mapping, Optional, Type, Union
 
 from scorebook.exceptions import DataMismatchError, ParameterValidationError
-from scorebook.metrics.metric_base import MetricBase
-from scorebook.metrics.metric_registry import MetricRegistry
+from scorebook.metrics.core.metric_base import MetricBase
+from scorebook.metrics.core.metric_registry import MetricRegistry
 from scorebook.types import MetricScore
-from scorebook.utils import is_awaitable
+from scorebook.utils.async_utils import is_awaitable
 
 logger = logging.getLogger(__name__)
 
@@ -89,7 +89,7 @@ async def calculate_metric_scores_async(
     for metric in metrics:
 
         if progress_bar is not None:
-            progress_bar.
+            progress_bar.set_postfix(metric=metric.name)
 
         if is_awaitable(metric.score):
             aggregate_scores, item_scores = await metric.score(outputs, labels)
@@ -134,7 +134,7 @@ def calculate_metric_scores(
     for metric in metrics:
 
         if progress_bar is not None:
-            progress_bar.
+            progress_bar.set_postfix(metric=metric.name)
 
         if is_awaitable(metric.score):
             raise ParameterValidationError(
@@ -164,18 +164,27 @@ def format_results(
     hyperparameters = hyperparameters or {}
     dataset_name = dataset_name or "scored_items"
 
+    # Detect key collisions across all metrics (for both aggregate and item scores)
+    all_keys: Dict[str, set] = {}
+    for metric_score in metric_scores:
+        for key in metric_score.aggregate_scores.keys():
+            all_keys.setdefault(key, set()).add(metric_score.metric_name)
+        # Also check item_scores keys if they are dicts
+        if metric_score.item_scores and isinstance(metric_score.item_scores[0], dict):
+            for key in metric_score.item_scores[0].keys():
+                all_keys.setdefault(key, set()).add(metric_score.metric_name)
+    colliding_keys = {k for k, metrics in all_keys.items() if len(metrics) > 1}
+
     # Build aggregate results
-    aggregate_result = {
+    aggregate_result: Dict[str, Any] = {
         "dataset": dataset_name,
         **hyperparameters,
     }
 
-    # Add aggregate scores from metrics
+    # Add aggregate scores from metrics (flat, with suffix on collision)
     for metric_score in metric_scores:
         for key, value in metric_score.aggregate_scores.items():
-            score_key = (
-                key if key == metric_score.metric_name else f"{metric_score.metric_name}_{key}"
-            )
+            score_key = f"{key}_{metric_score.metric_name}" if key in colliding_keys else key
             aggregate_result[score_key] = value
 
     # Build item results
@@ -193,10 +202,18 @@
         if inputs is not None and inputs[idx] is not None:
             item_result["input"] = inputs[idx]
 
-        # Add item-level metric scores
+        # Add item-level metric scores (flat, with suffix on collision)
         for metric_score in metric_scores:
             if idx < len(metric_score.item_scores):
-
+                item_scores = metric_score.item_scores[idx]
+                if isinstance(item_scores, dict):
+                    for key, value in item_scores.items():
+                        score_key = (
+                            f"{key}_{metric_score.metric_name}" if key in colliding_keys else key
+                        )
+                        item_result[score_key] = value
+                else:
+                    item_result[metric_score.metric_name] = item_scores
 
         item_results.append(item_result)
 
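The new collision handling in `format_results` keeps metric keys flat and unprefixed unless two metrics report the same key, in which case the metric name is appended as a suffix. A standalone sketch of that renaming rule; the metric names and score keys below are invented for illustration:

```python
# Standalone sketch of the collision-suffix rule added to format_results above.
# The metric names and score keys are invented for illustration.
from typing import Dict, Set

reported = {
    "rouge": {"rouge1": 0.41, "rougeL": 0.38},
    "bleu": {"bleu": 0.22},
    "custom": {"rouge1": 0.50},  # collides with the rouge metric's "rouge1" key
}

# Collect which metrics report each key, then flag keys reported by more than one.
all_keys: Dict[str, Set[str]] = {}
for metric_name, scores in reported.items():
    for key in scores:
        all_keys.setdefault(key, set()).add(metric_name)
colliding_keys = {k for k, names in all_keys.items() if len(names) > 1}

# Flatten: colliding keys get a "_<metric_name>" suffix, unique keys stay as-is.
flat = {}
for metric_name, scores in reported.items():
    for key, value in scores.items():
        flat[f"{key}_{metric_name}" if key in colliding_keys else key] = value

print(flat)
# {'rouge1_rouge': 0.41, 'rougeL': 0.38, 'bleu': 0.22, 'rouge1_custom': 0.5}
```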
scorebook/types.py CHANGED

@@ -3,12 +3,12 @@
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Sequence, Type, Union
 
-from scorebook.eval_datasets import EvalDataset
-from scorebook.metrics.metric_base import MetricBase
+from scorebook.eval_datasets.eval_dataset import EvalDataset
+from scorebook.metrics.core.metric_base import MetricBase
 
 # Type alias for metrics parameter
 Metrics = Union[
-    str,
+    str, MetricBase, Type[MetricBase], Sequence[Union[str, MetricBase, Type[MetricBase]]]
 ]
 
 
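The widened `Metrics` alias accepts a metric by name, by class, by instance, or as a mixed sequence. A small sketch of the accepted forms, assuming scorebook 0.0.15 is installed, that the registry name "accuracy" exists, and that the `@scorebook_metric` decorator returns the decorated class unchanged:

```python
# Sketch of values matching the widened Metrics alias above. The metric classes
# and the "accuracy" registry name are examples; this assumes scorebook 0.0.15
# is installed and that @scorebook_metric returns the decorated class unchanged.
from scorebook.metrics.f1 import F1
from scorebook.metrics.rouge import ROUGE

metrics_by_name = "accuracy"                                   # str
metrics_by_class = F1                                          # Type[MetricBase]
metrics_by_instance = ROUGE(rouge_types=["rouge1", "rougeL"])  # MetricBase
metrics_mixed = ["accuracy", F1, metrics_by_instance]          # Sequence[...]
```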
scorebook/utils/__init__.py CHANGED

@@ -1,23 +1 @@
 """Utility functions and common helpers for the Scorebook framework."""
-
-from contextlib import nullcontext
-
-from scorebook.utils.async_utils import async_nullcontext, is_awaitable
-from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
-from scorebook.utils.io_helpers import validate_path
-from scorebook.utils.progress_bars import evaluation_progress_context, scoring_progress_context
-from scorebook.utils.render_template import render_template
-from scorebook.utils.transform_helpers import expand_dict
-
-__all__ = [
-    "async_nullcontext",
-    "nullcontext",
-    "is_awaitable",
-    "resolve_show_progress",
-    "resolve_upload_results",
-    "validate_path",
-    "expand_dict",
-    "evaluation_progress_context",
-    "scoring_progress_context",
-    "render_template",
-]
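With these re-exports removed, downstream code imports the helpers from their submodules directly, matching the updated scorebook imports shown elsewhere in this diff:

```python
# Import paths that replace the removed scorebook.utils re-exports, as used by
# the updated modules in this diff (requires scorebook 0.0.15).
from scorebook.utils.async_utils import is_awaitable
from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
from scorebook.utils.progress_bars import scoring_progress_context
```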
scorebook/utils/common_helpers.py CHANGED

@@ -17,7 +17,7 @@ def resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool
         bool: Whether to upload results to Trismik
     """
     if upload_results == "auto":
-        from scorebook.
+        from scorebook.dashboard.credentials import get_token
 
         upload_results = get_token() is not None
         logger.debug("Auto upload results resolved to: %s", upload_results)
scorebook/utils/mock_llm/__init__.py ADDED

@@ -0,0 +1,41 @@
+"""Mock LLM utilities for testing and demonstrations."""
+
+import json
+import random
+from pathlib import Path
+from typing import Any, List
+
+# Load the mock data once at module initialization
+_DATA_PATH = Path(__file__).parent / "data" / "mock_llm_data.json"
+with open(_DATA_PATH, "r", encoding="utf-8") as f:
+    _MOCK_DATA = json.load(f)
+
+
+def mock_llm(inputs: List[Any], **hyperparameters: Any) -> List[str]:
+    """Mock LLM that returns answers based on pre-recorded accuracy data."""
+
+    results = []
+    all_choices = ["A", "B", "C", "D", "E"]
+
+    for item in inputs:
+        item_id = item["id"]
+
+        # Look up the item in our mock data
+        if item_id not in _MOCK_DATA:
+            # If item not found, return random answer
+            results.append(random.choice(all_choices))
+            continue
+
+        item_data = _MOCK_DATA[item_id]
+        correct_answer = item_data["answer"]
+        was_accurate = item_data["accuracy"]
+
+        if was_accurate:
+            # Return the correct answer
+            results.append(correct_answer)
+        else:
+            # Return a random incorrect answer
+            incorrect_choices = [choice for choice in all_choices if choice != correct_answer]
+            results.append(random.choice(incorrect_choices))
+
+    return results
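A short usage sketch of the new mock LLM helper, assuming scorebook 0.0.15 is installed; the item ids below are invented, so they hit the random-choice fallback exactly as the function's lookup-miss branch does:

```python
# Usage sketch for the mock_llm helper added above (requires scorebook 0.0.15).
# The item ids are invented; ids missing from the bundled JSON trigger the
# random-choice fallback shown in the function body.
from scorebook.utils.mock_llm import mock_llm

items = [{"id": "question-001"}, {"id": "question-002"}]
predictions = mock_llm(items, temperature=0.0)  # hyperparameters are accepted but unused
print(predictions)  # e.g. ['C', 'A']
```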