scorebook-0.0.13-py3-none-any.whl → scorebook-0.0.15-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +12 -5
- scorebook/cli/auth.py +1 -1
- scorebook/dashboard/__init__.py +1 -0
- scorebook/dashboard/create_project.py +91 -0
- scorebook/{trismik → dashboard}/credentials.py +57 -12
- scorebook/{trismik → dashboard}/upload_results.py +1 -1
- scorebook/eval_datasets/__init__.py +0 -4
- scorebook/eval_datasets/eval_dataset.py +4 -2
- scorebook/evaluate/__init__.py +1 -15
- scorebook/evaluate/_async/evaluate_async.py +36 -19
- scorebook/evaluate/_sync/evaluate.py +36 -19
- scorebook/evaluate/evaluate_helpers.py +4 -3
- scorebook/inference/__init__.py +1 -11
- scorebook/inference/clients/__init__.py +1 -8
- scorebook/inference/inference_pipeline.py +1 -1
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +7 -16
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/__init__.py +0 -5
- scorebook/score/_async/score_async.py +3 -2
- scorebook/score/_sync/score.py +3 -2
- scorebook/score/score_helpers.py +29 -12
- scorebook/types.py +3 -3
- scorebook/utils/__init__.py +0 -22
- scorebook/utils/common_helpers.py +1 -1
- scorebook/utils/mock_llm/__init__.py +41 -0
- scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
- scorebook/utils/progress_bars.py +58 -786
- scorebook-0.0.15.dist-info/METADATA +300 -0
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -105
- scorebook/trismik/__init__.py +0 -10
- scorebook-0.0.13.dist-info/METADATA +0 -389
- scorebook-0.0.13.dist-info/RECORD +0 -50
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
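Most of the new surface area is the tutorials/ tree, which exercises the expanded metrics package (bleu, rouge, f1, exactmatch, bertscore, precision, recall) alongside the reorganised internals (metric_base.py and the metric registry move under scorebook/metrics/core, and the scorebook/trismik package gives way to scorebook/dashboard for credentials and result uploads). As a quick orientation before the per-file diffs below, here is a minimal sketch of the score() entry point exactly as the new tutorials use it; it assumes scorebook 0.0.15 is installed and borrows its argument names from the tutorial files that follow.

from scorebook import score
from scorebook.metrics.f1 import F1

# Items are plain dicts pairing a model prediction ("output") with a reference ("label"),
# exactly as in the tutorial files shown in the diffs below.
items = [
    {"output": "B-PER", "label": "B-PER"},
    {"output": "O", "label": "B-MISC"},
]

# upload_results=False keeps the run local, mirroring the tutorials.
results = score(items=items, metrics=F1(average="micro"), upload_results=False)
print(results["aggregate_results"])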
tutorials/examples/1-score/3-scoring_model_f1.py
@@ -0,0 +1,64 @@
+"""Tutorials - Score - Example 3 - F1 Metric Scoring."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any
+
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+from scorebook import score
+from scorebook.metrics.f1 import F1
+
+
+def main() -> Any:
+    """Score pre-computed model predictions using F1 metric.
+
+    This example demonstrates how to score NER (Named Entity Recognition)
+    predictions using the F1 metric with different averaging methods.
+    """
+
+    # Sample NER predictions (in CoNLL format with BIO tags)
+    model_predictions = [
+        {"output": "O", "label": "O"},
+        {"output": "B-PER", "label": "B-PER"},
+        {"output": "I-PER", "label": "I-PER"},
+        {"output": "O", "label": "O"},
+        {"output": "B-LOC", "label": "B-LOC"},
+        {"output": "O", "label": "O"},
+        {"output": "B-ORG", "label": "B-LOC"},  # Misclassification
+        {"output": "O", "label": "B-MISC"},  # Missed entity
+        {"output": "B-PER", "label": "B-PER"},
+        {"output": "O", "label": "O"},
+    ]
+
+    print(f"Scoring {len(model_predictions)} NER predictions\n")
+
+    # Score with all averaging methods at once
+    print("All averaging methods:")
+    results_all = score(
+        items=model_predictions,
+        metrics=F1(average="all"),
+        upload_results=False,
+    )
+    pprint(results_all["aggregate_results"])
+
+    # Score with specific combination of methods
+    print("\nMicro and weighted averaging:")
+    results_combo = score(
+        items=model_predictions,
+        metrics=F1(average=["micro", "weighted"]),
+        upload_results=False,
+    )
+    pprint(results_combo["aggregate_results"])
+
+    return results_all
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="3-scoring_f1_metric", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "3-scoring_f1_metric_output.json")
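The example above passes average="all" and average=["micro", "weighted"] to F1 without spelling out what the averaging modes compute. As a reference point only, here is a standalone sketch of micro, macro and weighted F1 on the same BIO tags using scikit-learn; scikit-learn is assumed purely for illustration, and Scorebook's own F1 implementation may differ in detail (for instance in how unseen classes are handled).

from sklearn.metrics import f1_score

# The same ten BIO predictions as the tutorial above.
y_pred = ["O", "B-PER", "I-PER", "O", "B-LOC", "O", "B-ORG", "O", "B-PER", "O"]
y_true = ["O", "B-PER", "I-PER", "O", "B-LOC", "O", "B-LOC", "B-MISC", "B-PER", "O"]

for average in ("micro", "macro", "weighted"):
    # micro pools all decisions, macro averages per-class F1 equally,
    # weighted averages per-class F1 by class support.
    print(average, f1_score(y_true, y_pred, average=average, zero_division=0))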
tutorials/examples/1-score/4-scoring_model_rouge.py
@@ -0,0 +1,64 @@
+"""Tutorials - Score - Example 4 - Scoring Models with ROUGE."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any
+
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import score
+from scorebook.metrics.rouge import ROUGE
+
+
+def main() -> Any:
+    """Score text generation predictions using ROUGE metric.
+
+    This example demonstrates how to score generated summaries
+    against reference summaries using ROUGE scores.
+    """
+
+    # Prepare a list of items with generated summaries and reference summaries
+    model_predictions = [
+        {
+            "output": "A woman donated her kidney to a stranger. This sparked a chain of six kidney transplants.",
+            "label": "Zully Broussard decided to give a kidney to a stranger. A new computer program helped her donation spur transplants for six kidney patients.",
+        },
+        {
+            "output": "Scientists discovered a new species of frog in the Amazon rainforest. The frog has unique markings that distinguish it from other species.",
+            "label": "A new frog species with distinctive blue and yellow stripes was found in the Amazon. Researchers say this discovery highlights the biodiversity of the region.",
+        },
+        {
+            "output": "The technology company released its quarterly earnings report showing strong growth.",
+            "label": "Tech giant announces record quarterly revenue driven by cloud services and AI products.",
+        },
+    ]
+
+    # Score the predictions against labels using the ROUGE metric
+    results = score(
+        items=model_predictions,
+        metrics=ROUGE(rouge_types=["rouge1", "rougeL"], use_stemmer=True),
+        upload_results=False,  # Disable uploading for this example
+    )
+
+    print("\nResults:")
+    pprint(results)
+
+    # Display individual item scores
+    print("\n\nIndividual ROUGE Scores:")
+    for i, item_score in enumerate(results["item_results"]):
+        print(f"\nItem {i+1}:")
+        print(f"  ROUGE-1 F1: {item_score['rouge1']:.4f}")
+        print(f"  ROUGE-L F1: {item_score['rougeL']:.4f}")
+
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="4-scoring_model_rouge", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "4-scoring_model_rouge_output.json")
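The ROUGE metric is configured with rouge_types and use_stemmer, which mirror the options of the standalone rouge-score package. Assuming scorebook.metrics.rouge wraps that package (an assumption; this diff does not show the implementation), the per-item numbers printed above can be reproduced directly:

from rouge_score import rouge_scorer

# First item from the tutorial above: generated summary vs. reference summary.
prediction = "A woman donated her kidney to a stranger. This sparked a chain of six kidney transplants."
reference = (
    "Zully Broussard decided to give a kidney to a stranger. "
    "A new computer program helped her donation spur transplants for six kidney patients."
)

scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
scores = scorer.score(reference, prediction)  # reference (target) first, prediction second
print(scores["rouge1"].fmeasure, scores["rougeL"].fmeasure)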
tutorials/examples/1-score/5-scoring_model_exact_match.py
@@ -0,0 +1,84 @@
+"""Tutorials - Score - Example 5 - Scoring Models with Exact Match."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any
+
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import score
+from scorebook.metrics.exactmatch import ExactMatch
+
+
+def main() -> Any:
+    """Score text predictions using Exact Match metric.
+
+    This example demonstrates how to compare model outputs against
+    reference labels using exact string matching with configurable
+    preprocessing options.
+    """
+
+    # Prepare a list of items with model outputs and expected labels
+    # Note: outputs may have different casing or extra whitespace
+    model_predictions = [
+        {"output": "Paris", "label": "Paris"},  # Exact match
+        {"output": "LONDON", "label": "London"},  # Different case
+        {"output": " Berlin ", "label": "Berlin"},  # Extra whitespace
+        {"output": " NEW YORK ", "label": "new york"},  # Both case and whitespace
+        {"output": "Tokyo", "label": "Kyoto"},  # No match
+    ]
+
+    print(f"Scoring {len(model_predictions)} predictions\n")
+
+    # Score with default settings (case_insensitive=True, strip=True)
+    print("Default settings (case_insensitive=True, strip=True):")
+    results_default = score(
+        items=model_predictions,
+        metrics=ExactMatch(),
+        upload_results=False,
+    )
+    pprint(results_default["aggregate_results"])
+    print(f"Item matches: {[item['exact_match'] for item in results_default['item_results']]}")
+
+    # Score with case-sensitive matching
+    print("\nCase-sensitive matching (case_insensitive=False, strip=True):")
+    results_case_sensitive = score(
+        items=model_predictions,
+        metrics=ExactMatch(case_insensitive=False),
+        upload_results=False,
+    )
+    pprint(results_case_sensitive["aggregate_results"])
+    print(f"Item matches: {[item['exact_match'] for item in results_case_sensitive['item_results']]}")
+
+    # Score without stripping whitespace
+    print("\nWithout stripping (case_insensitive=True, strip=False):")
+    results_no_strip = score(
+        items=model_predictions,
+        metrics=ExactMatch(strip=False),
+        upload_results=False,
+    )
+    pprint(results_no_strip["aggregate_results"])
+    print(f"Item matches: {[item['exact_match'] for item in results_no_strip['item_results']]}")
+
+    # Score with strict matching (no preprocessing)
+    print("\nStrict matching (case_insensitive=False, strip=False):")
+    results_strict = score(
+        items=model_predictions,
+        metrics=ExactMatch(case_insensitive=False, strip=False),
+        upload_results=False,
+    )
+    pprint(results_strict["aggregate_results"])
+    print(f"Item matches: {[item['exact_match'] for item in results_strict['item_results']]}")
+
+    return results_default
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="5-scoring_model_exact_match", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "5-scoring_model_exact_match_output.json")
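The four configurations above amount to optional string normalisation before comparison. A dependency-free sketch of what the flags plausibly do (the real ExactMatch implementation may differ, for example in how non-string outputs are coerced):

def exact_match(output: str, label: str, case_insensitive: bool = True, strip: bool = True) -> bool:
    # Apply the same optional preprocessing to both sides, then compare exactly.
    if strip:
        output, label = output.strip(), label.strip()
    if case_insensitive:
        output, label = output.lower(), label.lower()
    return output == label

# With default settings this sketch marks the tutorial's five items as
# [True, True, True, True, False]; strict matching passes only the first item.
print(exact_match(" NEW YORK ", "new york"))                    # True
print(exact_match("LONDON", "London", case_insensitive=False))  # False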
tutorials/examples/1-score/6-scoring_with_bertscore.py
@@ -0,0 +1,57 @@
+"""Tutorials - Score - Example 6 - Scoring with BertScore."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any
+
+from dotenv import load_dotenv
+
+from scorebook.metrics.bertscore import BertScore
+
+from tutorials.utils import save_results_to_json, setup_logging
+from scorebook import score
+
+
+def main() -> Any:
+    """Score pre-computed model predictions using Scorebook.
+
+    This example demonstrates how to score generated model predictions.
+    """
+
+    # Prepare a list of items with generated summaries and reference summaries
+    model_predictions = [
+        {
+            "output": "A woman donated her kidney to a stranger. This sparked a chain of six kidney transplants.",
+            "label": "Zully Broussard decided to give a kidney to a stranger. A new computer program helped her donation spur transplants for six kidney patients.",
+        },
+        {
+            "output": "Scientists discovered a new species of frog in the Amazon rainforest. The frog has unique markings that distinguish it from other species.",
+            "label": "A new frog species with distinctive blue and yellow stripes was found in the Amazon. Researchers say this discovery highlights the biodiversity of the region.",
+        },
+        {
+            "output": "The technology company released its quarterly earnings report showing strong growth.",
+            "label": "Tech giant announces record quarterly revenue driven by cloud services and AI products.",
+        },
+    ]
+
+    # Score the predictions against labels using the BertScore metric
+    results = score(
+        items=model_predictions,
+        metrics=BertScore,
+        upload_results=False,  # Disable uploading for this example
+    )
+
+    print("\nResults:")
+    pprint(results)
+
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="6-scoring_model_bertscore", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "6-scoring_model_bertscore_output.json")
+
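Note that this tutorial passes the BertScore class itself rather than an instance; judging from the other examples, score() appears to accept either form, although the dispatch logic is not shown in this diff. For reference, a minimal sketch of the underlying computation using the standalone bert-score package, which scorebook.metrics.bertscore presumably wraps (an assumption, not confirmed here):

from bert_score import score as bertscore

# First item from the tutorial above.
candidates = ["A woman donated her kidney to a stranger. This sparked a chain of six kidney transplants."]
references = [
    "Zully Broussard decided to give a kidney to a stranger. "
    "A new computer program helped her donation spur transplants for six kidney patients."
]

# Returns per-item precision, recall and F1 tensors computed from contextual embeddings.
precision, recall, f1 = bertscore(candidates, references, lang="en")
print(f1.item())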
tutorials/examples/1-score/__init__.py
File without changes
tutorials/examples/2-evaluate/1-evaluating_local_models.py
@@ -0,0 +1,106 @@
+"""Tutorials - Evaluate - Example 1 - Evaluating Local Models."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+import transformers
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate
+
+
+def main() -> Any:
+    """Run a simple Scorebook evaluation on a local model.
+
+    This example demonstrates the fundamental workflow for evaluating a model using Scorebook.
+
+    It shows how to:
+    1. Create an evaluation dataset from a list of evaluation items
+    2. Define an inference function using Hugging Face's transformers library
+    3. Run the evaluation and collect results
+
+    This serves as a starting point for understanding Scorebook's core evaluation capabilities.
+    """
+
+    # Create a list of evaluation items
+    evaluation_items = [
+        {"question": "What is 2 + 2?", "answer": "4"},
+        {"question": "What is the capital of France?", "answer": "Paris"},
+        {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+    ]
+
+    # Create an evaluation dataset
+    evaluation_dataset = EvalDataset.from_list(
+        name="basic_questions",  # Dataset name
+        metrics="accuracy",  # Metric/Metrics used to calculate scores
+        items=evaluation_items,  # List of evaluation items
+        input="question",  # Key for the input field in evaluation items
+        label="answer",  # Key for the label field in evaluation items
+    )
+
+    # Create a model
+    pipeline = transformers.pipeline(
+        "text-generation",
+        model="microsoft/Phi-4-mini-instruct",
+        model_kwargs={"torch_dtype": "auto"},
+        device_map="auto",
+    )
+
+    # Define an inference function
+    def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Return a list of model outputs for a list of inputs.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters.
+
+        Returns:
+            The model outputs for a list of inputs.
+        """
+        inference_outputs = []
+        for model_input in inputs:
+
+            # Wrap inputs in the model's message format
+            messages = [
+                {
+                    "role": "system",
+                    "content": hyperparameters.get("system_message"),
+                },
+                {"role": "user", "content": model_input},
+            ]
+
+            # Run inference on the item
+            output = pipeline(messages, temperature=hyperparameters.get("temperature"))
+
+            # Extract and collect the output generated from the model's response
+            inference_outputs.append(output[0]["generated_text"][-1]["content"])
+
+        return inference_outputs
+
+    # Evaluate a model against an evaluation dataset
+    results = evaluate(
+        inference,
+        evaluation_dataset,
+        hyperparameters={
+            "temperature": 0.7,
+            "system_message": "Answer the question directly and concisely.",
+        },
+        return_items=True,
+        upload_results=False,  # Disable uploading for this example
+    )
+
+    print("\nEvaluation Results:")
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="1-evaluating_local_models", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "1-evaluating_local_models_output.json")
tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py
@@ -0,0 +1,108 @@
+"""Tutorials - Evaluate - Example 2 - Evaluating Local Models with Batching."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+import transformers
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate
+
+
+def main() -> Any:
+    """Run a Scorebook evaluation using local batch inference.
+
+    This example demonstrates how to perform batch inference locally.
+
+    This approach offers several benefits:
+    1. Improved throughput by processing multiple items in parallel
+    2. Better GPU utilization through batched tensor operations
+    3. More efficient memory usage compared to sequential processing
+    """
+
+    # Initialize the pipeline with appropriate settings for batch processing
+    model_name = "google/flan-t5-small"
+
+    # Task is text2text-generation for seq2seq models
+    pipeline = transformers.pipeline(
+        "text2text-generation",
+        model=model_name,
+        torch_dtype="auto",
+        device_map="auto",  # will pick up gpu if available
+    )
+
+    # Define a batch inference function
+    def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process multiple inputs through the model in batches.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters including batch_size and max_new_tokens.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        # Preprocess: Convert inputs to strings
+        preprocessed_inputs = [str(input_val) for input_val in inputs]
+
+        # Run batch inference
+        raw_results = pipeline(
+            preprocessed_inputs,
+            batch_size=hyperparameters["batch_size"],
+            max_new_tokens=hyperparameters["max_new_tokens"],
+            pad_token_id=pipeline.tokenizer.eos_token_id,
+        )
+
+        # Postprocess: Extract and clean the generated text
+        final_outputs = [str(result["generated_text"]).strip() for result in raw_results]
+
+        return final_outputs
+
+    # Create a list of evaluation items
+    evaluation_items = [
+        {"question": "What is 2 + 2?", "answer": "4"},
+        {"question": "What is the capital of France?", "answer": "Paris"},
+        {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+    ]
+
+    # Create an evaluation dataset
+    evaluation_dataset = EvalDataset.from_list(
+        name="basic_questions",  # Dataset name
+        metrics="accuracy",  # Metric/Metrics used to calculate scores
+        items=evaluation_items,  # List of evaluation items
+        input="question",  # Key for the input field in evaluation items
+        label="answer",  # Key for the label field in evaluation items
+    )
+
+
+    # Define hyperparameters
+    hyperparameters = {
+        "max_new_tokens": 128,
+        "batch_size": 2,
+    }
+
+    # Run the evaluation with batch inference
+    results = evaluate(
+        inference,
+        evaluation_dataset,
+        hyperparameters=hyperparameters,
+        return_aggregates=True,  # Include aggregate results for each configuration
+        return_items=True,  # Include results for individual items
+        return_output=True,  # Include model outputs for debugging
+        upload_results=False,  # Disable uploading for this example
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="2-evaluating_local_models_with_batching", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "2-evaluating_local_models_with_batching_output.json")
tutorials/examples/2-evaluate/3-evaluating_cloud_models.py
@@ -0,0 +1,109 @@
+"""Tutorials - Evaluate - Example 3 - Evaluating Cloud Models."""
+
+import asyncio
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+from dotenv import load_dotenv
+from openai import AsyncOpenAI
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate_async
+
+
+async def main() -> Any:
+    """Run an evaluation using a cloud-hosted model.
+
+    This example demonstrates how to evaluate cloud-hosted models using OpenAI's API directly.
+
+    Prerequisites:
+    - OpenAI API key set in environment variable OPENAI_API_KEY
+    """
+
+    # Initialize OpenAI client
+    client = AsyncOpenAI()
+    model_name = "gpt-4o-mini"
+
+    # Define an async inference function
+    async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through OpenAI's API.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters including system_message and temperature.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Build messages for OpenAI API
+            messages = [
+                {
+                    "role": "system",
+                    "content": hyperparameters.get(
+                        "system_message", "You are a helpful assistant."
+                    ),
+                },
+                {"role": "user", "content": str(input_val)},
+            ]
+
+            # Call OpenAI API
+            try:
+                response = await client.chat.completions.create(
+                    model=model_name,
+                    messages=messages,
+                    temperature=hyperparameters.get("temperature", 0.7),
+                )
+                output = response.choices[0].message.content.strip()
+            except Exception as e:
+                output = f"Error: {str(e)}"
+
+            outputs.append(output)
+
+        return outputs
+
+    # Create a list of evaluation items
+    evaluation_items = [
+        {"question": "What is 2 + 2?", "answer": "4"},
+        {"question": "What is the capital of France?", "answer": "Paris"},
+        {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+    ]
+
+    # Create an evaluation dataset
+    evaluation_dataset = EvalDataset.from_list(
+        name="basic_questions",  # Dataset name
+        metrics="accuracy",  # Metric/Metrics used to calculate scores
+        items=evaluation_items,  # List of evaluation items
+        input="question",  # Key for the input field in evaluation items
+        label="answer",  # Key for the label field in evaluation items
+    )
+
+    # Run evaluation
+    results = await evaluate_async(
+        inference,
+        evaluation_dataset,
+        hyperparameters={
+            "system_message": (
+                "Answer the question directly. Provide only the answer, without context."
+            ),
+            "temperature": 0.7,
+        },
+        return_items=True,
+        return_output=True,
+        upload_results=False,
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="3-evaluating_cloud_models", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = asyncio.run(main())
+    save_results_to_json(results_dict, output_dir, "3-evaluating_cloud_models_output.json")
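One design note on the inference function above: it awaits each chat completion in turn, so items are processed sequentially even though the function is async. If throughput matters, the same loop can fan out with asyncio.gather; the sketch below reuses the client and model_name from the tutorial and is illustrative only, not part of the package.

import asyncio

async def concurrent_inference(inputs, **hyperparameters):
    async def one(input_val):
        response = await client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": str(input_val)}],
            temperature=hyperparameters.get("temperature", 0.7),
        )
        return response.choices[0].message.content.strip()

    # Issue all requests concurrently and preserve input order in the results.
    return list(await asyncio.gather(*(one(i) for i in inputs)))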