scorebook-0.0.13-py3-none-any.whl → scorebook-0.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. scorebook/__init__.py +12 -5
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/dashboard/__init__.py +1 -0
  4. scorebook/dashboard/create_project.py +91 -0
  5. scorebook/{trismik → dashboard}/credentials.py +57 -12
  6. scorebook/{trismik → dashboard}/upload_results.py +1 -1
  7. scorebook/eval_datasets/__init__.py +0 -4
  8. scorebook/eval_datasets/eval_dataset.py +4 -2
  9. scorebook/evaluate/__init__.py +1 -15
  10. scorebook/evaluate/_async/evaluate_async.py +36 -19
  11. scorebook/evaluate/_sync/evaluate.py +36 -19
  12. scorebook/evaluate/evaluate_helpers.py +4 -3
  13. scorebook/inference/__init__.py +1 -11
  14. scorebook/inference/clients/__init__.py +1 -8
  15. scorebook/inference/inference_pipeline.py +1 -1
  16. scorebook/metrics/README.md +121 -0
  17. scorebook/metrics/__init__.py +7 -16
  18. scorebook/metrics/accuracy.py +2 -6
  19. scorebook/metrics/bertscore.py +50 -0
  20. scorebook/metrics/bleu.py +82 -0
  21. scorebook/metrics/core/__init__.py +1 -0
  22. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  23. scorebook/metrics/core/metric_registry.py +195 -0
  24. scorebook/metrics/exactmatch.py +95 -0
  25. scorebook/metrics/f1.py +96 -0
  26. scorebook/metrics/precision.py +84 -9
  27. scorebook/metrics/recall.py +94 -0
  28. scorebook/metrics/rouge.py +85 -0
  29. scorebook/score/__init__.py +0 -5
  30. scorebook/score/_async/score_async.py +3 -2
  31. scorebook/score/_sync/score.py +3 -2
  32. scorebook/score/score_helpers.py +29 -12
  33. scorebook/types.py +3 -3
  34. scorebook/utils/__init__.py +0 -22
  35. scorebook/utils/common_helpers.py +1 -1
  36. scorebook/utils/mock_llm/__init__.py +41 -0
  37. scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
  38. scorebook/utils/progress_bars.py +58 -786
  39. scorebook-0.0.15.dist-info/METADATA +300 -0
  40. scorebook-0.0.15.dist-info/RECORD +110 -0
  41. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
  42. tutorials/README.md +147 -0
  43. tutorials/__init__.py +5 -0
  44. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  45. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  46. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  47. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  48. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  49. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  50. tutorials/examples/1-score/__init__.py +0 -0
  51. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  52. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  53. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  54. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  55. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  56. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  57. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  58. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  59. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  60. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  61. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  62. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  63. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  64. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  65. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  66. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  67. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  68. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  69. tutorials/examples/6-providers/aws/__init__.py +1 -0
  70. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  71. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  72. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  73. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  74. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  75. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  76. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  77. tutorials/examples/__init__.py +0 -0
  78. tutorials/notebooks/1-scoring.ipynb +162 -0
  79. tutorials/notebooks/2-evaluating.ipynb +316 -0
  80. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  81. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  82. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  83. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  84. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  85. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  86. tutorials/quickstarts/getting_started.ipynb +197 -0
  87. tutorials/utils/__init__.py +35 -0
  88. tutorials/utils/args_parser.py +132 -0
  89. tutorials/utils/output.py +23 -0
  90. tutorials/utils/setup.py +98 -0
  91. scorebook/metrics/metric_registry.py +0 -105
  92. scorebook/trismik/__init__.py +0 -10
  93. scorebook-0.0.13.dist-info/METADATA +0 -389
  94. scorebook-0.0.13.dist-info/RECORD +0 -50
  95. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
  96. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
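
The new 1-score tutorials added below all share one pattern: build a list of items with "output" and "label" fields, instantiate a metric class from scorebook.metrics, and pass both to score(). A minimal sketch of that pattern, assuming the metric classes and the score() call exactly as they appear in the tutorial diffs that follow:

    from pprint import pprint

    from scorebook import score
    from scorebook.metrics.f1 import F1

    # Each item pairs a model output with its reference label.
    items = [
        {"output": "B-PER", "label": "B-PER"},
        {"output": "O", "label": "B-MISC"},
    ]

    # score() aggregates the metric over the items;
    # upload_results=False keeps the run local.
    results = score(items=items, metrics=F1(average="all"), upload_results=False)
    pprint(results["aggregate_results"])

The 2-evaluate tutorials extend the same idea: the items are wrapped in an EvalDataset, and evaluate() or evaluate_async() calls a user-supplied inference function to produce the outputs before scoring.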
tutorials/examples/1-score/3-scoring_model_f1.py
@@ -0,0 +1,64 @@
+"""Tutorials - Score - Example 3 - F1 Metric Scoring."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any
+
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+from scorebook import score
+from scorebook.metrics.f1 import F1
+
+
+def main() -> Any:
+    """Score pre-computed model predictions using F1 metric.
+
+    This example demonstrates how to score NER (Named Entity Recognition)
+    predictions using the F1 metric with different averaging methods.
+    """
+
+    # Sample NER predictions (in CoNLL format with BIO tags)
+    model_predictions = [
+        {"output": "O", "label": "O"},
+        {"output": "B-PER", "label": "B-PER"},
+        {"output": "I-PER", "label": "I-PER"},
+        {"output": "O", "label": "O"},
+        {"output": "B-LOC", "label": "B-LOC"},
+        {"output": "O", "label": "O"},
+        {"output": "B-ORG", "label": "B-LOC"},  # Misclassification
+        {"output": "O", "label": "B-MISC"},  # Missed entity
+        {"output": "B-PER", "label": "B-PER"},
+        {"output": "O", "label": "O"},
+    ]
+
+    print(f"Scoring {len(model_predictions)} NER predictions\n")
+
+    # Score with all averaging methods at once
+    print("All averaging methods:")
+    results_all = score(
+        items=model_predictions,
+        metrics=F1(average="all"),
+        upload_results=False,
+    )
+    pprint(results_all["aggregate_results"])
+
+    # Score with specific combination of methods
+    print("\nMicro and weighted averaging:")
+    results_combo = score(
+        items=model_predictions,
+        metrics=F1(average=["micro", "weighted"]),
+        upload_results=False,
+    )
+    pprint(results_combo["aggregate_results"])
+
+    return results_all
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="3-scoring_f1_metric", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "3-scoring_f1_metric_output.json")
tutorials/examples/1-score/4-scoring_model_rouge.py
@@ -0,0 +1,64 @@
+"""Tutorials - Score - Example 4 - Scoring Models with ROUGE."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any
+
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import score
+from scorebook.metrics.rouge import ROUGE
+
+
+def main() -> Any:
+    """Score text generation predictions using ROUGE metric.
+
+    This example demonstrates how to score generated summaries
+    against reference summaries using ROUGE scores.
+    """
+
+    # Prepare a list of items with generated summaries and reference summaries
+    model_predictions = [
+        {
+            "output": "A woman donated her kidney to a stranger. This sparked a chain of six kidney transplants.",
+            "label": "Zully Broussard decided to give a kidney to a stranger. A new computer program helped her donation spur transplants for six kidney patients.",
+        },
+        {
+            "output": "Scientists discovered a new species of frog in the Amazon rainforest. The frog has unique markings that distinguish it from other species.",
+            "label": "A new frog species with distinctive blue and yellow stripes was found in the Amazon. Researchers say this discovery highlights the biodiversity of the region.",
+        },
+        {
+            "output": "The technology company released its quarterly earnings report showing strong growth.",
+            "label": "Tech giant announces record quarterly revenue driven by cloud services and AI products.",
+        },
+    ]
+
+    # Score the predictions against labels using the ROUGE metric
+    results = score(
+        items=model_predictions,
+        metrics=ROUGE(rouge_types=["rouge1", "rougeL"], use_stemmer=True),
+        upload_results=False,  # Disable uploading for this example
+    )
+
+    print("\nResults:")
+    pprint(results)
+
+    # Display individual item scores
+    print("\n\nIndividual ROUGE Scores:")
+    for i, item_score in enumerate(results["item_results"]):
+        print(f"\nItem {i+1}:")
+        print(f"  ROUGE-1 F1: {item_score['rouge1']:.4f}")
+        print(f"  ROUGE-L F1: {item_score['rougeL']:.4f}")
+
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="4-scoring_model_rouge", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "4-scoring_model_rouge_output.json")
tutorials/examples/1-score/5-scoring_model_exact_match.py
@@ -0,0 +1,84 @@
+"""Tutorials - Score - Example 5 - Scoring Models with Exact Match."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any
+
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import score
+from scorebook.metrics.exactmatch import ExactMatch
+
+
+def main() -> Any:
+    """Score text predictions using Exact Match metric.
+
+    This example demonstrates how to compare model outputs against
+    reference labels using exact string matching with configurable
+    preprocessing options.
+    """
+
+    # Prepare a list of items with model outputs and expected labels
+    # Note: outputs may have different casing or extra whitespace
+    model_predictions = [
+        {"output": "Paris", "label": "Paris"},  # Exact match
+        {"output": "LONDON", "label": "London"},  # Different case
+        {"output": " Berlin ", "label": "Berlin"},  # Extra whitespace
+        {"output": " NEW YORK ", "label": "new york"},  # Both case and whitespace
+        {"output": "Tokyo", "label": "Kyoto"},  # No match
+    ]
+
+    print(f"Scoring {len(model_predictions)} predictions\n")
+
+    # Score with default settings (case_insensitive=True, strip=True)
+    print("Default settings (case_insensitive=True, strip=True):")
+    results_default = score(
+        items=model_predictions,
+        metrics=ExactMatch(),
+        upload_results=False,
+    )
+    pprint(results_default["aggregate_results"])
+    print(f"Item matches: {[item['exact_match'] for item in results_default['item_results']]}")
+
+    # Score with case-sensitive matching
+    print("\nCase-sensitive matching (case_insensitive=False, strip=True):")
+    results_case_sensitive = score(
+        items=model_predictions,
+        metrics=ExactMatch(case_insensitive=False),
+        upload_results=False,
+    )
+    pprint(results_case_sensitive["aggregate_results"])
+    print(f"Item matches: {[item['exact_match'] for item in results_case_sensitive['item_results']]}")
+
+    # Score without stripping whitespace
+    print("\nWithout stripping (case_insensitive=True, strip=False):")
+    results_no_strip = score(
+        items=model_predictions,
+        metrics=ExactMatch(strip=False),
+        upload_results=False,
+    )
+    pprint(results_no_strip["aggregate_results"])
+    print(f"Item matches: {[item['exact_match'] for item in results_no_strip['item_results']]}")
+
+    # Score with strict matching (no preprocessing)
+    print("\nStrict matching (case_insensitive=False, strip=False):")
+    results_strict = score(
+        items=model_predictions,
+        metrics=ExactMatch(case_insensitive=False, strip=False),
+        upload_results=False,
+    )
+    pprint(results_strict["aggregate_results"])
+    print(f"Item matches: {[item['exact_match'] for item in results_strict['item_results']]}")
+
+    return results_default
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="5-scoring_model_exact_match", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "5-scoring_model_exact_match_output.json")
tutorials/examples/1-score/6-scoring_with_bertscore.py
@@ -0,0 +1,57 @@
+"""Tutorials - Score - Example 6 - Scoring with BertScore."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any
+
+from dotenv import load_dotenv
+
+from scorebook.metrics.bertscore import BertScore
+
+from tutorials.utils import save_results_to_json, setup_logging
+from scorebook import score
+
+
+def main() -> Any:
+    """Score pre-computed model predictions using Scorebook.
+
+    This example demonstrates how to score generated model predictions.
+    """
+
+    # Prepare a list of items with generated summaries and reference summaries
+    model_predictions = [
+        {
+            "output": "A woman donated her kidney to a stranger. This sparked a chain of six kidney transplants.",
+            "label": "Zully Broussard decided to give a kidney to a stranger. A new computer program helped her donation spur transplants for six kidney patients.",
+        },
+        {
+            "output": "Scientists discovered a new species of frog in the Amazon rainforest. The frog has unique markings that distinguish it from other species.",
+            "label": "A new frog species with distinctive blue and yellow stripes was found in the Amazon. Researchers say this discovery highlights the biodiversity of the region.",
+        },
+        {
+            "output": "The technology company released its quarterly earnings report showing strong growth.",
+            "label": "Tech giant announces record quarterly revenue driven by cloud services and AI products.",
+        },
+    ]
+
+    # Score the predictions against labels using the BertScore metric
+    results = score(
+        items=model_predictions,
+        metrics=BertScore,
+        upload_results=False,  # Disable uploading for this example
+    )
+
+    print("\nResults:")
+    pprint(results)
+
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="6-scoring_model_bertscore", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "6-scoring_model_bertscore_output.json")
+
tutorials/examples/1-score/__init__.py (file without changes)
tutorials/examples/2-evaluate/1-evaluating_local_models.py
@@ -0,0 +1,106 @@
+"""Tutorials - Evaluate - Example 1 - Evaluating Local Models."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+import transformers
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate
+
+
+def main() -> Any:
+    """Run a simple Scorebook evaluation on a local model.
+
+    This example demonstrates the fundamental workflow for evaluating a model using Scorebook.
+
+    It shows how to:
+    1. Create an evaluation dataset from a list of evaluation items
+    2. Define an inference function using Hugging Face's transformers library
+    3. Run the evaluation and collect results
+
+    This serves as a starting point for understanding Scorebook's core evaluation capabilities.
+    """
+
+    # Create a list of evaluation items
+    evaluation_items = [
+        {"question": "What is 2 + 2?", "answer": "4"},
+        {"question": "What is the capital of France?", "answer": "Paris"},
+        {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+    ]
+
+    # Create an evaluation dataset
+    evaluation_dataset = EvalDataset.from_list(
+        name="basic_questions",  # Dataset name
+        metrics="accuracy",  # Metric/Metrics used to calculate scores
+        items=evaluation_items,  # List of evaluation items
+        input="question",  # Key for the input field in evaluation items
+        label="answer",  # Key for the label field in evaluation items
+    )
+
+    # Create a model
+    pipeline = transformers.pipeline(
+        "text-generation",
+        model="microsoft/Phi-4-mini-instruct",
+        model_kwargs={"torch_dtype": "auto"},
+        device_map="auto",
+    )
+
+    # Define an inference function
+    def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Return a list of model outputs for a list of inputs.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters.
+
+        Returns:
+            The model outputs for a list of inputs.
+        """
+        inference_outputs = []
+        for model_input in inputs:
+
+            # Wrap inputs in the model's message format
+            messages = [
+                {
+                    "role": "system",
+                    "content": hyperparameters.get("system_message"),
+                },
+                {"role": "user", "content": model_input},
+            ]
+
+            # Run inference on the item
+            output = pipeline(messages, temperature=hyperparameters.get("temperature"))
+
+            # Extract and collect the output generated from the model's response
+            inference_outputs.append(output[0]["generated_text"][-1]["content"])
+
+        return inference_outputs
+
+    # Evaluate a model against an evaluation dataset
+    results = evaluate(
+        inference,
+        evaluation_dataset,
+        hyperparameters={
+            "temperature": 0.7,
+            "system_message": "Answer the question directly and concisely.",
+        },
+        return_items=True,
+        upload_results=False,  # Disable uploading for this example
+    )
+
+    print("\nEvaluation Results:")
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="1-evaluating_local_models", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "1-evaluating_local_models_output.json")
tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py
@@ -0,0 +1,108 @@
+"""Tutorials - Evaluate - Example 2 - Evaluating Local Models with Batching."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+import transformers
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate
+
+
+def main() -> Any:
+    """Run a Scorebook evaluation using local batch inference.
+
+    This example demonstrates how to perform batch inference locally.
+
+    This approach offers several benefits:
+    1. Improved throughput by processing multiple items in parallel
+    2. Better GPU utilization through batched tensor operations
+    3. More efficient memory usage compared to sequential processing
+    """
+
+    # Initialize the pipeline with appropriate settings for batch processing
+    model_name = "google/flan-t5-small"
+
+    # Task is text2text-generation for seq2seq models
+    pipeline = transformers.pipeline(
+        "text2text-generation",
+        model=model_name,
+        torch_dtype="auto",
+        device_map="auto",  # will pick up gpu if available
+    )
+
+    # Define a batch inference function
+    def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process multiple inputs through the model in batches.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters including batch_size and max_new_tokens.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        # Preprocess: Convert inputs to strings
+        preprocessed_inputs = [str(input_val) for input_val in inputs]
+
+        # Run batch inference
+        raw_results = pipeline(
+            preprocessed_inputs,
+            batch_size=hyperparameters["batch_size"],
+            max_new_tokens=hyperparameters["max_new_tokens"],
+            pad_token_id=pipeline.tokenizer.eos_token_id,
+        )
+
+        # Postprocess: Extract and clean the generated text
+        final_outputs = [str(result["generated_text"]).strip() for result in raw_results]
+
+        return final_outputs
+
+    # Create a list of evaluation items
+    evaluation_items = [
+        {"question": "What is 2 + 2?", "answer": "4"},
+        {"question": "What is the capital of France?", "answer": "Paris"},
+        {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+    ]
+
+    # Create an evaluation dataset
+    evaluation_dataset = EvalDataset.from_list(
+        name="basic_questions",  # Dataset name
+        metrics="accuracy",  # Metric/Metrics used to calculate scores
+        items=evaluation_items,  # List of evaluation items
+        input="question",  # Key for the input field in evaluation items
+        label="answer",  # Key for the label field in evaluation items
+    )
+
+
+    # Define hyperparameters
+    hyperparameters = {
+        "max_new_tokens": 128,
+        "batch_size": 2,
+    }
+
+    # Run the evaluation with batch inference
+    results = evaluate(
+        inference,
+        evaluation_dataset,
+        hyperparameters=hyperparameters,
+        return_aggregates=True,  # Include aggregate results for each configuration
+        return_items=True,  # Include results for individual items
+        return_output=True,  # Include model outputs for debugging
+        upload_results=False,  # Disable uploading for this example
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="2-evaluating_local_models_with_batching", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "2-evaluating_local_models_with_batching_output.json")
tutorials/examples/2-evaluate/3-evaluating_cloud_models.py
@@ -0,0 +1,109 @@
+"""Tutorials - Evaluate - Example 3 - Evaluating Cloud Models."""
+
+import asyncio
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+from dotenv import load_dotenv
+from openai import AsyncOpenAI
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate_async
+
+
+async def main() -> Any:
+    """Run an evaluation using a cloud-hosted model.
+
+    This example demonstrates how to evaluate cloud-hosted models using OpenAI's API directly.
+
+    Prerequisites:
+        - OpenAI API key set in environment variable OPENAI_API_KEY
+    """
+
+    # Initialize OpenAI client
+    client = AsyncOpenAI()
+    model_name = "gpt-4o-mini"
+
+    # Define an async inference function
+    async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through OpenAI's API.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters including system_message and temperature.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Build messages for OpenAI API
+            messages = [
+                {
+                    "role": "system",
+                    "content": hyperparameters.get(
+                        "system_message", "You are a helpful assistant."
+                    ),
+                },
+                {"role": "user", "content": str(input_val)},
+            ]
+
+            # Call OpenAI API
+            try:
+                response = await client.chat.completions.create(
+                    model=model_name,
+                    messages=messages,
+                    temperature=hyperparameters.get("temperature", 0.7),
+                )
+                output = response.choices[0].message.content.strip()
+            except Exception as e:
+                output = f"Error: {str(e)}"
+
+            outputs.append(output)
+
+        return outputs
+
+    # Create a list of evaluation items
+    evaluation_items = [
+        {"question": "What is 2 + 2?", "answer": "4"},
+        {"question": "What is the capital of France?", "answer": "Paris"},
+        {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+    ]
+
+    # Create an evaluation dataset
+    evaluation_dataset = EvalDataset.from_list(
+        name="basic_questions",  # Dataset name
+        metrics="accuracy",  # Metric/Metrics used to calculate scores
+        items=evaluation_items,  # List of evaluation items
+        input="question",  # Key for the input field in evaluation items
+        label="answer",  # Key for the label field in evaluation items
+    )
+
+    # Run evaluation
+    results = await evaluate_async(
+        inference,
+        evaluation_dataset,
+        hyperparameters={
+            "system_message": (
+                "Answer the question directly. Provide only the answer, without context."
+            ),
+            "temperature": 0.7,
+        },
+        return_items=True,
+        return_output=True,
+        upload_results=False,
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="3-evaluating_cloud_models", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = asyncio.run(main())
+    save_results_to_json(results_dict, output_dir, "3-evaluating_cloud_models_output.json")