scorebook 0.0.13-py3-none-any.whl → 0.0.15-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +12 -5
- scorebook/cli/auth.py +1 -1
- scorebook/dashboard/__init__.py +1 -0
- scorebook/dashboard/create_project.py +91 -0
- scorebook/{trismik → dashboard}/credentials.py +57 -12
- scorebook/{trismik → dashboard}/upload_results.py +1 -1
- scorebook/eval_datasets/__init__.py +0 -4
- scorebook/eval_datasets/eval_dataset.py +4 -2
- scorebook/evaluate/__init__.py +1 -15
- scorebook/evaluate/_async/evaluate_async.py +36 -19
- scorebook/evaluate/_sync/evaluate.py +36 -19
- scorebook/evaluate/evaluate_helpers.py +4 -3
- scorebook/inference/__init__.py +1 -11
- scorebook/inference/clients/__init__.py +1 -8
- scorebook/inference/inference_pipeline.py +1 -1
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +7 -16
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/__init__.py +0 -5
- scorebook/score/_async/score_async.py +3 -2
- scorebook/score/_sync/score.py +3 -2
- scorebook/score/score_helpers.py +29 -12
- scorebook/types.py +3 -3
- scorebook/utils/__init__.py +0 -22
- scorebook/utils/common_helpers.py +1 -1
- scorebook/utils/mock_llm/__init__.py +41 -0
- scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
- scorebook/utils/progress_bars.py +58 -786
- scorebook-0.0.15.dist-info/METADATA +300 -0
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -105
- scorebook/trismik/__init__.py +0 -10
- scorebook-0.0.13.dist-info/METADATA +0 -389
- scorebook-0.0.13.dist-info/RECORD +0 -50
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
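For orientation before the per-file diffs: the new 1-score tutorials listed above exercise the metrics added in this release through the same score() call that appears verbatim in tutorials/examples/5-upload_results/1-uploading_score_results.py further down this diff. Below is a minimal offline sketch of that pattern, not the package's documented API; it assumes score() accepts upload_results=False to skip the Trismik upload, and the "precomputed-outputs" model label is purely illustrative.

"""Sketch only: score pre-computed outputs locally, mirroring the tutorial later in this diff."""
from pprint import pprint

from scorebook import score
from scorebook.metrics.accuracy import Accuracy  # import path as used in the tutorial

# Each item pairs a model output with its reference label.
items = [
    {"input": "What is 2 + 2?", "output": "4", "label": "4"},
    {"input": "What is the capital of France?", "output": "Paris", "label": "Paris"},
]

results = score(
    items=items,
    metrics=Accuracy,
    dataset_name="basic_questions",
    model_name="precomputed-outputs",  # illustrative label, not taken from the package
    upload_results=False,  # assumption: disables the upload path shown in the tutorial
)
pprint(results)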
tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py
ADDED
@@ -0,0 +1,110 @@
+"""Tutorials - Evaluation Datasets - Example 3 - Loading from YAML Config."""
+
+import asyncio
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+from dotenv import load_dotenv
+from openai import AsyncOpenAI
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate_async
+
+
+async def main() -> Any:
+    """Run evaluations using datasets loaded from YAML configuration files.
+
+    This example demonstrates how to use YAML configuration files to define
+    dataset loading parameters. YAML configs are useful for:
+    - Storing dataset configurations in version control
+    - Reusing the same dataset configuration across projects
+    - Defining complex prompt templates and field mappings
+
+    The YAML files contain:
+    - HuggingFace dataset path and split information
+    - Metrics to use for evaluation
+    - Jinja2 templates for input and label formatting
+    - Metadata about the dataset
+
+    Prerequisites:
+    - OpenAI API key set in environment variable OPENAI_API_KEY
+    """
+
+    # Initialize OpenAI client
+    client = AsyncOpenAI()
+    model_name = "gpt-4o-mini"
+
+    # Define an async inference function
+    async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through OpenAI's API.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Build messages for OpenAI API
+            messages = [
+                {
+                    "role": "system",
+                    "content": "Answer the multiple choice question by selecting the correct letter (A, B, C, D, etc.). Provide ONLY the letter of your answer, no additional text or explanation.",
+                },
+                {"role": "user", "content": str(input_val)},
+            ]
+
+            # Call OpenAI API
+            try:
+                response = await client.chat.completions.create(
+                    model=model_name,
+                    messages=messages,
+                    temperature=0.7,
+                )
+                output = response.choices[0].message.content.strip()
+            except Exception as e:
+                output = f"Error: {str(e)}"
+
+            outputs.append(output)
+
+        return outputs
+
+    # Construct paths to YAML config files
+    yaml_configs_dir = Path(__file__).parent / "example_yaml_configs"
+    cais_mmlu_yaml = yaml_configs_dir / "Cais-MMLU.yaml"
+    tiger_mmlu_pro_yaml = yaml_configs_dir / "TIGER-Lab-MMLU-Pro.yaml"
+
+    # Load Cais-MMLU dataset from YAML configuration
+    cais_mmlu = EvalDataset.from_yaml(str(cais_mmlu_yaml))
+    print(f"Loaded {cais_mmlu.name} from YAML config: {len(cais_mmlu.items)} items")
+
+    # Load TIGER-Lab MMLU-Pro dataset from YAML configuration
+    tiger_mmlu_pro = EvalDataset.from_yaml(str(tiger_mmlu_pro_yaml))
+    print(f"Loaded {tiger_mmlu_pro.name} from YAML config: {len(tiger_mmlu_pro.items)} items")
+
+    # Run evaluation on both datasets
+    results = await evaluate_async(
+        inference,
+        datasets=[cais_mmlu, tiger_mmlu_pro],
+        sample_size=5,  # Sample 5 items from each dataset for quick testing
+        return_aggregates=True,
+        return_items=True,
+        return_output=True,
+        upload_results=False,
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="3-evaluation_datasets_from_yaml", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = asyncio.run(main())
+    save_results_to_json(results_dict, output_dir, "3-evaluation_datasets_from_yaml_output.json")
tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv
ADDED
@@ -0,0 +1,11 @@
+question,answer
+What is 2 + 2?,4
+What is the capital of France?,Paris
+Who wrote Romeo and Juliet?,William Shakespeare
+What is 5 * 6?,30
+What is the largest planet in our solar system?,Jupiter
+Who painted the Mona Lisa?,Leonardo da Vinci
+What is the square root of 64?,8
+What is the capital of Japan?,Tokyo
+Who invented the telephone?,Alexander Graham Bell
+What is 12 - 7?,5
tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json
ADDED
@@ -0,0 +1,42 @@
+[
+  {
+    "question": "What is 2 + 2?",
+    "answer": "4"
+  },
+  {
+    "question": "What is the capital of France?",
+    "answer": "Paris"
+  },
+  {
+    "question": "Who wrote Romeo and Juliet?",
+    "answer": "William Shakespeare"
+  },
+  {
+    "question": "What is 5 * 6?",
+    "answer": "30"
+  },
+  {
+    "question": "What is the largest planet in our solar system?",
+    "answer": "Jupiter"
+  },
+  {
+    "question": "Who painted the Mona Lisa?",
+    "answer": "Leonardo da Vinci"
+  },
+  {
+    "question": "What is the square root of 64?",
+    "answer": "8"
+  },
+  {
+    "question": "What is the capital of Japan?",
+    "answer": "Tokyo"
+  },
+  {
+    "question": "Who invented the telephone?",
+    "answer": "Alexander Graham Bell"
+  },
+  {
+    "question": "What is 12 - 7?",
+    "answer": "5"
+  }
+]
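These example datasets back the file-based dataset tutorials; basic_questions.json, for instance, is loaded with the EvalDataset.from_json call that appears verbatim in tutorials/examples/5-upload_results/2-uploading_evaluate_results.py at the end of this diff. A condensed sketch of just that loading step follows; it assumes from_json can be called standalone with the arguments shown there, with the path given relative to the repository root.

from scorebook import EvalDataset

# Map the JSON fields shown above onto model inputs and reference labels,
# scoring with accuracy, using the same arguments as 2-uploading_evaluate_results.py below.
dataset = EvalDataset.from_json(
    path="tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json",
    metrics="accuracy",
    input="question",
    label="answer",
)
print(f"{len(dataset.items)} items loaded")  # .items attribute as used in the YAML tutorial above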
tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml
ADDED
@@ -0,0 +1,19 @@
+path: "cais/mmlu"
+name: "Cais-MMLU"
+split: "test"
+config: "all"
+metrics:
+  - "accuracy"
+
+templates:
+  input: |
+    {{ question }}
+
+    A. {{ choices[0] }}
+    B. {{ choices[1] }}
+    C. {{ choices[2] }}
+    D. {{ choices[3] }}
+  label: "{{ answer }}"
+
+metadata:
+  description: "MMLU multiple choice questions from Cais"
tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml
ADDED
@@ -0,0 +1,18 @@
+path: "TIGER-Lab/MMLU-Pro"
+name: "TIGER-Lab/MMLU-Pro"
+split: "validation"
+config: "default"
+metrics:
+  - "accuracy"
+
+templates:
+  input: |
+    {{ question }}
+    Options:
+    {% for option in options %}
+    {{ number_to_letter(loop.index0) }} : {{ option }}
+    {% endfor %}
+  label: "{{ answer }}"
+
+metadata:
+  description: "MMLU-Pro multiple choice questions"
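Both YAML configs above are consumed through EvalDataset.from_yaml, exactly as in the tutorial script at the top of this diff. A condensed sketch of that loading step, with paths assumed relative to the repository root rather than resolved via __file__ as the tutorial does:

from pathlib import Path

from scorebook import EvalDataset

# Same loading call as 3-evaluation_datasets_from_huggingface_with_yaml_configs.py above;
# each YAML bundles the HuggingFace path, split, metrics, and Jinja2 templates.
configs_dir = Path("tutorials/examples/3-evaluation_datasets/example_yaml_configs")
for config in ("Cais-MMLU.yaml", "TIGER-Lab-MMLU-Pro.yaml"):
    dataset = EvalDataset.from_yaml(str(configs_dir / config))
    print(f"Loaded {dataset.name}: {len(dataset.items)} items")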
tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py
ADDED
@@ -0,0 +1,114 @@
+"""Tutorials - Adaptive Evaluations - Example 1 - Adaptive Evaluation."""
+
+import asyncio
+import string
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+from dotenv import load_dotenv
+from openai import AsyncOpenAI
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import evaluate_async, login
+
+
+async def main() -> Any:
+    """Run an adaptive evaluation using Trismik's adaptive testing.
+
+    This example demonstrates how to use Trismik's adaptive evaluation feature.
+    Adaptive evaluations use Item Response Theory (IRT) to efficiently estimate
+    model capabilities by selecting questions based on previous responses.
+
+    Benefits of adaptive evaluation:
+    - More efficient: Fewer questions needed to assess capability
+    - Precise measurement: Better statistical confidence intervals
+    - Optimal difficulty: Questions adapt to model's skill level
+
+    Prerequisites:
+    - Valid Trismik API key set in TRISMIK_API_KEY environment variable
+    - A Trismik project ID
+    - OpenAI API key set in OPENAI_API_KEY environment variable
+    """
+
+    # Initialize OpenAI client
+    client = AsyncOpenAI()
+    model_name = "gpt-4o-mini"
+
+    # Define an async inference function
+    async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through OpenAI's API.
+
+        Args:
+            inputs: Input values from an EvalDataset. For adaptive MMLU-Pro,
+                each input is a dict with 'question' and 'options' keys.
+            hyperparameters: Model hyperparameters.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Handle dict input from adaptive dataset
+            if isinstance(input_val, dict):
+                prompt = input_val.get("question", "")
+                if "options" in input_val:
+                    prompt += "\nOptions:\n" + "\n".join(
+                        f"{letter}: {choice}"
+                        for letter, choice in zip(string.ascii_uppercase, input_val["options"])
+                    )
+            else:
+                prompt = str(input_val)
+
+            # Build messages for OpenAI API
+            messages = [
+                {
+                    "role": "system",
+                    "content": "Answer the question with a single letter representing the correct answer from the list of choices. Do not provide any additional explanation or output beyond the single letter.",
+                },
+                {"role": "user", "content": prompt},
+            ]
+
+            # Call OpenAI API
+            try:
+                response = await client.chat.completions.create(
+                    model=model_name,
+                    messages=messages,
+                    temperature=0.7,
+                )
+                output = response.choices[0].message.content.strip()
+            except Exception as e:
+                output = f"Error: {str(e)}"
+
+            outputs.append(output)
+
+        return outputs
+
+    # Step 1: Log in with your Trismik API key
+    # login() reads TRISMIK_API_KEY from environment variables or .env file
+    login()
+
+    # Step 2: Run adaptive evaluation
+    results = await evaluate_async(
+        inference,
+        datasets="trismik/headQA:adaptive",  # Adaptive datasets have the ":adaptive" suffix
+        experiment_id="Adaptive-Head-QA-Evaluation",
+        project_id='TRISMIK-PROJECT-ID',
+        return_dict=True,
+        return_aggregates=True,
+        return_items=True,
+        return_output=True,
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="1-adaptive_evaluation", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = asyncio.run(main())
+    save_results_to_json(results_dict, output_dir, "1-adaptive_evaluation_output.json")
tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py
ADDED
@@ -0,0 +1,106 @@
+"""Tutorials - Adaptive Evaluations - Example 2 - Adaptive Dataset Splits."""
+
+import asyncio
+import string
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+from dotenv import load_dotenv
+from openai import AsyncOpenAI
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import evaluate_async, login
+
+
+async def main() -> Any:
+    """
+
+    Prerequisites:
+    - Valid Trismik API key set in TRISMIK_API_KEY environment variable
+    - A Trismik project ID
+    - OpenAI API key set in OPENAI_API_KEY environment variable
+    """
+
+    # Initialize OpenAI client
+    client = AsyncOpenAI()
+    model_name = "gpt-4o-mini"
+
+    # Define an async inference function
+    async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through OpenAI's API.
+
+        Args:
+            inputs: Input values from an EvalDataset. For adaptive headQA,
+                each input is a dict with 'question' and 'options' keys.
+            hyperparameters: Model hyperparameters.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Handle dict input from adaptive dataset
+            if isinstance(input_val, dict):
+                prompt = input_val.get("question", "")
+                if "options" in input_val:
+                    prompt += "\nOptions:\n" + "\n".join(
+                        f"{letter}: {choice}"
+                        for letter, choice in zip(string.ascii_uppercase, input_val["options"])
+                    )
+            else:
+                prompt = str(input_val)
+
+            # Build messages for OpenAI API
+            messages = [
+                {
+                    "role": "system",
+                    "content": "Answer the question with a single letter representing the correct answer from the list of choices. Do not provide any additional explanation or output beyond the single letter.",
+                },
+                {"role": "user", "content": prompt},
+            ]
+
+            # Call OpenAI API
+            try:
+                response = await client.chat.completions.create(
+                    model=model_name,
+                    messages=messages,
+                    temperature=0.7,
+                )
+                output = response.choices[0].message.content.strip()
+            except Exception as e:
+                output = f"Error: {str(e)}"
+
+            outputs.append(output)
+
+        return outputs
+
+    # Step 1: Log in with your Trismik API key
+    # login() reads TRISMIK_API_KEY from environment variables or .env file
+    login()
+
+    # Step 2: Run adaptive evaluation
+    results = await evaluate_async(
+        inference,
+        datasets="trismik/headQA:adaptive",  # Adaptive datasets have the ":adaptive" suffix
+        split="test",  # Specify the test split for evaluation
+        experiment_id="Adaptive-Head-QA-Test-Evaluation",
+        project_id='TRISMIK-PROJECT-ID',
+        return_dict=True,
+        return_aggregates=True,
+        return_items=True,
+        return_output=True,
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="2-adaptive_dataset_splits", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = asyncio.run(main())
+    save_results_to_json(results_dict, output_dir, "2-adaptive_dataset_splits_output.json")
tutorials/examples/5-upload_results/1-uploading_score_results.py
ADDED
@@ -0,0 +1,92 @@
+"""Tutorials - Upload Results - Example 1 - Uploading score() Results."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any
+
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import login, score
+from scorebook.metrics.accuracy import Accuracy
+
+
+def main() -> Any:
+    """Score pre-computed outputs and upload results to Trismik's dashboard.
+
+    This example demonstrates how to upload score() results to Trismik.
+    The score() function is used when you already have model outputs and
+    want to score them against labels.
+
+    Use score() when you want to:
+    - Score pre-computed model outputs
+    - Re-score existing results with different metrics
+    - Upload scoring results without re-running inference
+
+    Prerequisites:
+    - Valid Trismik API key set in TRISMIK_API_KEY environment variable
+    - A Trismik project ID
+    """
+
+    # Prepare items with pre-computed outputs and labels
+    items = [
+        {
+            "input": "What is 2 + 2?",
+            "output": "4",
+            "label": "4"
+        },
+        {
+            "input": "What is the capital of France?",
+            "output": "Paris",
+            "label": "Paris"
+        },
+        {
+            "input": "Who wrote Romeo and Juliet?",
+            "output": "William Shakespeare",
+            "label": "William Shakespeare"
+        },
+        {
+            "input": "What is 5 * 6?",
+            "output": "30",
+            "label": "30"
+        },
+        {
+            "input": "What is the largest planet in our solar system?",
+            "output": "Jupiter",
+            "label": "Jupiter"
+        },
+    ]
+
+    # Step 1: Log in with your Trismik API key
+    # login() reads TRISMIK_API_KEY from environment variables or .env file
+    login()
+
+    # Step 2: Score the outputs and upload results
+    # When you provide experiment_id and project_id, results are automatically uploaded
+    results = score(
+        items=items,
+        metrics=Accuracy,
+        dataset_name="basic_questions",
+        model_name="gpt-4o-mini",
+        experiment_id="Score-Upload-Example",
+        project_id="TRISMIK_PROJECT_ID",  # TODO: ADD YOUR TRISMIK PROJECT ID
+        metadata={
+            "description": "Example demonstrating score() result uploading",
+            "note": "These are pre-computed outputs",
+        },
+        upload_results=True,  # Explicitly enable uploading
+    )
+
+    print("\nResults uploaded successfully!")
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="1-uploading_score_results", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "1-uploading_score_results_output.json")
tutorials/examples/5-upload_results/2-uploading_evaluate_results.py
ADDED
@@ -0,0 +1,117 @@
+"""Tutorials - Upload Results - Example 2 - Uploading evaluate() Results."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+import transformers
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate, login
+
+
+def main() -> Any:
+    """Run an evaluation and upload results to Trismik's dashboard.
+
+    This example demonstrates how to upload evaluate() results to Trismik.
+    The evaluate() function runs inference on a dataset and automatically
+    uploads the results when you provide experiment_id and project_id.
+
+    Use evaluate() when you want to:
+    - Run inference AND score in one step
+    - Track full evaluation runs with hyperparameters
+    - Compare different models on the same dataset
+
+    Prerequisites:
+    - Valid Trismik API key set in TRISMIK_API_KEY environment variable
+    - A Trismik project ID
+    """
+
+    # Initialize HuggingFace model pipeline
+    model_name = "microsoft/Phi-4-mini-instruct"
+    pipeline = transformers.pipeline(
+        "text-generation",
+        model=model_name,
+        model_kwargs={"torch_dtype": "auto"},
+        device_map="auto",
+    )
+
+    # Define an inference function
+    def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through the model.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Build messages
+            messages = [
+                {"role": "system", "content": hyperparameters["system_message"]},
+                {"role": "user", "content": str(input_val)},
+            ]
+
+            # Run inference
+            result = pipeline(messages)
+
+            # Extract the answer
+            output = str(result[0]["generated_text"][-1]["content"])
+            outputs.append(output)
+
+        return outputs
+
+    # Load evaluation dataset
+    dataset_path = Path(__file__).parent.parent / "3-evaluation_datasets" / "example_datasets" / "basic_questions.json"
+    dataset = EvalDataset.from_json(
+        path=str(dataset_path),
+        metrics="accuracy",
+        input="question",
+        label="answer",
+    )
+
+    # Step 1: Log in with your Trismik API key
+    # login() reads TRISMIK_API_KEY from environment variables or .env file
+    login()
+
+
+    # Step 2: Run evaluation with result uploading
+    # When you provide experiment_id and project_id, results are automatically uploaded
+    print(f"\nRunning evaluation with model: {model_name}")
+    print("Results will be uploaded to Trismik dashboard.\n")
+
+    results = evaluate(
+        inference,
+        dataset,
+        hyperparameters={
+            "system_message": "Answer the question directly and concisely.",
+        },
+        experiment_id="Uploading-Results-Example",  # Creates/uses this experiment
+        project_id="TRISMIK_PROJECT_ID",  # TODO: ADD YOUR TRISMIK PROJECT ID
+        metadata={
+            "model": model_name,
+            "description": "Example evaluation demonstrating result uploading",
+        },
+        return_aggregates=True,
+        return_items=True,
+        return_output=True,
+    )
+
+    print("\nResults uploaded successfully!")
+    pprint(results)
+
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="2-uploading_evaluate_results", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "2-uploading_evaluate_results_output.json")