scorebook 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in the public registry, and is provided for informational purposes only.
Files changed (96)
  1. scorebook/__init__.py +12 -5
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/dashboard/__init__.py +1 -0
  4. scorebook/dashboard/create_project.py +91 -0
  5. scorebook/{trismik → dashboard}/credentials.py +57 -12
  6. scorebook/{trismik → dashboard}/upload_results.py +1 -1
  7. scorebook/eval_datasets/__init__.py +0 -4
  8. scorebook/eval_datasets/eval_dataset.py +4 -2
  9. scorebook/evaluate/__init__.py +1 -15
  10. scorebook/evaluate/_async/evaluate_async.py +36 -19
  11. scorebook/evaluate/_sync/evaluate.py +36 -19
  12. scorebook/evaluate/evaluate_helpers.py +4 -3
  13. scorebook/inference/__init__.py +1 -11
  14. scorebook/inference/clients/__init__.py +1 -8
  15. scorebook/inference/inference_pipeline.py +1 -1
  16. scorebook/metrics/README.md +121 -0
  17. scorebook/metrics/__init__.py +7 -16
  18. scorebook/metrics/accuracy.py +2 -6
  19. scorebook/metrics/bertscore.py +50 -0
  20. scorebook/metrics/bleu.py +82 -0
  21. scorebook/metrics/core/__init__.py +1 -0
  22. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  23. scorebook/metrics/core/metric_registry.py +195 -0
  24. scorebook/metrics/exactmatch.py +95 -0
  25. scorebook/metrics/f1.py +96 -0
  26. scorebook/metrics/precision.py +84 -9
  27. scorebook/metrics/recall.py +94 -0
  28. scorebook/metrics/rouge.py +85 -0
  29. scorebook/score/__init__.py +0 -5
  30. scorebook/score/_async/score_async.py +3 -2
  31. scorebook/score/_sync/score.py +3 -2
  32. scorebook/score/score_helpers.py +29 -12
  33. scorebook/types.py +3 -3
  34. scorebook/utils/__init__.py +0 -22
  35. scorebook/utils/common_helpers.py +1 -1
  36. scorebook/utils/mock_llm/__init__.py +41 -0
  37. scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
  38. scorebook/utils/progress_bars.py +58 -786
  39. scorebook-0.0.15.dist-info/METADATA +300 -0
  40. scorebook-0.0.15.dist-info/RECORD +110 -0
  41. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
  42. tutorials/README.md +147 -0
  43. tutorials/__init__.py +5 -0
  44. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  45. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  46. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  47. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  48. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  49. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  50. tutorials/examples/1-score/__init__.py +0 -0
  51. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  52. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  53. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  54. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  55. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  56. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  57. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  58. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  59. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  60. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  61. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  62. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  63. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  64. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  65. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  66. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  67. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  68. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  69. tutorials/examples/6-providers/aws/__init__.py +1 -0
  70. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  71. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  72. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  73. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  74. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  75. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  76. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  77. tutorials/examples/__init__.py +0 -0
  78. tutorials/notebooks/1-scoring.ipynb +162 -0
  79. tutorials/notebooks/2-evaluating.ipynb +316 -0
  80. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  81. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  82. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  83. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  84. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  85. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  86. tutorials/quickstarts/getting_started.ipynb +197 -0
  87. tutorials/utils/__init__.py +35 -0
  88. tutorials/utils/args_parser.py +132 -0
  89. tutorials/utils/output.py +23 -0
  90. tutorials/utils/setup.py +98 -0
  91. scorebook/metrics/metric_registry.py +0 -105
  92. scorebook/trismik/__init__.py +0 -10
  93. scorebook-0.0.13.dist-info/METADATA +0 -389
  94. scorebook-0.0.13.dist-info/RECORD +0 -50
  95. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
  96. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
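
The new tutorials/examples/1-score scripts and the reorganised metrics package (scorebook/metrics/core/metric_registry.py plus the added bleu, rouge, f1, exactmatch, bertscore and recall modules) exercise the scoring workflow built around the top-level score() function. As a quick orientation before the file contents below, here is a minimal sketch of that call, based on the usage shown in tutorials/examples/5-upload_results/1-uploading_score_results.py further down this diff; which optional keyword arguments score() accepts for a purely local run is an assumption beyond what that tutorial shows.

from scorebook import score
from scorebook.metrics.accuracy import Accuracy

# Pre-computed outputs scored against labels, mirroring the upload tutorial's item shape.
items = [
    {"input": "What is 2 + 2?", "output": "4", "label": "4"},
    {"input": "What is the capital of France?", "output": "Paris", "label": "Paris"},
]

results = score(
    items=items,
    metrics=Accuracy,
    dataset_name="basic_questions",
    model_name="my-model",     # illustrative name, not from the package
    upload_results=False,      # assumption: skip the Trismik upload for a local run
)
print(results)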

tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py
@@ -0,0 +1,110 @@
+"""Tutorials - Evaluation Datasets - Example 3 - Loading from YAML Config."""
+
+import asyncio
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+from dotenv import load_dotenv
+from openai import AsyncOpenAI
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate_async
+
+
+async def main() -> Any:
+    """Run evaluations using datasets loaded from YAML configuration files.
+
+    This example demonstrates how to use YAML configuration files to define
+    dataset loading parameters. YAML configs are useful for:
+    - Storing dataset configurations in version control
+    - Reusing the same dataset configuration across projects
+    - Defining complex prompt templates and field mappings
+
+    The YAML files contain:
+    - HuggingFace dataset path and split information
+    - Metrics to use for evaluation
+    - Jinja2 templates for input and label formatting
+    - Metadata about the dataset
+
+    Prerequisites:
+    - OpenAI API key set in environment variable OPENAI_API_KEY
+    """
+
+    # Initialize OpenAI client
+    client = AsyncOpenAI()
+    model_name = "gpt-4o-mini"
+
+    # Define an async inference function
+    async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through OpenAI's API.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Build messages for OpenAI API
+            messages = [
+                {
+                    "role": "system",
+                    "content": "Answer the multiple choice question by selecting the correct letter (A, B, C, D, etc.). Provide ONLY the letter of your answer, no additional text or explanation.",
+                },
+                {"role": "user", "content": str(input_val)},
+            ]
+
+            # Call OpenAI API
+            try:
+                response = await client.chat.completions.create(
+                    model=model_name,
+                    messages=messages,
+                    temperature=0.7,
+                )
+                output = response.choices[0].message.content.strip()
+            except Exception as e:
+                output = f"Error: {str(e)}"
+
+            outputs.append(output)
+
+        return outputs
+
+    # Construct paths to YAML config files
+    yaml_configs_dir = Path(__file__).parent / "example_yaml_configs"
+    cais_mmlu_yaml = yaml_configs_dir / "Cais-MMLU.yaml"
+    tiger_mmlu_pro_yaml = yaml_configs_dir / "TIGER-Lab-MMLU-Pro.yaml"
+
+    # Load Cais-MMLU dataset from YAML configuration
+    cais_mmlu = EvalDataset.from_yaml(str(cais_mmlu_yaml))
+    print(f"Loaded {cais_mmlu.name} from YAML config: {len(cais_mmlu.items)} items")
+
+    # Load TIGER-Lab MMLU-Pro dataset from YAML configuration
+    tiger_mmlu_pro = EvalDataset.from_yaml(str(tiger_mmlu_pro_yaml))
+    print(f"Loaded {tiger_mmlu_pro.name} from YAML config: {len(tiger_mmlu_pro.items)} items")
+
+    # Run evaluation on both datasets
+    results = await evaluate_async(
+        inference,
+        datasets=[cais_mmlu, tiger_mmlu_pro],
+        sample_size=5,  # Sample 5 items from each dataset for quick testing
+        return_aggregates=True,
+        return_items=True,
+        return_output=True,
+        upload_results=False,
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="3-evaluation_datasets_from_yaml", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = asyncio.run(main())
+    save_results_to_json(results_dict, output_dir, "3-evaluation_datasets_from_yaml_output.json")

tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv
@@ -0,0 +1,11 @@
+question,answer
+What is 2 + 2?,4
+What is the capital of France?,Paris
+Who wrote Romeo and Juliet?,William Shakespeare
+What is 5 * 6?,30
+What is the largest planet in our solar system?,Jupiter
+Who painted the Mona Lisa?,Leonardo da Vinci
+What is the square root of 64?,8
+What is the capital of Japan?,Tokyo
+Who invented the telephone?,Alexander Graham Bell
+What is 12 - 7?,5

tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json
@@ -0,0 +1,42 @@
+[
+    {
+        "question": "What is 2 + 2?",
+        "answer": "4"
+    },
+    {
+        "question": "What is the capital of France?",
+        "answer": "Paris"
+    },
+    {
+        "question": "Who wrote Romeo and Juliet?",
+        "answer": "William Shakespeare"
+    },
+    {
+        "question": "What is 5 * 6?",
+        "answer": "30"
+    },
+    {
+        "question": "What is the largest planet in our solar system?",
+        "answer": "Jupiter"
+    },
+    {
+        "question": "Who painted the Mona Lisa?",
+        "answer": "Leonardo da Vinci"
+    },
+    {
+        "question": "What is the square root of 64?",
+        "answer": "8"
+    },
+    {
+        "question": "What is the capital of Japan?",
+        "answer": "Tokyo"
+    },
+    {
+        "question": "Who invented the telephone?",
+        "answer": "Alexander Graham Bell"
+    },
+    {
+        "question": "What is 12 - 7?",
+        "answer": "5"
+    }
+]
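
The basic_questions files above are the example datasets consumed by EvalDataset.from_json in tutorials/examples/5-upload_results/2-uploading_evaluate_results.py further down this diff. A minimal loading sketch using that same call; the relative path and the .name/.items accesses (borrowed from the YAML tutorial above) are illustrative rather than guaranteed by the diff.

from scorebook import EvalDataset

dataset = EvalDataset.from_json(
    path="tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json",
    metrics="accuracy",
    input="question",  # dataset field mapped to the model input
    label="answer",    # dataset field mapped to the reference label
)
print(f"Loaded {dataset.name}: {len(dataset.items)} items")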

tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml
@@ -0,0 +1,19 @@
+path: "cais/mmlu"
+name: "Cais-MMLU"
+split: "test"
+config: "all"
+metrics:
+  - "accuracy"
+
+templates:
+  input: |
+    {{ question }}
+
+    A. {{ choices[0] }}
+    B. {{ choices[1] }}
+    C. {{ choices[2] }}
+    D. {{ choices[3] }}
+  label: "{{ answer }}"
+
+metadata:
+  description: "MMLU multiple choice questions from Cais"
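
A short illustration, separate from the package code, of what the input template in Cais-MMLU.yaml renders to for one cais/mmlu-style item (fields question, choices, answer), using jinja2 directly rather than scorebook's own template handling.

from jinja2 import Template

input_template = Template(
    "{{ question }}\n"
    "\n"
    "A. {{ choices[0] }}\n"
    "B. {{ choices[1] }}\n"
    "C. {{ choices[2] }}\n"
    "D. {{ choices[3] }}\n"
)

item = {
    "question": "What is the capital of France?",  # illustrative item, not from the dataset
    "choices": ["Berlin", "Paris", "Madrid", "Rome"],
    "answer": 1,
}
print(input_template.render(**item))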

tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml
@@ -0,0 +1,18 @@
+path: "TIGER-Lab/MMLU-Pro"
+name: "TIGER-Lab/MMLU-Pro"
+split: "validation"
+config: "default"
+metrics:
+  - "accuracy"
+
+templates:
+  input: |
+    {{ question }}
+    Options:
+    {% for option in options %}
+    {{ number_to_letter(loop.index0) }} : {{ option }}
+    {% endfor %}
+  label: "{{ answer }}"
+
+metadata:
+  description: "MMLU-Pro multiple choice questions"
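
The MMLU-Pro template above calls a number_to_letter helper inside the Jinja2 loop; its implementation is not part of this diff. The sketch below only illustrates the behaviour the template appears to rely on (index 0 maps to "A"), and is an assumption rather than scorebook's actual helper.

import string

def number_to_letter(index: int) -> str:
    # Assumed behaviour: 0 -> "A", 1 -> "B", ... for enumerating answer options.
    return string.ascii_uppercase[index]

print(number_to_letter(0), number_to_letter(3))  # A D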

tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py
@@ -0,0 +1,114 @@
+"""Tutorials - Adaptive Evaluations - Example 1 - Adaptive Evaluation."""
+
+import asyncio
+import string
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+from dotenv import load_dotenv
+from openai import AsyncOpenAI
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import evaluate_async, login
+
+
+async def main() -> Any:
+    """Run an adaptive evaluation using Trismik's adaptive testing.
+
+    This example demonstrates how to use Trismik's adaptive evaluation feature.
+    Adaptive evaluations use Item Response Theory (IRT) to efficiently estimate
+    model capabilities by selecting questions based on previous responses.
+
+    Benefits of adaptive evaluation:
+    - More efficient: Fewer questions needed to assess capability
+    - Precise measurement: Better statistical confidence intervals
+    - Optimal difficulty: Questions adapt to model's skill level
+
+    Prerequisites:
+    - Valid Trismik API key set in TRISMIK_API_KEY environment variable
+    - A Trismik project ID
+    - OpenAI API key set in OPENAI_API_KEY environment variable
+    """
+
+    # Initialize OpenAI client
+    client = AsyncOpenAI()
+    model_name = "gpt-4o-mini"
+
+    # Define an async inference function
+    async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through OpenAI's API.
+
+        Args:
+            inputs: Input values from an EvalDataset. For adaptive MMLU-Pro,
+                each input is a dict with 'question' and 'options' keys.
+            hyperparameters: Model hyperparameters.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Handle dict input from adaptive dataset
+            if isinstance(input_val, dict):
+                prompt = input_val.get("question", "")
+                if "options" in input_val:
+                    prompt += "\nOptions:\n" + "\n".join(
+                        f"{letter}: {choice}"
+                        for letter, choice in zip(string.ascii_uppercase, input_val["options"])
+                    )
+            else:
+                prompt = str(input_val)
+
+            # Build messages for OpenAI API
+            messages = [
+                {
+                    "role": "system",
+                    "content": "Answer the question with a single letter representing the correct answer from the list of choices. Do not provide any additional explanation or output beyond the single letter.",
+                },
+                {"role": "user", "content": prompt},
+            ]
+
+            # Call OpenAI API
+            try:
+                response = await client.chat.completions.create(
+                    model=model_name,
+                    messages=messages,
+                    temperature=0.7,
+                )
+                output = response.choices[0].message.content.strip()
+            except Exception as e:
+                output = f"Error: {str(e)}"
+
+            outputs.append(output)
+
+        return outputs
+
+    # Step 1: Log in with your Trismik API key
+    # login() reads TRISMIK_API_KEY from environment variables or .env file
+    login()
+
+    # Step 2: Run adaptive evaluation
+    results = await evaluate_async(
+        inference,
+        datasets="trismik/headQA:adaptive",  # Adaptive datasets have the ":adaptive" suffix
+        experiment_id="Adaptive-Head-QA-Evaluation",
+        project_id='TRISMIK-PROJECT-ID',
+        return_dict=True,
+        return_aggregates=True,
+        return_items=True,
+        return_output=True,
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="1-adaptive_evaluation", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = asyncio.run(main())
+    save_results_to_json(results_dict, output_dir, "1-adaptive_evaluation_output.json")

tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py
@@ -0,0 +1,106 @@
+"""Tutorials - Adaptive Evaluations - Example 2 - Adaptive Dataset Splits."""
+
+import asyncio
+import string
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+from dotenv import load_dotenv
+from openai import AsyncOpenAI
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import evaluate_async, login
+
+
+async def main() -> Any:
+    """
+
+    Prerequisites:
+    - Valid Trismik API key set in TRISMIK_API_KEY environment variable
+    - A Trismik project ID
+    - OpenAI API key set in OPENAI_API_KEY environment variable
+    """
+
+    # Initialize OpenAI client
+    client = AsyncOpenAI()
+    model_name = "gpt-4o-mini"
+
+    # Define an async inference function
+    async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through OpenAI's API.
+
+        Args:
+            inputs: Input values from an EvalDataset. For adaptive headQA,
+                each input is a dict with 'question' and 'options' keys.
+            hyperparameters: Model hyperparameters.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Handle dict input from adaptive dataset
+            if isinstance(input_val, dict):
+                prompt = input_val.get("question", "")
+                if "options" in input_val:
+                    prompt += "\nOptions:\n" + "\n".join(
+                        f"{letter}: {choice}"
+                        for letter, choice in zip(string.ascii_uppercase, input_val["options"])
+                    )
+            else:
+                prompt = str(input_val)
+
+            # Build messages for OpenAI API
+            messages = [
+                {
+                    "role": "system",
+                    "content": "Answer the question with a single letter representing the correct answer from the list of choices. Do not provide any additional explanation or output beyond the single letter.",
+                },
+                {"role": "user", "content": prompt},
+            ]
+
+            # Call OpenAI API
+            try:
+                response = await client.chat.completions.create(
+                    model=model_name,
+                    messages=messages,
+                    temperature=0.7,
+                )
+                output = response.choices[0].message.content.strip()
+            except Exception as e:
+                output = f"Error: {str(e)}"
+
+            outputs.append(output)
+
+        return outputs
+
+    # Step 1: Log in with your Trismik API key
+    # login() reads TRISMIK_API_KEY from environment variables or .env file
+    login()
+
+    # Step 2: Run adaptive evaluation
+    results = await evaluate_async(
+        inference,
+        datasets="trismik/headQA:adaptive",  # Adaptive datasets have the ":adaptive" suffix
+        split="test",  # Specify the test split for evaluation
+        experiment_id="Adaptive-Head-QA-Test-Evaluation",
+        project_id='TRISMIK-PROJECT-ID',
+        return_dict=True,
+        return_aggregates=True,
+        return_items=True,
+        return_output=True,
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="2-adaptive_dataset_splits", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = asyncio.run(main())
+    save_results_to_json(results_dict, output_dir, "2-adaptive_dataset_splits_output.json")

tutorials/examples/5-upload_results/1-uploading_score_results.py
@@ -0,0 +1,92 @@
+"""Tutorials - Upload Results - Example 1 - Uploading score() Results."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any
+
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import login, score
+from scorebook.metrics.accuracy import Accuracy
+
+
+def main() -> Any:
+    """Score pre-computed outputs and upload results to Trismik's dashboard.
+
+    This example demonstrates how to upload score() results to Trismik.
+    The score() function is used when you already have model outputs and
+    want to score them against labels.
+
+    Use score() when you want to:
+    - Score pre-computed model outputs
+    - Re-score existing results with different metrics
+    - Upload scoring results without re-running inference
+
+    Prerequisites:
+    - Valid Trismik API key set in TRISMIK_API_KEY environment variable
+    - A Trismik project ID
+    """
+
+    # Prepare items with pre-computed outputs and labels
+    items = [
+        {
+            "input": "What is 2 + 2?",
+            "output": "4",
+            "label": "4"
+        },
+        {
+            "input": "What is the capital of France?",
+            "output": "Paris",
+            "label": "Paris"
+        },
+        {
+            "input": "Who wrote Romeo and Juliet?",
+            "output": "William Shakespeare",
+            "label": "William Shakespeare"
+        },
+        {
+            "input": "What is 5 * 6?",
+            "output": "30",
+            "label": "30"
+        },
+        {
+            "input": "What is the largest planet in our solar system?",
+            "output": "Jupiter",
+            "label": "Jupiter"
+        },
+    ]
+
+    # Step 1: Log in with your Trismik API key
+    # login() reads TRISMIK_API_KEY from environment variables or .env file
+    login()
+
+    # Step 2: Score the outputs and upload results
+    # When you provide experiment_id and project_id, results are automatically uploaded
+    results = score(
+        items=items,
+        metrics=Accuracy,
+        dataset_name="basic_questions",
+        model_name="gpt-4o-mini",
+        experiment_id="Score-Upload-Example",
+        project_id="TRISMIK_PROJECT_ID",  # TODO: ADD YOUR TRISMIK PROJECT ID
+        metadata={
+            "description": "Example demonstrating score() result uploading",
+            "note": "These are pre-computed outputs",
+        },
+        upload_results=True,  # Explicitly enable uploading
+    )
+
+    print("\nResults uploaded successfully!")
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="1-uploading_score_results", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "1-uploading_score_results_output.json")

tutorials/examples/5-upload_results/2-uploading_evaluate_results.py
@@ -0,0 +1,117 @@
+"""Tutorials - Upload Results - Example 2 - Uploading evaluate() Results."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+import transformers
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate, login
+
+
+def main() -> Any:
+    """Run an evaluation and upload results to Trismik's dashboard.
+
+    This example demonstrates how to upload evaluate() results to Trismik.
+    The evaluate() function runs inference on a dataset and automatically
+    uploads the results when you provide experiment_id and project_id.
+
+    Use evaluate() when you want to:
+    - Run inference AND score in one step
+    - Track full evaluation runs with hyperparameters
+    - Compare different models on the same dataset
+
+    Prerequisites:
+    - Valid Trismik API key set in TRISMIK_API_KEY environment variable
+    - A Trismik project ID
+    """
+
+    # Initialize HuggingFace model pipeline
+    model_name = "microsoft/Phi-4-mini-instruct"
+    pipeline = transformers.pipeline(
+        "text-generation",
+        model=model_name,
+        model_kwargs={"torch_dtype": "auto"},
+        device_map="auto",
+    )
+
+    # Define an inference function
+    def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through the model.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Build messages
+            messages = [
+                {"role": "system", "content": hyperparameters["system_message"]},
+                {"role": "user", "content": str(input_val)},
+            ]
+
+            # Run inference
+            result = pipeline(messages)
+
+            # Extract the answer
+            output = str(result[0]["generated_text"][-1]["content"])
+            outputs.append(output)
+
+        return outputs
+
+    # Load evaluation dataset
+    dataset_path = Path(__file__).parent.parent / "3-evaluation_datasets" / "example_datasets" / "basic_questions.json"
+    dataset = EvalDataset.from_json(
+        path=str(dataset_path),
+        metrics="accuracy",
+        input="question",
+        label="answer",
+    )
+
+    # Step 1: Log in with your Trismik API key
+    # login() reads TRISMIK_API_KEY from environment variables or .env file
+    login()
+
+
+    # Step 2: Run evaluation with result uploading
+    # When you provide experiment_id and project_id, results are automatically uploaded
+    print(f"\nRunning evaluation with model: {model_name}")
+    print("Results will be uploaded to Trismik dashboard.\n")
+
+    results = evaluate(
+        inference,
+        dataset,
+        hyperparameters={
+            "system_message": "Answer the question directly and concisely.",
+        },
+        experiment_id="Uploading-Results-Example",  # Creates/uses this experiment
+        project_id="TRISMIK_PROJECT_ID",  # TODO: ADD YOUR TRISMIK PROJECT ID
+        metadata={
+            "model": model_name,
+            "description": "Example evaluation demonstrating result uploading",
+        },
+        return_aggregates=True,
+        return_items=True,
+        return_output=True,
+    )
+
+    print("\nResults uploaded successfully!")
+    pprint(results)
+
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="2-uploading_evaluate_results", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "2-uploading_evaluate_results_output.json")