scorebook 0.0.14__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +2 -0
- scorebook/dashboard/credentials.py +34 -4
- scorebook/eval_datasets/eval_dataset.py +2 -2
- scorebook/evaluate/_async/evaluate_async.py +27 -11
- scorebook/evaluate/_sync/evaluate.py +27 -11
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +8 -0
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/score_helpers.py +28 -11
- scorebook/types.py +2 -2
- scorebook/utils/progress_bars.py +58 -786
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/METADATA +32 -24
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -107
- scorebook-0.0.14.dist-info/RECORD +0 -53
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
{scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/METADATA
CHANGED
@@ -1,43 +1,41 @@
 Metadata-Version: 2.4
 Name: scorebook
-Version: 0.0.14
+Version: 0.0.15
 Summary: A Python project for LLM evaluation.
 License-File: LICENSE
 Author: Euan Campbell
 Author-email: euan@trismik.com
-Requires-Python: >=3.…
+Requires-Python: >=3.10, <3.14
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
-Provides-Extra: bedrock
 Provides-Extra: examples
-Provides-Extra: …
-Provides-Extra: …
-Provides-Extra: vertex
+Provides-Extra: metrics
+Provides-Extra: providers
 Requires-Dist: accelerate ; extra == "examples"
-Requires-Dist: …
+Requires-Dist: bert-score ; extra == "metrics"
+Requires-Dist: boto3 (==1.40.0) ; extra == "providers"
 Requires-Dist: datasets (>=3.6.0)
-Requires-Dist: fsspec[gcs] ; extra == "…"
-Requires-Dist: google-cloud-storage ; extra == "…"
-Requires-Dist: google-genai ; extra == "…"
-Requires-Dist: ipywidgets …
-Requires-Dist: …
+Requires-Dist: fsspec[gcs] ; extra == "providers"
+Requires-Dist: google-cloud-storage ; extra == "providers"
+Requires-Dist: google-genai ; extra == "providers"
+Requires-Dist: ipywidgets ; extra == "examples"
+Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
 Requires-Dist: notebook ; extra == "examples"
-Requires-Dist: openai ; extra == "…"
-Requires-Dist: pandas ; extra == "…"
-Requires-Dist: portkey-ai ; extra == "…"
-Requires-Dist: python-dotenv …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
+Requires-Dist: openai ; extra == "providers"
+Requires-Dist: pandas ; extra == "providers"
+Requires-Dist: portkey-ai ; extra == "providers"
+Requires-Dist: python-dotenv (>=1.0.0)
+Requires-Dist: rouge-score ; extra == "metrics"
+Requires-Dist: sacrebleu ; extra == "metrics"
+Requires-Dist: scikit-learn (>=1.0.0) ; extra == "metrics"
 Requires-Dist: torch ; extra == "examples"
 Requires-Dist: torchaudio ; extra == "examples"
 Requires-Dist: torchvision ; extra == "examples"
 Requires-Dist: transformers ; extra == "examples"
-Requires-Dist: trismik (…
+Requires-Dist: trismik (>=1.0.3)
 Description-Content-Type: text/markdown
 
 <h1 align="center">Scorebook</h1>
@@ -51,6 +49,9 @@ Description-Content-Type: text/markdown
     <img alt="Documentation" src="https://img.shields.io/badge/docs-Scorebook-blue?style=flat">
   </a>
   <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
+  <a target="_blank" href="https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb">
+    <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
+  </a>
 </p>
 
 Scorebook provides a flexible and extensible framework for evaluating models such as large language models (LLMs). Easily evaluate any model using evaluation datasets from Hugging Face such as MMLU-Pro, HellaSwag, and CommonSenseQA, or with data from any other source. Evaluations calculate scores for any number of specified metrics such as accuracy, precision, and recall, as well as any custom defined metrics, including LLM as a judge (LLMaJ).
@@ -251,9 +252,16 @@ results = evaluate(
 
 ## Metrics
 
-| Metric …
-| …
-| `Accuracy` …
+| Metric       | Sync/Async | Aggregate Scores | Item Scores |
+|--------------|------------|--------------------------------------------------|-----------------------------------------|
+| `Accuracy`   | Sync | `Float`: Percentage of correct outputs | `Boolean`: Exact match between output and label |
+| `ExactMatch` | Sync | `Float`: Percentage of exact string matches | `Boolean`: Exact match with optional case/whitespace normalization |
+| `F1`         | Sync | `Dict[str, Float]`: F1 scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
+| `Precision`  | Sync | `Dict[str, Float]`: Precision scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
+| `Recall`     | Sync | `Dict[str, Float]`: Recall scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
+| `BLEU`       | Sync | `Float`: Corpus-level BLEU score | `Float`: Sentence-level BLEU score |
+| `ROUGE`      | Sync | `Dict[str, Float]`: Average F1 scores per ROUGE type | `Dict[str, Float]`: F1 scores per ROUGE type |
+| `BertScore`  | Sync | `Dict[str, Float]`: Average precision, recall, and F1 scores | `Dict[str, Float]`: Precision, recall, and F1 scores per item |
 
 
 ## Tutorials
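To make the metrics table concrete, here is a minimal sketch assembled from the tutorial scripts added in this release (the `score()` call, the `output`/`label` item shape, and the `aggregate_results` key all appear in those scripts; the exact key names inside the aggregate dict are not shown in this diff, so treat the layout as indicative):

```python
from scorebook import score
from scorebook.metrics.f1 import F1

# Each item pairs a model output with its reference label.
items = [
    {"output": "4", "label": "4"},
    {"output": "Paris", "label": "Paris"},
    {"output": "London", "label": "Berlin"},
]

# Per the table, F1 aggregates to Dict[str, Float]: one score per averaging method.
results = score(items=items, metrics=F1(average=["micro", "macro"]), upload_results=False)
print(results["aggregate_results"])
```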
scorebook-0.0.15.dist-info/RECORD
ADDED
@@ -0,0 +1,110 @@
+scorebook/__init__.py,sha256=YB3PyPB0-sRicIwiQZ8aRhviu04_FZY8Ne5o-FuNWtA,1236
+scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
+scorebook/cli/auth.py,sha256=VGS5T0CSeS0n_7bntNggrYx-vDwxJJHdYxbKedFAq74,2939
+scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
+scorebook/dashboard/__init__.py,sha256=36DxO3oXVcZ2I6kizLFCcJkLBpXOU8UIXFT_ZjeFTB4,50
+scorebook/dashboard/create_project.py,sha256=RK90aMN0_XVM-DnawTY_b59yPJaRnpb_GoidCqXB5Vw,2845
+scorebook/dashboard/credentials.py,sha256=CCxafElx_pMLD-c69JvYAcC-9SzZf3tjAnJQUf8q5Us,4796
+scorebook/dashboard/upload_results.py,sha256=sdgOEf0C7QLt7t2QiXvSoceQpAiiPmlG_4SFEEzVPlc,9738
+scorebook/eval_datasets/__init__.py,sha256=wsmFNyuZJdBxjokcKG4NRfuUzPZKuzsKX3aG21zfFV4,39
+scorebook/eval_datasets/eval_dataset.py,sha256=R7upmIhvDTBwyfuFErfc2iICI6M6AkUTMeO0Oi9NFk0,28051
+scorebook/evaluate/__init__.py,sha256=Qqe-l4y3Nu81Fdx83RbtCQESoXC0XukBgOC3DPSWZZA,39
+scorebook/evaluate/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scorebook/evaluate/_async/evaluate_async.py,sha256=Y0bH4nSdwU13A1oUO3gM6oEyM4bCQCpr7fhUWDZZJpY,17198
+scorebook/evaluate/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scorebook/evaluate/_sync/evaluate.py,sha256=MQqKByraT22EUPfqoq1H4VIyRcPIo0ahv6KZpHgQX1A,16972
+scorebook/evaluate/evaluate_helpers.py,sha256=NnanxLEeHwoZNztGXQJc6u_WqKfDkn1vYmck2BrKF-c,17028
+scorebook/exceptions.py,sha256=3sxCWhFqYgXiWNUAMRR2ggLfqvbDI8e5vLjnT9V7X1M,3649
+scorebook/inference/__init__.py,sha256=gGuZG1rdpxKYC54q0eAS6oTHQbRYhgxlBeAqonqHvRU,60
+scorebook/inference/clients/__init__.py,sha256=VaLW7mi4tywJtR3Q9wr2pPci8NlEQ3bJanZyM5S81Z4,51
+scorebook/inference/clients/bedrock.py,sha256=bsnz0IB6ufjZVPd6syD3yVaOelerB5G_YAmPAVqmBmI,10071
+scorebook/inference/clients/openai.py,sha256=JPPcJdxYwwZNUfXCGTRRzzUUA8I8WiV3bu6-pgS1_UE,9043
+scorebook/inference/clients/portkey.py,sha256=RCuEZB8xNAVeGEt3IJ0esh_wqreZNB3jrDKiWH6miV0,5949
+scorebook/inference/clients/vertex.py,sha256=g6oNnag0qcLOYCtQ4SXAXfnqKtvPAVdigB--I7wU1yM,9871
+scorebook/inference/inference_pipeline.py,sha256=1qSmfI4fBJFS3EcAhRlA-f4-8aI6wDiupSJu-vNXoYI,5571
+scorebook/metrics/README.md,sha256=DTMgT-aYLqlx32vAPEOCvbfFNrwhYXd3z9g1oTjCPPc,4065
+scorebook/metrics/__init__.py,sha256=5m688iVI8k_NUzZJdOL_-IxvwOWU8j4fa6b1yOGPx7w,232
+scorebook/metrics/accuracy.py,sha256=0Qxu0nbzHhQbIdd5b3iOlbTnr4Nwi_wMh7hu4EStw4I,1284
+scorebook/metrics/bertscore.py,sha256=DDjdLZ8sBTROMF1UpGN5gyUYIkEYv7kQZkEpoMMYyuY,1853
+scorebook/metrics/bleu.py,sha256=xUAHIxWisvyq2f0k0zzrf3eEfCdHZ-VSrcmt73z4Xto,3075
+scorebook/metrics/core/__init__.py,sha256=dWfy6rXNvOxxdHSEvB_eJcbbHZ3xDxhGKEXYHAfrXKI,40
+scorebook/metrics/core/metric_base.py,sha256=XDHn-U8oUzhoPukKyYkg5xJLN05fSvnciAYCrCB28Mg,845
+scorebook/metrics/core/metric_registry.py,sha256=jWt8pB4q96po724TzOivjrabuOj6zBP9_uEdYqHxQlU,7242
+scorebook/metrics/exactmatch.py,sha256=xjd9iSlmdJ8Vk-LjfxAGJZyQ75FLjmeT4polG-P3JkY,3128
+scorebook/metrics/f1.py,sha256=R8yQ5mkb4bfPZM6a1X3IL1Q4OnUFD_--qwQdBSlBqaE,3710
+scorebook/metrics/precision.py,sha256=AFBDemPcs03txaihw3BqSoJfPQUFykZ_QR-IFx3du-s,3705
+scorebook/metrics/recall.py,sha256=ek5NV55p_Ux-2mT2xAkvQdVsg3xCJtJ3zT9O-vPEQyA,3658
+scorebook/metrics/rouge.py,sha256=ZDld4mrOmERRuEESgeZITZ-NYO3QzBTYRidULwuPWqA,3228
+scorebook/score/__init__.py,sha256=CqkslUvOw8QfCCbSfwZgGrbmXeSLpZqIVo4ntrctYuY,66
+scorebook/score/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scorebook/score/_async/score_async.py,sha256=SatV9hEUT8MAru2ACSyM03weKX6VTFx7crW59_uX0L8,6155
+scorebook/score/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scorebook/score/_sync/score.py,sha256=nANQbuyYyIaWnoTQzyGMwPZRMFP6MmyIyHb1GO1mktQ,6101
+scorebook/score/score_helpers.py,sha256=NQt5K-hI4EZErenMyAoR8iQa76KBVTAc-nmUMCbqh8U,8179
+scorebook/settings.py,sha256=qZrNiki6rFXn43udmhjQSmdDKOEaX62WYoEs2Rbggr0,720
+scorebook/types.py,sha256=NJZCV7hB-hKLN-Mmijtm0DoQVMUJVTahPo_n4nfQ5mE,4901
+scorebook/utils/__init__.py,sha256=oBTybVHI5EdHIgzb0TeoAnSLMQdUh20Ww6vcL9542Pk,72
+scorebook/utils/async_utils.py,sha256=2ewk_VOePib8z7DTRl-pZQBGzVI3L3JvnEuYW-DTkRA,1325
+scorebook/utils/common_helpers.py,sha256=lJIqO9XGf1T3S3rdGBTjZJ1BzVPvaU_XTONEfPApnEM,1218
+scorebook/utils/io_helpers.py,sha256=ORO6DwtXOKWJq9v_isuunUrz0viE3xy2qYO4lrgU-TM,1437
+scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
+scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
+scorebook/utils/mock_llm/__init__.py,sha256=dK70wNVBKk4hv1o3fceDTBG1_maFbkMvoOtTriPCe78,1293
+scorebook/utils/mock_llm/data/mock_llm_data.json,sha256=b28j7OCR0igpP0rkXDJAR2NWIiuVkOaAkzB-Miv665Y,381567
+scorebook/utils/progress_bars.py,sha256=4k52Y40oXaUo40ZPYGcMM3rFbWF7nsbzLo3DDpkYv9o,3531
+scorebook/utils/render_template.py,sha256=NOaZt-N1WcR5MA7at1XxzD-4sFMFKo9X0k7fKq6oSSM,1654
+scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
+tutorials/README.md,sha256=DETPgdCTTZLYMoVZ9DSfad1y7Jq4EMzi6UA9Ijm0GQc,5588
+tutorials/__init__.py,sha256=I1ki4-8iZsO9pbK4_xmRpjzSDYoH0JdYbtwzvgPgeo8,196
+tutorials/examples/1-score/1-scoring_model_accuracy.py,sha256=Uc9DPa31TOR_1egsUhLptUIGXb67FYLtLKpDPtQvcGA,1367
+tutorials/examples/1-score/2-scoring_model_bleu.py,sha256=pE4SMKR32Noa1eU8QUYU1a533WvXzyPi440jsgYs9gk,1991
+tutorials/examples/1-score/3-scoring_model_f1.py,sha256=g_uH_IgVNWtTaQ47MEBztUmGqgUn0y2464xNIoAN_dI,2078
+tutorials/examples/1-score/4-scoring_model_rouge.py,sha256=8HaPzv-Y8oTy88FtxiFR4i0SEd7BndxL_YgzpVSLEP8,2512
+tutorials/examples/1-score/5-scoring_model_exact_match.py,sha256=PcnrF7OfULRQd4veSKwlp5lQ1lMo749cDSdRu88sOVE,3235
+tutorials/examples/1-score/6-scoring_with_bertscore.py,sha256=_LRDrC5K_VmfxH37L7RbvLrQB4DpHbf-8K0Vr0R4cxc,2148
+tutorials/examples/1-score/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tutorials/examples/2-evaluate/1-evaluating_local_models.py,sha256=FSFWQUism2HWHELX3Q-F1jkRJ1feeyx1TZqttSRruPY,3630
+tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py,sha256=EKdyTP7svdtxeOdM0E569RZckwZ_GAH04ZMFi5kaVS8,3840
+tutorials/examples/2-evaluate/3-evaluating_cloud_models.py,sha256=C8QFmfkEd3APhh5k9LLgaxa3rlj1iQfQdV5k6jRAgx8,3566
+tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py,sha256=ckd8NZe9Qyv1FvCbuhAqj3qL1ZKBlSE1fdEy3_jxvXw,6338
+tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py,sha256=p0A_dwoVaGnAgpdRDu1MfC-HUp8UoBanSHvyJBpM97I,3954
+tutorials/examples/2-evaluate/6-inference_pipelines.py,sha256=fy0dmVXV1D7jfLq6Xhhqe3C3UYDBRRtPybvkGctV2E8,4991
+tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py,sha256=VihoUKmdaHfvJftyqFTSP0hkK1N1g1tAQZmw5yH438E,3458
+tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py,sha256=e9PvbrAoXzIVXtqlg9dZP6Xd8b81nKqzEuTuRXLhutk,3225
+tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py,sha256=1odW_2qFNyGES9L0bYWLvP8Cplc8ciIx42SZ_JZ6yis,3911
+tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv,sha256=MvTl1wWx-TX0LgR43BvPCrk6tKUuGItp7_jA7RcWfMw,374
+tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json,sha256=au86RNV2JRvw8IHNZK-768lWJdJQnCrod-6N-F-Chmk,921
+tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml,sha256=F_Aiuw1BJBUNFy_ENiZDiQvcT2FqNprlUb1Y9kouzhE,320
+tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml,sha256=-D9T78LJ1ihlobHgzrEeJIplimItj6pkzms3lWz50bI,362
+tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py,sha256=2k5U61z3g1kYEwrCW4W27Yt6dYYS-PGL-bVXmwWs5YE,4042
+tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py,sha256=KRhzAg6JxhF0ie3YeE2UXY34Lv6zfDuL_536L9i7Qdg,3598
+tutorials/examples/5-upload_results/1-uploading_score_results.py,sha256=VUq9WzymPzm663OEAGpWHh7Hze-tpo_S6pFbxlZsqjw,2914
+tutorials/examples/5-upload_results/2-uploading_evaluate_results.py,sha256=SkDMtw_I4SZO1ooQpvwmq_ptOMzbYSSZlYuDEU2uJb4,3808
+tutorials/examples/5-upload_results/3-uploading_your_results.py,sha256=XdTNTChV78oFAL9fa58YHHsGRNRFzcQ8qJdzCfDBFdI,5562
+tutorials/examples/6-providers/aws/__init__.py,sha256=hgW4VxBUWhsxOLGErdoA5fo9JnXoj1Hz3E1tNLulHSY,63
+tutorials/examples/6-providers/aws/batch_example.py,sha256=3SSx85E36NV2UTpB6N-FimXQRZL-jvVw9-wee0xodyk,7279
+tutorials/examples/6-providers/portkey/__init__.py,sha256=YoLlgr7ULGjAL6BwSJ6hMmLWhI_rlEHpN-sn9QS0fa4,67
+tutorials/examples/6-providers/portkey/batch_example.py,sha256=ea4BQhiiOKwSJSmW6uoxHLMQVk1Ji3k_k_CZ-y9ixJM,4011
+tutorials/examples/6-providers/portkey/messages_example.py,sha256=9wKGZlqksrSuoxgDY2w9IWts4fBzmZmijRdHZzSRAQs,4397
+tutorials/examples/6-providers/vertex/__init__.py,sha256=zVlz0-hpUZDgYFjM-OmxcPsKT1lTE-inZyRAf20aHD8,77
+tutorials/examples/6-providers/vertex/batch_example.py,sha256=rOQKQ4BO8VFkiLYXcCmZByc6zfqM-Iu3LUMvzP_oioU,5433
+tutorials/examples/6-providers/vertex/messages_example.py,sha256=MeH9AiNtpi0BYL_TRGEY-B-d7eS6qGdc3d0usXSWMQY,4989
+tutorials/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tutorials/notebooks/1-scoring.ipynb,sha256=0mWJlJuq2MY0OdS9Ts9BX8MH7G53Tv8rGJT4yEuCAv0,4663
+tutorials/notebooks/2-evaluating.ipynb,sha256=XjCtnypK0CJXDxI18RZocdvr28XWsJykGxs47Ig9SUc,9007
+tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb,sha256=r7_tAw-pswVxvVt8xA7-ewPHepMfkL1YA39p0u-UJyM,12269
+tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb,sha256=6dhU2JFL7Mebu2VslY21WFy8C9lrzmo0gE9DQk8kj9w,9160
+tutorials/notebooks/4-uploading_results.ipynb,sha256=a7DzNglzkBoCkN-dyncixzKNkEFalGaHWsLvhu3Kyd4,10361
+tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb,sha256=MZoHegB31jnI6wbXVtC0B2kCeNn_hyUf53m0B8G1LJw,7701
+tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb,sha256=1CIbdr_Y5XcL0iS9GcNs_Al62Mv2hSgdxvoj3lP1zU8,8428
+tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb,sha256=f7Kjy5AVK_nNxTd3RFtFPmFCEkhr3a17aUvVDvlGyKQ,9295
+tutorials/quickstarts/getting_started.ipynb,sha256=5083CRiTlH-LfGdwjKwBDtn5i2c4FQUGxn0bRgE_DGw,7117
+tutorials/utils/__init__.py,sha256=bI1-auvdIRV53jpT3bWRuKHUhZ92qHVJA_4S6xBB4P0,807
+tutorials/utils/args_parser.py,sha256=SZeqhNaSosw7iT5u0Wkb9m0XqXRx_DAitGXb7QK2nEM,4011
+tutorials/utils/output.py,sha256=4kVdySdxOl8cjSy-as4O71AhJOj7OlGXiOmlww4cZ-E,714
+tutorials/utils/setup.py,sha256=6obslk7PIgUlD4RyAi8SaeoB9ioxM0Vhda65ptovEKQ,3470
+scorebook-0.0.15.dist-info/METADATA,sha256=IOkRQaLVGMQIyfJbkIOqOZxZk90PCANAI3VP3w32PJg,10759
+scorebook-0.0.15.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
+scorebook-0.0.15.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
+scorebook-0.0.15.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
+scorebook-0.0.15.dist-info/RECORD,,
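Each RECORD row follows the standard wheel format `path,sha256=<digest>,size-in-bytes`, where the digest is an unpadded urlsafe-base64 SHA-256 of the file contents. A quick way to reproduce a digest for verification:

```python
import base64
import hashlib

def record_digest(path: str) -> str:
    """Compute the urlsafe-base64, unpadded SHA-256 digest used in wheel RECORD files."""
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
```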
tutorials/README.md
ADDED
@@ -0,0 +1,147 @@
+# Scorebook Tutorials
+
+This directory contains tutorials, examples, and quickstarts for learning Scorebook - a Python framework for evaluating large language models.
+
+## Directory Structure
+
+```
+tutorials/
+├── quickstarts/                 # Quick start notebooks for getting up and running
+├── notebooks/                   # Interactive Jupyter notebooks
+├── examples/                    # Python script examples organized by topic
+│   ├── 1-score/                 # Scoring pre-computed outputs
+│   ├── 2-evaluate/              # Running evaluations
+│   ├── 3-evaluation_datasets/   # Loading datasets
+│   ├── 4-adaptive_evaluations/  # Trismik adaptive testing
+│   ├── 5-upload_results/        # Uploading to Trismik
+│   └── 6-providers/             # Cloud provider integrations
+└── utils/                       # Helper utilities for examples
+```
+
+## Getting Started
+
+### Prerequisites
+
+- Python 3.9+
+- Install Scorebook: `pip install scorebook`
+- For local model examples: `pip install scorebook[examples]` (includes transformers, torch)
+- For cloud examples: `pip install scorebook[openai]` and set `OPENAI_API_KEY`
+- For Trismik features: Set `TRISMIK_API_KEY` environment variable
+
+### Quickstarts
+
+Start here for a rapid introduction:
+
+```bash
+jupyter notebook tutorials/quickstarts/getting_started.ipynb
+```
+
+Available quickstarts:
+- `getting_started.ipynb` - Introduction to Scorebook basics
+- `classical_evaluations/` - Standard evaluation workflows
+- `adaptive_evaluations/` - Trismik's adaptive testing feature
+
+### Notebooks
+
+Interactive tutorials covering core concepts:
+
+```bash
+jupyter notebook tutorials/notebooks/
+```
+
+| Notebook | Description |
+|----------|-------------|
+| `1-scoring.ipynb` | Score pre-computed model outputs |
+| `2-evaluating.ipynb` | Run full evaluation pipelines |
+| `3.1-adaptive_evaluation_phi.ipynb` | Adaptive evaluation with local models |
+| `3.2-adaptive_evaluation_gpt.ipynb` | Adaptive evaluation with OpenAI |
+| `4-uploading_results.ipynb` | Upload results to Trismik dashboard |
+
+## Examples
+
+Python scripts demonstrating specific features. Run examples from the project root:
+
+```bash
+python tutorials/examples/1-score/1-scoring_model_accuracy.py
+```
+
+### 1-score: Scoring Pre-computed Outputs
+
+Score model predictions that have already been generated.
+
+| Example | Description |
+|---------|-------------|
+| `1-scoring_model_accuracy.py` | Score outputs using accuracy metric |
+| `2-scoring_model_bleu.py` | Score using BLEU metric |
+| `3-scoring_model_f1.py` | Score using F1 metric |
+| `4-scoring_model_rouge.py` | Score using ROUGE metric |
+
+### 2-evaluate: Running Evaluations
+
+End-to-end evaluation workflows with inference.
+
+| Example | Description | Requirements |
+|---------|-------------|--------------|
+| `1-evaluating_local_models.py` | Basic evaluation with local HuggingFace model | - |
+| `2-evaluating_local_models_with_batching.py` | Batch processing for improved throughput | - |
+| `3-evaluating_cloud_models.py` | Evaluate using OpenAI API | OpenAI API key |
+| `4-evaluating_cloud_models_with_batching.py` | OpenAI Batch API for cost savings | OpenAI API key |
+| `5-hyperparameter_sweeps.py` | Test multiple hyperparameter configurations | - |
+| `6-inference_pipelines.py` | Modular preprocessing/inference/postprocessing | - |
+
+### 3-evaluation_datasets: Loading Datasets
+
+Different ways to load evaluation data.
+
+| Example | Description | Requirements |
+|---------|-------------|--------------|
+| `1-evaluation_datasets_from_files.py` | Load from JSON/CSV files | - |
+| `2-evaluation_datasets_from_huggingface.py` | Load from HuggingFace Hub | OpenAI API key |
+| `3-evaluation_datasets_from_huggingface_with_yaml_configs.py` | Use YAML configs for HuggingFace datasets | OpenAI API key |
+
+### 4-adaptive_evaluations: Trismik Adaptive Testing
+
+Efficient evaluation using Item Response Theory (IRT).
+
+| Example | Description | Requirements |
+|---------|-------------|--------------|
+| `1-adaptive_evaluation.py` | Basic adaptive evaluation | Trismik + OpenAI |
+| `2-adaptive_dataset_splits.py` | Adaptive evaluation with dataset splits | Trismik + OpenAI |
+
+### 5-upload_results: Uploading to Trismik
+
+Persist and share results on the Trismik dashboard.
+
+| Example | Description | Requirements |
+|---------|-------------|--------------|
+| `1-uploading_score_results.py` | Upload `score()` results | Trismik API key |
+| `2-uploading_evaluate_results.py` | Upload `evaluate()` results | Trismik API key |
+| `3-uploading_your_results.py` | Upload custom results | Trismik API key |
+
+### 6-providers: Cloud Provider Integrations
+
+Batch processing with different cloud providers.
+
+#### AWS Bedrock (`6-providers/aws/`)
+- `batch_example.py` - Batch inference with Claude models via AWS Bedrock
+
+**Requirements:** AWS CLI configured, S3 bucket, IAM role for Bedrock
+
+#### Google Cloud Vertex AI (`6-providers/vertex/`)
+- `batch_example.py` - Batch inference with Gemini models
+- `messages_example.py` - Real-time inference with Gemini
+
+**Requirements:** Google Cloud SDK, Vertex AI enabled project
+
+#### Portkey (`6-providers/portkey/`)
+- `batch_example.py` - Batch inference via Portkey gateway
+- `messages_example.py` - Real-time inference via Portkey
+
+**Requirements:** Portkey API key, linked provider account
+
+## Additional Resources
+
+- [Scorebook Documentation](https://docs.trismik.com/)
+- [Trismik Platform](https://trismik.com)
+- [API Reference](https://docs.trismik.com/category/reference/)
+- [GitHub Issues](https://github.com/trismik/scorebook/issues)
+- Contact support at support@trismik.com
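Every example script below imports `save_results_to_json` and `setup_logging` from `tutorials/utils`, whose source is not included in this diff. A hypothetical sketch, consistent only with how the scripts call them (`setup_logging(experiment_id=..., base_dir=...)` returning a log path, and `save_results_to_json(results, output_dir, filename)`):

```python
# Hypothetical reconstruction of the tutorials/utils helpers; the real module is not shown in this diff.
import json
import logging
from pathlib import Path
from typing import Any


def setup_logging(experiment_id: str, base_dir: Path) -> Path:
    """Send log output to a per-experiment file under base_dir/logs and return its path."""
    log_dir = base_dir / "logs"
    log_dir.mkdir(exist_ok=True)
    log_file = log_dir / f"{experiment_id}.log"
    logging.basicConfig(filename=log_file, level=logging.INFO)
    return log_file


def save_results_to_json(results: Any, output_dir: Path, filename: str) -> None:
    """Serialize a results object to pretty-printed JSON in output_dir."""
    with open(output_dir / filename, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)
```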
tutorials/examples/1-score/1-scoring_model_accuracy.py
ADDED
@@ -0,0 +1,47 @@
+"""Tutorials - Score - Example 1 - Scoring Models."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any
+
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import score
+from scorebook.metrics.accuracy import Accuracy
+
+
+def main() -> Any:
+    """Score pre-computed model predictions using Scorebook.
+
+    This example demonstrates how to score generated model predictions.
+    """
+
+    # Prepare a list of items with generated outputs and labels
+    model_predictions = [
+        {"output": "4", "label": "4"},
+        {"output": "Paris", "label": "Paris"},
+        {"output": "George R. R. Martin", "label": "William Shakespeare"},
+    ]
+
+    # Score the predictions against labels using the accuracy metric
+    results = score(
+        items=model_predictions,
+        metrics=Accuracy,
+        upload_results=False,  # Disable uploading for this example
+    )
+
+    print("\nResults:")
+    pprint(results)
+
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="1-scoring_model_accuracy", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "1-scoring_model_accuracy_output.json")
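Per the metrics table earlier, `Accuracy` reduces to a per-item exact-match boolean aggregated to the fraction of correct outputs, which is easy to verify by hand for these three predictions:

```python
predictions = [
    {"output": "4", "label": "4"},
    {"output": "Paris", "label": "Paris"},
    {"output": "George R. R. Martin", "label": "William Shakespeare"},
]

item_scores = [p["output"] == p["label"] for p in predictions]  # [True, True, False]
accuracy = sum(item_scores) / len(item_scores)                  # 2 of 3 correct -> 0.666...
print(item_scores, accuracy)
```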
tutorials/examples/1-score/2-scoring_model_bleu.py
ADDED
@@ -0,0 +1,46 @@
+"""Tutorials - Score - Example 2 - Scoring Models with BLEU."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any
+
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+from scorebook import score
+from scorebook.metrics.bleu import BLEU
+
+
+def main() -> Any:
+    """Score pre-computed model predictions using Scorebook.
+
+    This example demonstrates how to score generated model predictions.
+    """
+
+    # Prepare a list of items with generated outputs and labels
+    model_predictions = [
+        {"output": "28-jähriger Koch wurde in San Francisco Mall entdeckt.", "label": "28-jähriger Koch in San Francisco Mall tot aufgefunden"},
+        {"output": "Ein 28-jähriger Koch, der kürzlich nach San Francisco gezogen war, wurde in der Treppe eines lokalen Einkaufszentrums dieser Woche ermordet.", "label": "Ein 28-jähriger Koch, der vor kurzem nach San Francisco gezogen ist, wurde im Treppenhaus eines örtlichen Einkaufzentrums tot aufgefunden."},
+        {"output": 'Der Bruder des Opfers sagt, er könne sich nicht vorstellen, wer ihm schaden wolle, und sagt: "Die Dinge waren endlich gut für ihn."', "label": 'Der Bruder des Opfers sagte aus, dass er sich niemanden vorstellen kann, der ihm schaden wollen würde, "Endlich ging es bei ihm wieder bergauf."'},
+    ]
+
+    # Score the predictions against labels using the BLEU metric
+    results = score(
+        items=model_predictions,
+        metrics=BLEU(compact=False),
+        upload_results=False,  # Disable uploading for this example
+    )
+
+    print("\nResults:")
+    pprint(results)
+
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="2-scoring_model_bleu", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "2-scoring_model_bleu_output.json")
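The `metrics` extra added in this release pulls in sacrebleu, which presumably backs this BLEU metric (the meaning of `compact=False` is not shown in this diff). The corpus- vs sentence-level split from the metrics table maps onto sacrebleu like this:

```python
import sacrebleu

outputs = ["28-jähriger Koch wurde in San Francisco Mall entdeckt."]
references = [["28-jähriger Koch in San Francisco Mall tot aufgefunden"]]

# Aggregate score: one BLEU over the whole corpus of outputs.
corpus = sacrebleu.corpus_bleu(outputs, references)
# Item score: BLEU for a single sentence against its references.
sentence = sacrebleu.sentence_bleu(outputs[0], references[0])
print(corpus.score, sentence.score)
```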
tutorials/examples/1-score/3-scoring_model_f1.py
ADDED
@@ -0,0 +1,64 @@
+"""Tutorials - Score - Example 3 - F1 Metric Scoring."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any
+
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+from scorebook import score
+from scorebook.metrics.f1 import F1
+
+
+def main() -> Any:
+    """Score pre-computed model predictions using F1 metric.
+
+    This example demonstrates how to score NER (Named Entity Recognition)
+    predictions using the F1 metric with different averaging methods.
+    """
+
+    # Sample NER predictions (in CoNLL format with BIO tags)
+    model_predictions = [
+        {"output": "O", "label": "O"},
+        {"output": "B-PER", "label": "B-PER"},
+        {"output": "I-PER", "label": "I-PER"},
+        {"output": "O", "label": "O"},
+        {"output": "B-LOC", "label": "B-LOC"},
+        {"output": "O", "label": "O"},
+        {"output": "B-ORG", "label": "B-LOC"},  # Misclassification
+        {"output": "O", "label": "B-MISC"},  # Missed entity
+        {"output": "B-PER", "label": "B-PER"},
+        {"output": "O", "label": "O"},
+    ]
+
+    print(f"Scoring {len(model_predictions)} NER predictions\n")
+
+    # Score with all averaging methods at once
+    print("All averaging methods:")
+    results_all = score(
+        items=model_predictions,
+        metrics=F1(average="all"),
+        upload_results=False,
+    )
+    pprint(results_all["aggregate_results"])
+
+    # Score with a specific combination of methods
+    print("\nMicro and weighted averaging:")
+    results_combo = score(
+        items=model_predictions,
+        metrics=F1(average=["micro", "weighted"]),
+        upload_results=False,
+    )
+    pprint(results_combo["aggregate_results"])
+
+    return results_all
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="3-scoring_f1_metric", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "3-scoring_f1_metric_output.json")
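scikit-learn is the other new `metrics` dependency, and the averaging names in this example match its `average` parameter. A sketch of what each method computes on the BIO tags above, assuming scorebook delegates to something like this:

```python
from sklearn.metrics import f1_score

outputs = ["O", "B-PER", "I-PER", "O", "B-LOC", "O", "B-ORG", "O", "B-PER", "O"]
labels = ["O", "B-PER", "I-PER", "O", "B-LOC", "O", "B-LOC", "B-MISC", "B-PER", "O"]

for average in ("micro", "macro", "weighted"):
    # micro counts every tag equally; macro averages per-class F1 scores;
    # weighted weights each class's F1 by its frequency in the labels.
    print(average, round(f1_score(labels, outputs, average=average, zero_division=0), 4))
```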
tutorials/examples/1-score/4-scoring_model_rouge.py
ADDED
@@ -0,0 +1,64 @@
+"""Tutorials - Score - Example 4 - Scoring Models with ROUGE."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any
+
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import score
+from scorebook.metrics.rouge import ROUGE
+
+
+def main() -> Any:
+    """Score text generation predictions using ROUGE metric.
+
+    This example demonstrates how to score generated summaries
+    against reference summaries using ROUGE scores.
+    """
+
+    # Prepare a list of items with generated summaries and reference summaries
+    model_predictions = [
+        {
+            "output": "A woman donated her kidney to a stranger. This sparked a chain of six kidney transplants.",
+            "label": "Zully Broussard decided to give a kidney to a stranger. A new computer program helped her donation spur transplants for six kidney patients.",
+        },
+        {
+            "output": "Scientists discovered a new species of frog in the Amazon rainforest. The frog has unique markings that distinguish it from other species.",
+            "label": "A new frog species with distinctive blue and yellow stripes was found in the Amazon. Researchers say this discovery highlights the biodiversity of the region.",
+        },
+        {
+            "output": "The technology company released its quarterly earnings report showing strong growth.",
+            "label": "Tech giant announces record quarterly revenue driven by cloud services and AI products.",
+        },
+    ]
+
+    # Score the predictions against labels using the ROUGE metric
+    results = score(
+        items=model_predictions,
+        metrics=ROUGE(rouge_types=["rouge1", "rougeL"], use_stemmer=True),
+        upload_results=False,  # Disable uploading for this example
+    )
+
+    print("\nResults:")
+    pprint(results)
+
+    # Display individual item scores
+    print("\n\nIndividual ROUGE Scores:")
+    for i, item_score in enumerate(results["item_results"]):
+        print(f"\nItem {i+1}:")
+        print(f"  ROUGE-1 F1: {item_score['rouge1']:.4f}")
+        print(f"  ROUGE-L F1: {item_score['rougeL']:.4f}")
+
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="4-scoring_model_rouge", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "4-scoring_model_rouge_output.json")
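ROUGE support presumably wraps the new rouge-score dependency; the `rouge_types` and `use_stemmer` arguments mirror that library's `RougeScorer` constructor. A minimal sketch of the underlying per-item computation, whose `fmeasure` values correspond to the F1 scores printed above:

```python
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
scores = scorer.score(
    "Zully Broussard decided to give a kidney to a stranger.",  # reference summary
    "A woman donated her kidney to a stranger.",                # model output
)
print(scores["rouge1"].fmeasure, scores["rougeL"].fmeasure)
```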