scorebook 0.0.14__tar.gz → 0.0.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. {scorebook-0.0.14 → scorebook-0.0.16}/PKG-INFO +32 -24
  2. {scorebook-0.0.14 → scorebook-0.0.16}/README.md +13 -3
  3. {scorebook-0.0.14 → scorebook-0.0.16}/pyproject.toml +20 -17
  4. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/__init__.py +2 -0
  5. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/dashboard/credentials.py +34 -4
  6. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/eval_datasets/eval_dataset.py +2 -2
  7. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/evaluate/_async/evaluate_async.py +27 -11
  8. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/evaluate/_sync/evaluate.py +27 -11
  9. scorebook-0.0.16/src/scorebook/metrics/README.md +121 -0
  10. scorebook-0.0.16/src/scorebook/metrics/__init__.py +9 -0
  11. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/metrics/accuracy.py +2 -6
  12. scorebook-0.0.16/src/scorebook/metrics/bertscore.py +50 -0
  13. scorebook-0.0.16/src/scorebook/metrics/bleu.py +82 -0
  14. scorebook-0.0.16/src/scorebook/metrics/core/__init__.py +1 -0
  15. {scorebook-0.0.14/src/scorebook/metrics → scorebook-0.0.16/src/scorebook/metrics/core}/metric_base.py +1 -2
  16. scorebook-0.0.16/src/scorebook/metrics/core/metric_registry.py +195 -0
  17. scorebook-0.0.16/src/scorebook/metrics/exactmatch.py +95 -0
  18. scorebook-0.0.16/src/scorebook/metrics/f1.py +96 -0
  19. scorebook-0.0.16/src/scorebook/metrics/precision.py +94 -0
  20. scorebook-0.0.16/src/scorebook/metrics/recall.py +94 -0
  21. scorebook-0.0.16/src/scorebook/metrics/rouge.py +85 -0
  22. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/score/score_helpers.py +28 -11
  23. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/types.py +2 -2
  24. scorebook-0.0.16/src/scorebook/utils/progress_bars.py +128 -0
  25. scorebook-0.0.16/tutorials/README.md +147 -0
  26. scorebook-0.0.16/tutorials/__init__.py +5 -0
  27. scorebook-0.0.16/tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  28. scorebook-0.0.16/tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  29. scorebook-0.0.16/tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  30. scorebook-0.0.16/tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  31. scorebook-0.0.16/tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  32. scorebook-0.0.16/tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  33. scorebook-0.0.16/tutorials/examples/1-score/__init__.py +0 -0
  34. scorebook-0.0.16/tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  35. scorebook-0.0.16/tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  36. scorebook-0.0.16/tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  37. scorebook-0.0.16/tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  38. scorebook-0.0.16/tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  39. scorebook-0.0.16/tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  40. scorebook-0.0.16/tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  41. scorebook-0.0.16/tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  42. scorebook-0.0.16/tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  43. scorebook-0.0.16/tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  44. scorebook-0.0.16/tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  45. scorebook-0.0.16/tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  46. scorebook-0.0.16/tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  47. scorebook-0.0.16/tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  48. scorebook-0.0.16/tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  49. scorebook-0.0.16/tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  50. scorebook-0.0.16/tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  51. scorebook-0.0.16/tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  52. scorebook-0.0.16/tutorials/examples/6-providers/aws/__init__.py +1 -0
  53. scorebook-0.0.16/tutorials/examples/6-providers/aws/batch_example.py +219 -0
  54. scorebook-0.0.16/tutorials/examples/6-providers/portkey/__init__.py +1 -0
  55. scorebook-0.0.16/tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  56. scorebook-0.0.16/tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  57. scorebook-0.0.16/tutorials/examples/6-providers/vertex/__init__.py +1 -0
  58. scorebook-0.0.16/tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  59. scorebook-0.0.16/tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  60. scorebook-0.0.16/tutorials/examples/__init__.py +0 -0
  61. scorebook-0.0.16/tutorials/notebooks/1-scoring.ipynb +162 -0
  62. scorebook-0.0.16/tutorials/notebooks/2-evaluating.ipynb +316 -0
  63. scorebook-0.0.16/tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  64. scorebook-0.0.16/tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  65. scorebook-0.0.16/tutorials/notebooks/4-uploading_results.ipynb +175 -0
  66. scorebook-0.0.16/tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  67. scorebook-0.0.16/tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  68. scorebook-0.0.16/tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  69. scorebook-0.0.16/tutorials/quickstarts/getting_started.ipynb +197 -0
  70. scorebook-0.0.16/tutorials/utils/__init__.py +35 -0
  71. scorebook-0.0.16/tutorials/utils/args_parser.py +132 -0
  72. scorebook-0.0.16/tutorials/utils/output.py +23 -0
  73. scorebook-0.0.16/tutorials/utils/setup.py +98 -0
  74. scorebook-0.0.14/src/scorebook/metrics/__init__.py +0 -1
  75. scorebook-0.0.14/src/scorebook/metrics/metric_registry.py +0 -107
  76. scorebook-0.0.14/src/scorebook/metrics/precision.py +0 -19
  77. scorebook-0.0.14/src/scorebook/utils/progress_bars.py +0 -856
  78. {scorebook-0.0.14 → scorebook-0.0.16}/LICENSE +0 -0
  79. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/cli/__init__.py +0 -0
  80. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/cli/auth.py +0 -0
  81. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/cli/main.py +0 -0
  82. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/dashboard/__init__.py +0 -0
  83. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/dashboard/create_project.py +0 -0
  84. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/dashboard/upload_results.py +0 -0
  85. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/eval_datasets/__init__.py +0 -0
  86. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/evaluate/__init__.py +0 -0
  87. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/evaluate/_async/__init__.py +0 -0
  88. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/evaluate/_sync/__init__.py +0 -0
  89. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/evaluate/evaluate_helpers.py +0 -0
  90. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/exceptions.py +0 -0
  91. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/inference/__init__.py +0 -0
  92. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/inference/clients/__init__.py +0 -0
  93. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/inference/clients/bedrock.py +0 -0
  94. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/inference/clients/openai.py +0 -0
  95. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/inference/clients/portkey.py +0 -0
  96. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/inference/clients/vertex.py +0 -0
  97. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/inference/inference_pipeline.py +0 -0
  98. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/score/__init__.py +0 -0
  99. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/score/_async/__init__.py +0 -0
  100. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/score/_async/score_async.py +0 -0
  101. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/score/_sync/__init__.py +0 -0
  102. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/score/_sync/score.py +0 -0
  103. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/settings.py +0 -0
  104. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/utils/__init__.py +0 -0
  105. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/utils/async_utils.py +0 -0
  106. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/utils/common_helpers.py +0 -0
  107. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/utils/io_helpers.py +0 -0
  108. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/utils/jinja_helpers.py +0 -0
  109. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/utils/mappers.py +0 -0
  110. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/utils/mock_llm/__init__.py +0 -0
  111. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/utils/mock_llm/data/mock_llm_data.json +0 -0
  112. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/utils/render_template.py +0 -0
  113. {scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/utils/transform_helpers.py +0 -0
{scorebook-0.0.14 → scorebook-0.0.16}/PKG-INFO

@@ -1,43 +1,41 @@
  Metadata-Version: 2.4
  Name: scorebook
- Version: 0.0.14
+ Version: 0.0.16
  Summary: A Python project for LLM evaluation.
  License-File: LICENSE
  Author: Euan Campbell
  Author-email: euan@trismik.com
- Requires-Python: >=3.9, <3.14
+ Requires-Python: >=3.10
  Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
- Provides-Extra: bedrock
  Provides-Extra: examples
- Provides-Extra: openai
- Provides-Extra: portkey
- Provides-Extra: vertex
+ Provides-Extra: metrics
+ Provides-Extra: providers
  Requires-Dist: accelerate ; extra == "examples"
- Requires-Dist: boto3 (==1.40.0) ; extra == "bedrock"
+ Requires-Dist: bert-score ; extra == "metrics"
+ Requires-Dist: boto3 (==1.40.0) ; extra == "providers"
  Requires-Dist: datasets (>=3.6.0)
- Requires-Dist: fsspec[gcs] ; extra == "vertex"
- Requires-Dist: google-cloud-storage ; extra == "vertex"
- Requires-Dist: google-genai ; extra == "vertex"
- Requires-Dist: ipywidgets (>=8.0.0)
- Requires-Dist: notebook (>=7.4.5,<8.0.0)
+ Requires-Dist: fsspec[gcs] ; extra == "providers"
+ Requires-Dist: google-cloud-storage ; extra == "providers"
+ Requires-Dist: google-genai ; extra == "providers"
+ Requires-Dist: ipywidgets ; extra == "examples"
+ Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
  Requires-Dist: notebook ; extra == "examples"
- Requires-Dist: openai ; extra == "openai"
- Requires-Dist: pandas ; extra == "vertex"
- Requires-Dist: portkey-ai ; extra == "portkey"
- Requires-Dist: python-dotenv ; extra == "bedrock"
- Requires-Dist: python-dotenv ; extra == "openai"
- Requires-Dist: python-dotenv ; extra == "portkey"
- Requires-Dist: python-dotenv ; extra == "vertex"
+ Requires-Dist: openai ; extra == "providers"
+ Requires-Dist: pandas ; extra == "providers"
+ Requires-Dist: portkey-ai ; extra == "providers"
+ Requires-Dist: python-dotenv (>=1.0.0)
+ Requires-Dist: rouge-score ; extra == "metrics"
+ Requires-Dist: sacrebleu ; extra == "metrics"
+ Requires-Dist: scikit-learn (>=1.0.0) ; extra == "metrics"
  Requires-Dist: torch ; extra == "examples"
  Requires-Dist: torchaudio ; extra == "examples"
  Requires-Dist: torchvision ; extra == "examples"
  Requires-Dist: transformers ; extra == "examples"
- Requires-Dist: trismik (==1.0.2)
+ Requires-Dist: trismik (>=1.0.3)
  Description-Content-Type: text/markdown

  <h1 align="center">Scorebook</h1>
@@ -51,6 +49,9 @@ Description-Content-Type: text/markdown
  <img alt="Documentation" src="https://img.shields.io/badge/docs-Scorebook-blue?style=flat">
  </a>
  <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
+ <a target="_blank" href="https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb">
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
+ </a>
  </p>

  Scorebook provides a flexible and extensible framework for evaluating models such as large language models (LLMs). Easily evaluate any model using evaluation datasets from Hugging Face such as MMLU-Pro, HellaSwag, and CommonSenseQA, or with data from any other source. Evaluations calculate scores for any number of specified metrics such as accuracy, precision, and recall, as well as any custom defined metrics, including LLM as a judge (LLMaJ).
@@ -251,9 +252,16 @@ results = evaluate(

  ## Metrics

- | Metric | Sync/Async | Aggregate Scores | Item Scores |
- |------------|------------|--------------------------------------------------|-----------------------------------------|
- | `Accuracy` | Sync | `Float`: Percentage of correct outputs | `Boolean`: Exact match between output and label |
+ | Metric | Sync/Async | Aggregate Scores | Item Scores |
+ |--------------|------------|--------------------------------------------------|-----------------------------------------|
+ | `Accuracy` | Sync | `Float`: Percentage of correct outputs | `Boolean`: Exact match between output and label |
+ | `ExactMatch` | Sync | `Float`: Percentage of exact string matches | `Boolean`: Exact match with optional case/whitespace normalization |
+ | `F1` | Sync | `Dict[str, Float]`: F1 scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
+ | `Precision` | Sync | `Dict[str, Float]`: Precision scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
+ | `Recall` | Sync | `Dict[str, Float]`: Recall scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
+ | `BLEU` | Sync | `Float`: Corpus-level BLEU score | `Float`: Sentence-level BLEU score |
+ | `ROUGE` | Sync | `Dict[str, Float]`: Average F1 scores per ROUGE type | `Dict[str, Float]`: F1 scores per ROUGE type |
+ | `BertScore` | Sync | `Dict[str, Float]`: Average precision, recall, and F1 scores | `Dict[str, Float]`: Precision, recall, and F1 scores per item |


  ## Tutorials
{scorebook-0.0.14 → scorebook-0.0.16}/README.md

@@ -9,6 +9,9 @@
  <img alt="Documentation" src="https://img.shields.io/badge/docs-Scorebook-blue?style=flat">
  </a>
  <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
+ <a target="_blank" href="https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb">
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
+ </a>
  </p>

  Scorebook provides a flexible and extensible framework for evaluating models such as large language models (LLMs). Easily evaluate any model using evaluation datasets from Hugging Face such as MMLU-Pro, HellaSwag, and CommonSenseQA, or with data from any other source. Evaluations calculate scores for any number of specified metrics such as accuracy, precision, and recall, as well as any custom defined metrics, including LLM as a judge (LLMaJ).
@@ -209,9 +212,16 @@ results = evaluate(

  ## Metrics

- | Metric | Sync/Async | Aggregate Scores | Item Scores |
- |------------|------------|--------------------------------------------------|-----------------------------------------|
- | `Accuracy` | Sync | `Float`: Percentage of correct outputs | `Boolean`: Exact match between output and label |
+ | Metric | Sync/Async | Aggregate Scores | Item Scores |
+ |--------------|------------|--------------------------------------------------|-----------------------------------------|
+ | `Accuracy` | Sync | `Float`: Percentage of correct outputs | `Boolean`: Exact match between output and label |
+ | `ExactMatch` | Sync | `Float`: Percentage of exact string matches | `Boolean`: Exact match with optional case/whitespace normalization |
+ | `F1` | Sync | `Dict[str, Float]`: F1 scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
+ | `Precision` | Sync | `Dict[str, Float]`: Precision scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
+ | `Recall` | Sync | `Dict[str, Float]`: Recall scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
+ | `BLEU` | Sync | `Float`: Corpus-level BLEU score | `Float`: Sentence-level BLEU score |
+ | `ROUGE` | Sync | `Dict[str, Float]`: Average F1 scores per ROUGE type | `Dict[str, Float]`: F1 scores per ROUGE type |
+ | `BertScore` | Sync | `Dict[str, Float]`: Average precision, recall, and F1 scores | `Dict[str, Float]`: Precision, recall, and F1 scores per item |


  ## Tutorials
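The expanded table documents a common contract: every metric returns an aggregate score plus a list of per-item scores. Below is a minimal sketch of exercising that contract directly, assuming the static `score()` interface described in the new `src/scorebook/metrics/README.md` further down this diff; the sample outputs, labels, and printed values are illustrative only.

```python
# Illustrative only: calling a metric's score() directly, assuming the
# (aggregate_scores, item_scores) contract documented in the metrics README.
from scorebook.metrics.accuracy import Accuracy

outputs = ["B", "C", "A"]  # hypothetical model outputs
labels = ["B", "C", "D"]   # hypothetical ground-truth labels

aggregate, item_scores = Accuracy.score(outputs, labels)
print(aggregate)    # expected shape: {"accuracy": 0.67} (approximate value)
print(item_scores)  # expected shape: [True, True, False]
```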
{scorebook-0.0.14 → scorebook-0.0.16}/pyproject.toml

@@ -7,23 +7,26 @@ authors = [
      { name = "Marco Basaldella", email = "marco@trismik.com" }
  ]
  readme = "README.md"
- requires-python = ">=3.9, <3.14"
+ requires-python = ">=3.10"
  dependencies = [
      "datasets>=3.6.0",
-     "notebook (>=7.4.5,<8.0.0)",
-     "trismik==1.0.2",
-     "ipywidgets>=8.0.0",
+     "trismik>=1.0.3",
+     "python-dotenv>=1.0.0",
+     "jinja2 (>=3.1.6,<4.0.0)",
  ]

  [project.scripts]
  scorebook = "scorebook.cli.main:main"

  [tool.poetry]
- version = "0.0.14" # base version
- packages = [{ include = "scorebook", from = "src" }]
+ version = "0.0.16" # base version
+ packages = [
+     { include = "scorebook", from = "src" },
+     { include = "tutorials" }
+ ]

  [tool.poetry.dependencies]
- python = ">=3.9,<3.14"
+ python = ">=3.10,<3.14"

  [[tool.poetry.source]]
  name = "testpypi"
@@ -42,17 +45,15 @@ mypy = "^1.15.0"
  autoflake = "^2.3.1"
  toml = "^0.10.2"
  types-pyyaml = "^6.0.12.20250822"
- unasync = {version = "^0.5.0", python = ">=3.9,<4"}
+ unasync = {version = "^0.5.0", python = ">=3.10,<4"}
  tomlkit = "^0.13.2"
  detect-secrets = "^1.5.0"
+ setuptools = "^75.0.0"

  [project.optional-dependencies]
- openai = ["openai", "python-dotenv"]
- portkey = ["portkey-ai", "python-dotenv"]
- bedrock = ["boto3==1.40.0", "python-dotenv"]
- vertex = ["google-genai", "pandas", "google-cloud-storage", "fsspec[gcs]", "python-dotenv"]
- examples = ["transformers", "torch", "torchvision", "torchaudio", "accelerate", "notebook"]
-
+ providers = ["openai", "portkey-ai", "boto3==1.40.0", "google-genai", "pandas", "google-cloud-storage", "fsspec[gcs]"]
+ examples = ["transformers", "torch", "torchvision", "torchaudio", "accelerate", "notebook", "ipywidgets"]
+ metrics = ["sacrebleu", "rouge-score", "scikit-learn>=1.0.0", "bert-score"]

  [build-system]
  requires = ["poetry-core"]
@@ -60,14 +61,16 @@ build-backend = "poetry.core.masonry.api"

  [tool.pytest.ini_options]
  asyncio_default_fixture_loop_scope = "class"
+ testpaths = ["tests/unit"]
  markers = [
-     "unit: Unit tests that use mocks and don't require external dependencies",
+     "unit: Unit tests using only core dependencies (no optional packages)",
+     "metrics: Tests requiring metrics extras (sklearn, sacrebleu, rouge-score, bert-score)",
      "integration: Integration tests that may require network access or external services",
  ]

  [tool.black]
  line-length = 100
- target-version = ['py39']
+ target-version = ['py310']
  include = '\.pyi?$'

  [tool.isort]
@@ -76,7 +79,7 @@ line_length = 100
  multi_line_output = 3

  [tool.mypy]
- python_version = "3.9"
+ python_version = "3.10"
  warn_return_any = true
  warn_unused_configs = true
  disallow_untyped_defs = true
{scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/__init__.py

@@ -16,6 +16,7 @@ from scorebook.eval_datasets.eval_dataset import EvalDataset
  from scorebook.evaluate._async.evaluate_async import evaluate_async
  from scorebook.evaluate._sync.evaluate import evaluate
  from scorebook.inference.inference_pipeline import InferencePipeline
+ from scorebook.metrics.core.metric_registry import scorebook_metric
  from scorebook.score._async.score_async import score_async
  from scorebook.score._sync.score import score
  from scorebook.utils.render_template import render_template
@@ -35,4 +36,5 @@ __all__ = [
      "create_project_async",
      "upload_result",
      "upload_result_async",
+     "scorebook_metric",
  ]
{scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/dashboard/credentials.py

@@ -3,8 +3,10 @@
  import logging
  import os
  import pathlib
+ import warnings
  from typing import Optional

+ from dotenv import load_dotenv
  from trismik import TrismikClient

  from scorebook.settings import TRISMIK_SERVICE_URL
@@ -92,16 +94,44 @@ def validate_token(token: str) -> bool:
      return False


- def login(trismik_api_key: str) -> None:
+ def login(trismik_api_key: Optional[str] = None) -> None:
      """Login to trismik by saving API key locally.

+     If no API key is provided, the function will attempt to read it from the
+     TRISMIK_API_KEY environment variable or .env file (using python-dotenv).
+     Environment variables take precedence over .env file values.
+
      Args:
-         trismik_api_key: The API key to use.
+         trismik_api_key: The API key to use. If not provided, reads from
+             environment or .env file.
      Raises:
-         ValueError: If API key is empty or invalid.
+         ValueError: If API key is empty, not found, or invalid.
+
+     Warns:
+         UserWarning: If an explicit API key is passed but TRISMIK_API_KEY
+             environment variable is also set.
      """
+     # Warn if user passes explicit key but env var is also set
+     if trismik_api_key is not None and os.environ.get("TRISMIK_API_KEY"):
+         warnings.warn(
+             "TRISMIK_API_KEY environment variable is set. The environment variable "
+             "takes precedence over the stored token when calling evaluate(). "
+             "To use the explicitly provided key, unset the TRISMIK_API_KEY "
+             "environment variable.",
+             UserWarning,
+             stacklevel=2,
+         )
+
+     if trismik_api_key is None:
+         # Load from .env file if TRISMIK_API_KEY is not already set in environment
+         load_dotenv()
+         trismik_api_key = os.environ.get("TRISMIK_API_KEY")
+
      if not trismik_api_key:
-         raise ValueError("API key cannot be empty")
+         raise ValueError(
+             "API key cannot be empty. Either pass it as a parameter or "
+             "set the TRISMIK_API_KEY environment variable or .env file."
+         )

      # Validate token
      if not validate_token(trismik_api_key):
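A short sketch of the two login flows this docstring describes, assuming `login` is imported from `scorebook.dashboard.credentials`; the key strings are placeholders.

```python
# Sketch of the new optional-key login() behaviour (placeholder key values).
import os

from scorebook.dashboard.credentials import login

# No argument: login() falls back to TRISMIK_API_KEY from the environment
# or a local .env file (loaded via python-dotenv).
os.environ["TRISMIK_API_KEY"] = "placeholder-key"
login()

# Explicit argument while TRISMIK_API_KEY is also set: the key is still
# validated and stored, but a UserWarning notes that the environment
# variable takes precedence when evaluate() runs.
login(trismik_api_key="another-placeholder-key")
```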
{scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/eval_datasets/eval_dataset.py

@@ -18,8 +18,8 @@ from scorebook.exceptions import (
      DatasetSampleError,
      MissingFieldError,
  )
- from scorebook.metrics.metric_base import MetricBase
- from scorebook.metrics.metric_registry import MetricRegistry
+ from scorebook.metrics.core.metric_base import MetricBase
+ from scorebook.metrics.core.metric_registry import MetricRegistry
  from scorebook.utils.io_helpers import validate_path
  from scorebook.utils.render_template import render_template

{scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/evaluate/_async/evaluate_async.py

@@ -113,8 +113,6 @@ async def evaluate_async(
      with evaluation_progress_context(
          total_eval_runs=len(eval_run_specs),
          total_items=total_items,
-         dataset_count=len(datasets),
-         hyperparam_count=len(hyperparameter_configs),
          model_display=model_display,
          enabled=show_progress_bars,
      ) as progress_bars:
@@ -151,19 +149,31 @@ async def execute_runs(
      async def worker(
          run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
      ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+         # Create progress callback for adaptive evals
+         on_progress: Optional[Callable[[int, int], None]] = None
+         if progress_bars is not None and isinstance(run, AdaptiveEvalRunSpec):
+
+             def _on_progress(current: int, total: int) -> None:
+                 progress_bars.on_item_progress(current, total)
+
+             on_progress = _on_progress
+
          # Execute run (score_async handles upload internally for classic evals)
          run_result = await execute_run(
-             inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
+             inference,
+             run,
+             upload_results,
+             experiment_id,
+             project_id,
+             metadata,
+             trismik_client,
+             on_progress,
          )

          # Update progress bars with items processed and success status
          if progress_bars is not None:
-             # Classic evals have .items; adaptive evals use max_iterations
-             items_processed = (
-                 len(run.dataset.items)
-                 if isinstance(run, EvalRunSpec)
-                 else evaluation_settings["max_iterations"]
-             )
+             # Classic evals: update items count; Adaptive evals: items already tracked via callback
+             items_processed = len(run.dataset.items) if isinstance(run, EvalRunSpec) else 0
              progress_bars.on_run_completed(items_processed, run_result.run_completed)

          # Update upload progress for classic evals
@@ -195,11 +205,12 @@ async def execute_runs(
  async def execute_run(
      inference: Callable,
      run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
-     upload_results: bool,  # NEW PARAMETER
+     upload_results: bool,
      experiment_id: Optional[str] = None,
      project_id: Optional[str] = None,
      metadata: Optional[Dict[str, Any]] = None,
      trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+     on_progress: Optional[Callable[[int, int], None]] = None,
  ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
      """Execute a single evaluation run."""

@@ -218,6 +229,7 @@ async def execute_run(
              resolved_project_id,
              metadata,
              trismik_client,
+             on_progress,
          )

      else:
@@ -338,6 +350,7 @@ async def execute_adaptive_eval_run(
      project_id: str,
      metadata: Optional[Dict[str, Any]] = None,
      trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+     on_progress: Optional[Callable[[int, int], None]] = None,
  ) -> AdaptiveEvalRunResult:
      """Execute an adaptive evaluation run."""
      logger.debug("Executing adaptive run for %s", run)
@@ -347,7 +360,7 @@ async def execute_adaptive_eval_run(
          raise ScoreBookError("Trismik client is required for adaptive evaluation")

      adaptive_eval_run_result = await run_adaptive_evaluation(
-         inference, run, experiment_id, project_id, metadata, trismik_client
+         inference, run, experiment_id, project_id, metadata, trismik_client, on_progress
      )
      logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)

@@ -365,6 +378,7 @@ async def run_adaptive_evaluation(
      project_id: str,
      metadata: Any,
      trismik_client: Union[TrismikClient, TrismikAsyncClient],
+     on_progress: Optional[Callable[[int, int], None]] = None,
  ) -> AdaptiveEvalRunResult:
      """Run an adaptive evaluation using the Trismik API.

@@ -375,6 +389,7 @@
          project_id: Trismik project ID
          metadata: Additional metadata
          trismik_client: Trismik client instance
+         on_progress: Optional callback for progress updates (current, total)
      Returns:
          Results from the adaptive evaluation
      """
@@ -404,6 +419,7 @@
              inference_setup={},
          ),
          item_processor=make_trismik_inference(inference_with_hyperparams),
+         on_progress=on_progress,
          return_dict=False,
      )

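Both the async and sync evaluation paths now thread an optional `(current, total)` callback from the worker down into the adaptive runner. Here is a standalone sketch of that pattern using invented names (`process_items`, `main`) rather than Scorebook internals.

```python
# Illustrative pattern: an optional progress callback threaded into a runner,
# mirroring the on_progress plumbing added above.
from typing import Callable, Optional


def process_items(total: int, on_progress: Optional[Callable[[int, int], None]] = None) -> None:
    """Process `total` items, reporting (current, total) after each one if a callback is given."""
    for current in range(1, total + 1):
        ...  # per-item work would happen here
        if on_progress is not None:
            on_progress(current, total)


def main() -> None:
    def _on_progress(current: int, total: int) -> None:
        print(f"{current}/{total} items processed")

    process_items(3, on_progress=_on_progress)  # prints 1/3, 2/3, 3/3
    process_items(3)                            # no callback, no reporting


main()
```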
{scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/evaluate/_sync/evaluate.py

@@ -112,8 +112,6 @@ def evaluate(
      with evaluation_progress_context(
          total_eval_runs=len(eval_run_specs),
          total_items=total_items,
-         dataset_count=len(datasets),
-         hyperparam_count=len(hyperparameter_configs),
          model_display=model_display,
          enabled=show_progress_bars,
      ) as progress_bars:
@@ -150,19 +148,31 @@ def execute_runs(
      def worker(
          run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
      ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+         # Create progress callback for adaptive evals
+         on_progress: Optional[Callable[[int, int], None]] = None
+         if progress_bars is not None and isinstance(run, AdaptiveEvalRunSpec):
+
+             def _on_progress(current: int, total: int) -> None:
+                 progress_bars.on_item_progress(current, total)
+
+             on_progress = _on_progress
+
          # Execute run (score_async handles upload internally for classic evals)
          run_result = execute_run(
-             inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
+             inference,
+             run,
+             upload_results,
+             experiment_id,
+             project_id,
+             metadata,
+             trismik_client,
+             on_progress,
          )

          # Update progress bars with items processed and success status
          if progress_bars is not None:
-             # Classic evals have .items; adaptive evals use max_iterations
-             items_processed = (
-                 len(run.dataset.items)
-                 if isinstance(run, EvalRunSpec)
-                 else evaluation_settings["max_iterations"]
-             )
+             # Classic evals: update items count; Adaptive evals: items already tracked via callback
+             items_processed = len(run.dataset.items) if isinstance(run, EvalRunSpec) else 0
              progress_bars.on_run_completed(items_processed, run_result.run_completed)

          # Update upload progress for classic evals
@@ -194,11 +204,12 @@ def execute_runs(
  def execute_run(
      inference: Callable,
      run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
-     upload_results: bool,  # NEW PARAMETER
+     upload_results: bool,
      experiment_id: Optional[str] = None,
      project_id: Optional[str] = None,
      metadata: Optional[Dict[str, Any]] = None,
      trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+     on_progress: Optional[Callable[[int, int], None]] = None,
  ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
      """Execute a single evaluation run."""

@@ -217,6 +228,7 @@ def execute_run(
              resolved_project_id,
              metadata,
              trismik_client,
+             on_progress,
          )

      else:
@@ -337,6 +349,7 @@ def execute_adaptive_eval_run(
      project_id: str,
      metadata: Optional[Dict[str, Any]] = None,
      trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+     on_progress: Optional[Callable[[int, int], None]] = None,
  ) -> AdaptiveEvalRunResult:
      """Execute an adaptive evaluation run."""
      logger.debug("Executing adaptive run for %s", run)
@@ -346,7 +359,7 @@ def execute_adaptive_eval_run(
          raise ScoreBookError("Trismik client is required for adaptive evaluation")

      adaptive_eval_run_result = run_adaptive_evaluation(
-         inference, run, experiment_id, project_id, metadata, trismik_client
+         inference, run, experiment_id, project_id, metadata, trismik_client, on_progress
      )
      logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)

@@ -364,6 +377,7 @@ def run_adaptive_evaluation(
      project_id: str,
      metadata: Any,
      trismik_client: Union[TrismikClient, TrismikAsyncClient],
+     on_progress: Optional[Callable[[int, int], None]] = None,
  ) -> AdaptiveEvalRunResult:
      """Run an adaptive evaluation using the Trismik API.

@@ -374,6 +388,7 @@
          project_id: Trismik project ID
          metadata: Additional metadata
          trismik_client: Trismik client instance
+         on_progress: Optional callback for progress updates (current, total)
      Returns:
          Results from the adaptive evaluation
      """
@@ -403,6 +418,7 @@ def run_adaptive_evaluation(
              inference_setup={},
          ),
          item_processor=make_trismik_inference(inference_with_hyperparams),
+         on_progress=on_progress,
          return_dict=False,
      )

scorebook-0.0.16/src/scorebook/metrics/README.md

@@ -0,0 +1,121 @@
+ # Adding Metrics to Scorebook
+
+ This guide explains how to add new metrics to Scorebook.
+
+ ## Quick Start
+
+ 1. Create a metric file: `src/scorebook/metrics/yourmetric.py`
+ 2. Implement the metric class
+ 3. Add tests
+ 4. Submit PR for review
+
+ ### Where to Put Tests
+
+ Tests go in one of two directories:
+
+ - **`tests/unit/test_metrics/`** - For fast tests using mocked data. These run on every commit.
+ - **`tests/extended/test_metrics/`** - For tests that require external dependencies, large datasets, or are computationally expensive.
+
+ Most metrics only need unit tests. Use extended tests when your metric relies on external APIs, models, or takes significant time to run.
+
+ See [CONTRIBUTING.md](../../../CONTRIBUTING.md) for instructions on running tests.
+
+ ---
+
+ ## Requirements
+
+ Your metric must:
+
+ - Use the `@scorebook_metric` decorator
+ - Inherit from `MetricBase`
+ - Implement the `score()` static method
+
+ The `score()` method returns a tuple of `(aggregate_scores, item_scores)`:
+
+ - **aggregate_scores**: A `Dict[str, float]` with overall metric values (e.g., `{"accuracy": 0.85}`)
+ - **item_scores**: A `List` of per-item scores. For metrics that produce a single value per item, use `int`, `float`, `bool`, or `str`. For metrics that produce multiple values per item, use a `Dict[str, Union[int, float, bool, str]]` where keys are metric names.
+
+ ---
+
+ ## File Naming
+
+ Metric files must use normalized names (lowercase, no underscores/spaces). This naming convention is required for the registry's lazy loading system to work.
+
+ 1. User requests a metric by name (e.g., `"f1_score"`, `"F1Score"`, or `"f1 score"`)
+ 2. The registry normalizes the input → `"f1score"`
+ 3. The registry imports `scorebook.metrics.f1score`
+ 4. The `@scorebook_metric` decorator registers the class
+
+ **Examples:**
+ - Class: `F1Score` → File: `f1score.py` → User can request: `"f1score"`, `"F1Score"`, `"f1_score"`, `"f1 score"`
+ - Class: `MeanSquaredError` → File: `meansquarederror.py` → User can request: `"MeanSquaredError"`, `"mean_squared_error"`, etc.
+
+ **Collision detection:** Class names that normalize to the same key will raise an error at registration time. For example, `F1Score` and `F1_Score` both normalize to `"f1score"` and cannot coexist.
+
+ ---
+
+ ## Implementation Template
+
+ Create your metric file in `src/scorebook/metrics/yourmetric.py`:
+
+ ```python
+ """Brief description of the metric."""
+
+ from typing import Any, Dict, List, Tuple
+
+ from scorebook.metrics import MetricBase, scorebook_metric
+
+
+ @scorebook_metric
+ class YourMetric(MetricBase):
+     """One-line description of what this metric measures.
+
+     Formula or explanation (e.g., Accuracy = correct / total).
+     """
+
+     def score(outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+         """Calculate metric score between outputs and labels.
+
+         Args:
+             outputs: A list of model inference outputs.
+             labels: A list of ground truth labels.
+
+         Returns:
+             Tuple containing:
+                 - Aggregate scores dict (e.g., {"your_metric": 0.85})
+                 - List of per-item scores
+
+         Raises:
+             ValueError: If outputs and labels have different lengths.
+         """
+         # Input validation
+         if len(outputs) != len(labels):
+             raise ValueError("Number of outputs must match number of labels")
+
+         if not outputs:
+             return {"your_metric": 0.0}, []
+
+         # Calculate per-item scores
+         item_scores = [calculate_score(out, lab) for out, lab in zip(outputs, labels)]
+
+         # Calculate aggregate score
+         aggregate_score = sum(item_scores) / len(item_scores)
+
+         return {"your_metric": aggregate_score}, item_scores
+ ```
+
+ ---
+
+ ## Documentation
+
+ Each metric should have:
+
+ 1. **Module-level docstring**: Brief description at the top of the file
+ 2. **Class docstring**: What the metric measures, formula, and any limitations
+ 3. **Method docstring**: Args, Returns, and Raises sections
+
+ ---
+
+ ## Example
+
+ See `src/scorebook/metrics/accuracy.py` for a complete reference implementation.
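Following the template above, here is a hypothetical end-to-end metric; the `WithinTolerance` class, its threshold, and the score key are invented for illustration, and only the `MetricBase` / `scorebook_metric` imports come from the package.

```python
"""Fraction of numeric outputs within an absolute tolerance of the label (illustrative)."""

from typing import Any, Dict, List, Tuple

from scorebook.metrics import MetricBase, scorebook_metric


@scorebook_metric
class WithinTolerance(MetricBase):
    """Hypothetical metric: scores 1 for items where |output - label| <= 0.5, else 0."""

    def score(outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
        """Return ({"within_tolerance": fraction}, per-item booleans)."""
        if len(outputs) != len(labels):
            raise ValueError("Number of outputs must match number of labels")
        if not outputs:
            return {"within_tolerance": 0.0}, []

        # Per-item scores: True when the numeric output is close enough to the label
        item_scores = [abs(float(out) - float(lab)) <= 0.5 for out, lab in zip(outputs, labels)]
        return {"within_tolerance": sum(item_scores) / len(item_scores)}, item_scores
```

Per the naming rules above, this file would be saved as `src/scorebook/metrics/withintolerance.py` so the registry can lazily import it from its normalized name.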
scorebook-0.0.16/src/scorebook/metrics/__init__.py

@@ -0,0 +1,9 @@
+ """Metrics for evaluating model predictions."""
+
+ from scorebook.metrics.core.metric_base import MetricBase
+ from scorebook.metrics.core.metric_registry import scorebook_metric
+
+ __all__ = [
+     "MetricBase",
+     "scorebook_metric",
+ ]
{scorebook-0.0.14 → scorebook-0.0.16}/src/scorebook/metrics/accuracy.py

@@ -2,11 +2,10 @@

  from typing import Any, Dict, List, Tuple

- from scorebook.metrics.metric_base import MetricBase
- from scorebook.metrics.metric_registry import MetricRegistry
+ from scorebook.metrics import MetricBase, scorebook_metric


- @MetricRegistry.register()
+ @scorebook_metric
  class Accuracy(MetricBase):
      """Accuracy metric for evaluating model predictions of any type.

@@ -25,9 +24,6 @@ class Accuracy(MetricBase):
          The aggregate accuracy score for all items (correct predictions / total predictions).
          The item scores for each output-label pair (true/false).
          """
-         if len(outputs) != len(labels):
-             raise ValueError("Number of outputs must match number of labels")
-
          if not outputs:  # Handle empty lists
              return {"accuracy": 0.0}, []